Commit 9cfc9112 authored by Allison Vacanti's avatar Allison Vacanti Committed by Kitware Robot
Browse files

Merge topic '228-consolidate_cuda_internals'

1f6a662c Merge DevAdaptAlgoThrust --> DevAdaptAlgoCuda.
be0c6a17 Move DevAdaptAtomicArrayImplementation to its own file.
3af9f660

 Merge ArrayManagerExecutionThrustDevice into AMECuda.
Acked-by: Kitware Robot's avatarKitware Robot <kwrobot@kitware.com>
Acked-by: default avatarRobert Maynard <robert.maynard@kitware.com>
Merge-request: !1228
parents c94422d2 1f6a662c
......@@ -20,16 +20,30 @@
#ifndef vtk_m_cont_cuda_internal_ArrayManagerExecutionCuda_h
#define vtk_m_cont_cuda_internal_ArrayManagerExecutionCuda_h
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
#include <vtkm/exec/cuda/internal/ArrayPortalFromThrust.h>
#include <vtkm/cont/cuda/internal/ArrayManagerExecutionThrustDevice.h>
#include <vtkm/cont/internal/ArrayExportMacros.h>
#include <vtkm/cont/internal/ArrayManagerExecution.h>
#include <vtkm/cont/ArrayPortalToIterators.h>
#include <vtkm/cont/ErrorBadAllocation.h>
#include <vtkm/cont/Storage.h>
//This is in a separate header so that ArrayHandleBasicImpl can include
//the interface without getting any CUDA headers
#include <vtkm/cont/cuda/internal/ExecutionArrayInterfaceBasicCuda.h>
VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
VTKM_THIRDPARTY_POST_INCLUDE
#include <limits>
// These must be placed in the vtkm::cont::internal namespace so that
// the template can be found.
......@@ -42,29 +56,43 @@ namespace internal
template <typename T, class StorageTag>
class ArrayManagerExecution<T, StorageTag, vtkm::cont::DeviceAdapterTagCuda>
: public vtkm::cont::cuda::internal::ArrayManagerExecutionThrustDevice<T, StorageTag>
{
public:
using Superclass = vtkm::cont::cuda::internal::ArrayManagerExecutionThrustDevice<T, StorageTag>;
using ValueType = typename Superclass::ValueType;
using PortalType = typename Superclass::PortalType;
using PortalConstType = typename Superclass::PortalConstType;
using StorageType = typename Superclass::StorageType;
using ValueType = T;
using PointerType = T*;
using PortalType = vtkm::exec::cuda::internal::ArrayPortalFromThrust<T>;
using PortalConstType = vtkm::exec::cuda::internal::ConstArrayPortalFromThrust<T>;
using StorageType = vtkm::cont::internal::Storage<ValueType, StorageTag>;
using difference_type = std::ptrdiff_t;
VTKM_CONT
ArrayManagerExecution(StorageType* storage)
: Superclass(storage)
: Storage(storage)
, Begin(nullptr)
, End(nullptr)
, Capacity(nullptr)
{
}
VTKM_CONT
~ArrayManagerExecution() { this->ReleaseResources(); }
/// Returns the size of the array.
///
VTKM_CONT
vtkm::Id GetNumberOfValues() const { return static_cast<vtkm::Id>(this->End - this->Begin); }
VTKM_CONT
PortalConstType PrepareForInput(bool updateData)
{
try
{
// This alternate form of PrepareForInput works around an issue
// with nvcc 7.5.
return this->Superclass::template _PrepareForInput<void>(updateData);
if (updateData)
{
this->CopyToExecution();
}
return PortalConstType(this->Begin, this->End);
}
catch (vtkm::cont::ErrorBadAllocation& error)
{
......@@ -83,9 +111,12 @@ public:
{
try
{
// This alternate form of PrepareForInPlace works around an issue
// with nvcc 7.5.
return this->Superclass::template _PrepareForInPlace<void>(updateData);
if (updateData)
{
this->CopyToExecution();
}
return PortalType(this->Begin, this->End);
}
catch (vtkm::cont::ErrorBadAllocation& error)
{
......@@ -104,9 +135,49 @@ public:
{
try
{
// This alternate form of PrepareForOutput works around an issue
// with nvcc 7.5.
return this->Superclass::template _PrepareForOutput<void>(numberOfValues);
// Can we reuse the existing buffer?
vtkm::Id curCapacity =
this->Begin != nullptr ? static_cast<vtkm::Id>(this->Capacity - this->Begin) : 0;
// Just mark a new end if we don't need to increase the allocation:
if (curCapacity >= numberOfValues)
{
this->End = this->Begin + static_cast<difference_type>(numberOfValues);
return PortalType(this->Begin, this->End);
}
const std::size_t maxNumVals = (std::numeric_limits<std::size_t>::max() / sizeof(ValueType));
if (static_cast<std::size_t>(numberOfValues) > maxNumVals)
{
std::ostringstream err;
err << "Failed to allocate " << numberOfValues << " values on device: "
<< "Number of bytes is not representable by std::size_t.";
throw vtkm::cont::ErrorBadAllocation(err.str());
}
this->ReleaseResources();
const std::size_t bufferSize = static_cast<std::size_t>(numberOfValues) * sizeof(ValueType);
// Attempt to allocate:
try
{
this->Begin =
static_cast<ValueType*>(vtkm::cont::cuda::internal::CudaAllocator::Allocate(bufferSize));
}
catch (const std::exception& error)
{
std::ostringstream err;
err << "Failed to allocate " << bufferSize << " bytes on device: " << error.what();
throw vtkm::cont::ErrorBadAllocation(err.str());
}
this->Capacity = this->Begin + static_cast<difference_type>(numberOfValues);
this->End = this->Capacity;
return PortalType(this->Begin, this->End);
}
catch (vtkm::cont::ErrorBadAllocation& error)
{
......@@ -119,6 +190,75 @@ public:
throw error;
}
}
/// Allocates enough space in \c storage and copies the data in the
/// device vector into it.
///
VTKM_CONT
void RetrieveOutputData(StorageType* storage) const
{
storage->Allocate(this->GetNumberOfValues());
try
{
::thrust::copy(thrust::cuda::pointer<ValueType>(this->Begin),
thrust::cuda::pointer<ValueType>(this->End),
vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
}
catch (...)
{
vtkm::cont::cuda::internal::throwAsVTKmException();
}
}
/// Resizes the device vector.
///
VTKM_CONT void Shrink(vtkm::Id numberOfValues)
{
// The operation will succeed even if this assertion fails, but this
// is still supposed to be a precondition to Shrink.
VTKM_ASSERT(this->Begin != nullptr && this->Begin + numberOfValues <= this->End);
this->End = this->Begin + static_cast<difference_type>(numberOfValues);
}
/// Frees all memory.
///
VTKM_CONT void ReleaseResources()
{
if (this->Begin != nullptr)
{
vtkm::cont::cuda::internal::CudaAllocator::Free(this->Begin);
this->Begin = nullptr;
this->End = nullptr;
this->Capacity = nullptr;
}
}
private:
ArrayManagerExecution(ArrayManagerExecution&) = delete;
void operator=(ArrayManagerExecution&) = delete;
StorageType* Storage;
PointerType Begin;
PointerType End;
PointerType Capacity;
VTKM_CONT
void CopyToExecution()
{
try
{
this->PrepareForOutput(this->Storage->GetNumberOfValues());
::thrust::copy(vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()),
thrust::cuda::pointer<ValueType>(this->Begin));
}
catch (...)
{
vtkm::cont::cuda::internal::throwAsVTKmException();
}
}
};
template <typename T>
......
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_ArrayManagerExecutionThrustDevice_h
#define vtk_m_cont_cuda_internal_ArrayManagerExecutionThrustDevice_h
#include <vtkm/cont/ArrayPortalToIterators.h>
#include <vtkm/cont/ErrorBadAllocation.h>
#include <vtkm/cont/Storage.h>
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
#include <vtkm/cont/cuda/internal/ThrustExceptionHandler.h>
#include <vtkm/exec/cuda/internal/ArrayPortalFromThrust.h>
VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
VTKM_THIRDPARTY_POST_INCLUDE
#include <limits>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
/// \c ArrayManagerExecutionThrustDevice provides an implementation for a \c
/// ArrayManagerExecution class for a thrust device adapter that is designed
/// for the cuda backend which has separate memory spaces for host and device.
template <typename T, class StorageTag>
class ArrayManagerExecutionThrustDevice
{
public:
using ValueType = T;
using PointerType = T*;
using difference_type = std::ptrdiff_t;
using StorageType = vtkm::cont::internal::Storage<ValueType, StorageTag>;
using PortalType = vtkm::exec::cuda::internal::ArrayPortalFromThrust<T>;
using PortalConstType = vtkm::exec::cuda::internal::ConstArrayPortalFromThrust<T>;
VTKM_CONT
ArrayManagerExecutionThrustDevice(StorageType* storage)
: Storage(storage)
, Begin(nullptr)
, End(nullptr)
, Capacity(nullptr)
{
}
VTKM_CONT
~ArrayManagerExecutionThrustDevice() { this->ReleaseResources(); }
/// Returns the size of the array.
///
VTKM_CONT
vtkm::Id GetNumberOfValues() const { return static_cast<vtkm::Id>(this->End - this->Begin); }
/// Allocates the appropriate size of the array and copies the given data
/// into the array.
///
VTKM_CONT
PortalConstType PrepareForInput(bool updateData)
{
if (updateData)
{
this->CopyToExecution();
}
else // !updateData
{
// The data in this->Array should already be valid.
}
return PortalConstType(this->Begin, this->End);
}
/// Workaround for nvcc 7.5 compiler warning bug.
template <typename DummyType>
VTKM_CONT PortalConstType _PrepareForInput(bool updateData)
{
return this->PrepareForInput(updateData);
}
/// Allocates the appropriate size of the array and copies the given data
/// into the array.
///
VTKM_CONT
PortalType PrepareForInPlace(bool updateData)
{
if (updateData)
{
this->CopyToExecution();
}
else // !updateData
{
// The data in this->Array should already be valid.
}
return PortalType(this->Begin, this->End);
}
/// Workaround for nvcc 7.5 compiler warning bug.
template <typename DummyType>
VTKM_CONT PortalType _PrepareForInPlace(bool updateData)
{
return this->PrepareForInPlace(updateData);
}
/// Allocates the array to the given size.
///
VTKM_CONT
PortalType PrepareForOutput(vtkm::Id numberOfValues)
{
// Can we reuse the existing buffer?
vtkm::Id curCapacity =
this->Begin != nullptr ? static_cast<vtkm::Id>(this->Capacity - this->Begin) : 0;
// Just mark a new end if we don't need to increase the allocation:
if (curCapacity >= numberOfValues)
{
this->End = this->Begin + static_cast<difference_type>(numberOfValues);
return PortalType(this->Begin, this->End);
}
const std::size_t maxNumVals = (std::numeric_limits<std::size_t>::max() / sizeof(ValueType));
if (static_cast<std::size_t>(numberOfValues) > maxNumVals)
{
std::ostringstream err;
err << "Failed to allocate " << numberOfValues << " values on device: "
<< "Number of bytes is not representable by std::size_t.";
throw vtkm::cont::ErrorBadAllocation(err.str());
}
this->ReleaseResources();
const std::size_t bufferSize = static_cast<std::size_t>(numberOfValues) * sizeof(ValueType);
// Attempt to allocate:
try
{
this->Begin =
static_cast<ValueType*>(vtkm::cont::cuda::internal::CudaAllocator::Allocate(bufferSize));
}
catch (const std::exception& error)
{
std::ostringstream err;
err << "Failed to allocate " << bufferSize << " bytes on device: " << error.what();
throw vtkm::cont::ErrorBadAllocation(err.str());
}
this->Capacity = this->Begin + static_cast<difference_type>(numberOfValues);
this->End = this->Capacity;
return PortalType(this->Begin, this->End);
}
/// Workaround for nvcc 7.5 compiler warning bug.
template <typename DummyType>
VTKM_CONT PortalType _PrepareForOutput(vtkm::Id numberOfValues)
{
return this->PrepareForOutput(numberOfValues);
}
/// Allocates enough space in \c storage and copies the data in the
/// device vector into it.
///
VTKM_CONT
void RetrieveOutputData(StorageType* storage) const
{
storage->Allocate(this->GetNumberOfValues());
try
{
::thrust::copy(thrust::cuda::pointer<ValueType>(this->Begin),
thrust::cuda::pointer<ValueType>(this->End),
vtkm::cont::ArrayPortalToIteratorBegin(storage->GetPortal()));
}
catch (...)
{
vtkm::cont::cuda::internal::throwAsVTKmException();
}
}
/// Resizes the device vector.
///
VTKM_CONT void Shrink(vtkm::Id numberOfValues)
{
// The operation will succeed even if this assertion fails, but this
// is still supposed to be a precondition to Shrink.
VTKM_ASSERT(this->Begin != nullptr && this->Begin + numberOfValues <= this->End);
this->End = this->Begin + static_cast<difference_type>(numberOfValues);
}
/// Frees all memory.
///
VTKM_CONT void ReleaseResources()
{
if (this->Begin != nullptr)
{
vtkm::cont::cuda::internal::CudaAllocator::Free(this->Begin);
this->Begin = nullptr;
this->End = nullptr;
this->Capacity = nullptr;
}
}
private:
ArrayManagerExecutionThrustDevice(ArrayManagerExecutionThrustDevice<T, StorageTag>&) = delete;
void operator=(ArrayManagerExecutionThrustDevice<T, StorageTag>&) = delete;
StorageType* Storage;
PointerType Begin;
PointerType End;
PointerType Capacity;
VTKM_CONT
void CopyToExecution()
{
try
{
this->PrepareForOutput(this->Storage->GetNumberOfValues());
::thrust::copy(vtkm::cont::ArrayPortalToIteratorBegin(this->Storage->GetPortalConst()),
vtkm::cont::ArrayPortalToIteratorEnd(this->Storage->GetPortalConst()),
thrust::cuda::pointer<ValueType>(this->Begin));
}
catch (...)
{
vtkm::cont::cuda::internal::throwAsVTKmException();
}
}
};
}
}
}
} // namespace vtkm::cont::cuda::internal
#endif // vtk_m_cont_cuda_internal_ArrayManagerExecutionThrustDevice_h
......@@ -20,10 +20,9 @@
set(headers
ArrayManagerExecutionCuda.h
ArrayManagerExecutionThrustDevice.h
CudaAllocator.h
DeviceAdapterAlgorithmCuda.h
DeviceAdapterAlgorithmThrust.h
DeviceAdapterAtomicArrayImplementationCuda.h
DeviceAdapterRuntimeDetectorCuda.h
DeviceAdapterTagCuda.h
DeviceAdapterTimerImplementationCuda.h
......@@ -49,7 +48,7 @@ endif()
target_sources(vtkm_cont PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/ArrayManagerExecutionCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/CudaAllocator.cu
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmThrust.cu
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterRuntimeDetectorCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterTimerImplementationCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu
......
......@@ -18,7 +18,7 @@
// this software.
//============================================================================
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>
#include <atomic>
#include <mutex>
......@@ -65,11 +65,12 @@ VTKM_CONT_EXPORT vtkm::UInt32 getNumSMs(int dId)
}
return numSMs[index];
}
}
} // end namespace cuda::internal
// we use cuda pinned memory to reduce the amount of synchronization
// and mem copies between the host and device.
template <>
char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray(
vtkm::Id& arraySize,
char** hostPointer)
{
......@@ -92,8 +93,7 @@ char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedE
return devicePtr;
}
template <>
char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
char* DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
vtkm::exec::cuda::internal::TaskStrided& functor)
{
//since the memory is pinned we can access it safely on the host
......@@ -111,8 +111,7 @@ char* DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::SetupError
return hostErrorPtr;
}
template <>
void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
vtkm::UInt32& grids,
vtkm::UInt32& blocks,
vtkm::Id size)
......@@ -120,19 +119,18 @@ void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAnd
(void)size;
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
grids = 32 * getNumSMs(deviceId);
grids = 32 * cuda::internal::getNumSMs(deviceId);
blocks = 128;
}
template <>
void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAndBlocks(
vtkm::UInt32& grids,
dim3& blocks,
const dim3& size)
{
int deviceId;
VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
grids = 32 * getNumSMs(deviceId);
grids = 32 * cuda::internal::getNumSMs(deviceId);
if (size.x == 0)
{ //grids that have no x dimension
......@@ -154,6 +152,4 @@ void DeviceAdapterAlgorithmThrust<vtkm::cont::DeviceAdapterTagCuda>::GetGridsAnd
}
}
}
}
}
}
} // end namespace vtkm::cont
This diff is collapsed.
<
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.