Commit fda02b3a authored by Robert Maynard's avatar Robert Maynard Committed by Kitware Robot

Merge topic 'thrust_stateless_resource_allocator_warnings'

f05940aa ThrustPatches now only patches fixes for relevant cuda versions
20d6201a Suppress thrust::mr::stateless_resource_allocator host/device warnings
Acked-by: Kitware Robot <kwrobot@kitware.com>
Merge-request: !1637
parents ece078b8 f05940aa
......@@ -24,108 +24,17 @@
#ifdef VTKM_ENABLE_CUDA
//So for thrust 1.8.0 - 1.8.2 the inclusive_scan has a bug when accumulating
//values when the binary operator states it is not commutative.
//For more complex value types, we patch thrust/bulk with fix that is found
//in issue: https://github.com/thrust/thrust/issues/692
//
//This specialization needs to be included before ANY thrust includes otherwise
//other device code inside thrust that calls it will not see it
//Forward declaration of vtkm's WrappedBinaryOperator so the thrust patch
//below can name it without pulling in any vtkm headers first.
namespace vtkm {
namespace exec {
namespace cuda {
namespace internal {

template <typename T, typename F>
class WrappedBinaryOperator;

} // namespace internal
} // namespace cuda
} // namespace exec
} // namespace vtkm
//This include must sit outside of any function body: it previously appeared
//in the middle of destructive_accumulate_n, which would splice the header's
//declarations into function scope. It is needed here so the THRUST_VERSION
//checks later in this file see the version macros.
#include <thrust/version.h>

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
//Patched overload of bulk's destructive_accumulate_n (see
//https://github.com/thrust/thrust/issues/692): performs a cooperative
//doubling-offset inclusive scan over first[0..n) in place and returns
//binary_op(init, first[n - 1]). Taking vtkm's WrappedBinaryOperator makes
//this overload the best match for vtkm functors, preserving operand order
//for non-commutative operators.
//
//Preconditions: every thread of the concurrent group g reaches this call
//(g.wait() is a group-wide barrier), and n > 0 since first[n - 1] is read
//unconditionally.
template <typename ConcurrentGroup,
typename RandomAccessIterator,
typename Size,
typename T,
typename F>
__device__ T
destructive_accumulate_n(ConcurrentGroup& g,
RandomAccessIterator first,
Size n,
T init,
vtkm::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
using size_type = typename ConcurrentGroup::size_type;
size_type tid = g.this_exec.index();
T x = init;
if (tid < n)
{
x = first[tid];
}
g.wait();
//Each pass folds in the element `offset` slots behind; the barriers keep a
//pass's reads separated from its writes.
for (size_type offset = 1; offset < g.size(); offset += offset)
{
if (tid >= offset && tid - offset < n)
{
x = binary_op(first[tid - offset], x);
}
g.wait();
if (tid < n)
{
first[tid] = x;
}
g.wait();
}
T result = binary_op(init, first[n - 1]);
g.wait();
return result;
}
}
}
} //namespace bulk_::detail::accumulate_detail
}
}
}
} //namespace thrust::system::cuda::detail
#endif
//So for thrust 1.9.0+ the aligned_reinterpret_cast has a bug
#if THRUST_VERSION >= 100900
//So for thrust 1.9.0+ ( CUDA 9.X+ ) the aligned_reinterpret_cast has a bug
//where it is not marked as __host__ __device__. To fix this we add a new
//overload for void* with the correct markup (which is what everyone calls).
namespace thrust
{
namespace detail
{
//just in-case somebody has this fix also for primitive types
template <typename T, typename U>
T aligned_reinterpret_cast(U u);
......@@ -265,5 +174,169 @@ ALIGN_RE_PAIR(vtkm::Int64, vtkm::Float64);
#undef ALIGN_RE_PAIR
}
}
#endif //THRUST_VERSION >= 100900
#if THRUST_VERSION >= 100903
//So for thrust 1.9.3+ (CUDA 10.1+) the stateless_resource_allocator has a bug
//where it is not marked as __host__ __device__ and __thrust_exec_check_disable__.
//To fix this we add a new partial specialization on cuda::memory_resource
//with the correct markup (which is what everyone calls).
//See: https://github.com/thrust/thrust/issues/972
VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/mr/allocator.h>
#include <thrust/system/cuda/memory_resource.h>
VTKM_THIRDPARTY_POST_INCLUDE
namespace thrust
{
namespace mr
{
//Partial specialization of thrust::mr::stateless_resource_allocator for the
//CUDA memory resource. It reproduces the upstream class but adds
//__thrust_exec_check_disable__ and __host__ __device__ markup to the
//constructors/destructor to silence host/device call warnings
//(https://github.com/thrust/thrust/issues/972).
template <typename T>
class stateless_resource_allocator<T, ::thrust::system::cuda::memory_resource>
: public thrust::mr::allocator<T, ::thrust::system::cuda::memory_resource>
{
typedef ::thrust::system::cuda::memory_resource Upstream;
typedef thrust::mr::allocator<T, Upstream> base;
public:
/*! The \p rebind metafunction provides the type of an \p stateless_resource_allocator instantiated with another type.
*
* \tparam U the other type to use for instantiation.
*/
template <typename U>
struct rebind
{
/*! The typedef \p other gives the type of the rebound \p stateless_resource_allocator.
*/
typedef stateless_resource_allocator<U, Upstream> other;
};
/*! Default constructor. Uses \p get_global_resource to get the global instance of \p Upstream and initializes the
* \p allocator base subobject with that resource.
*/
__thrust_exec_check_disable__ //modification, required to suppress warnings
__host__ __device__ //modification, required to suppress warnings
stateless_resource_allocator()
: base(get_global_resource<Upstream>())
{
}
/*! Copy constructor. Copies the memory resource pointer. */
__host__ __device__ stateless_resource_allocator(const stateless_resource_allocator& other)
: base(other)
{
}
/*! Conversion constructor from an allocator of a different type. Copies the memory resource pointer. */
template <typename U>
__host__ __device__
stateless_resource_allocator(const stateless_resource_allocator<U, Upstream>& other)
: base(other)
{
}
/*! Destructor. */
__host__ __device__ ~stateless_resource_allocator() {}
};
}
}
#endif //THRUST_VERSION >= 100903
#if THRUST_VERSION < 100900
//So for thrust 1.8.0 - 1.8.2 the inclusive_scan has a bug when accumulating
//values when the binary operator states it is not commutative.
//For more complex value types, we patch thrust/bulk with fix that is found
//in issue: https://github.com/thrust/thrust/issues/692
//
//This specialization needs to be included before ANY thrust includes otherwise
//other device code inside thrust that calls it will not see it
//Forward declare vtkm::exec::cuda::internal::WrappedBinaryOperator so the
//thrust patch below can reference it before any vtkm headers are included.
namespace vtkm {
namespace exec {
namespace cuda {
namespace internal {

template <typename T, typename F>
class WrappedBinaryOperator;

} // namespace internal
} // namespace cuda
} // namespace exec
} // namespace vtkm
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
//Overload of bulk's destructive_accumulate_n that accepts vtkm's
//WrappedBinaryOperator. Scans first[0..n) in place with a doubling offset
//and returns binary_op(init, first[n - 1]); operand order is preserved so
//non-commutative operators accumulate correctly. All threads of the
//concurrent group g must reach this call (g.wait() is a group barrier),
//and n is expected to be nonzero since first[n - 1] is read unconditionally.
template <typename ConcurrentGroup,
typename RandomAccessIterator,
typename Size,
typename T,
typename F>
__device__ T
destructive_accumulate_n(ConcurrentGroup& g,
RandomAccessIterator first,
Size n,
T init,
vtkm::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
typedef typename ConcurrentGroup::size_type index_type;
const index_type lane = g.this_exec.index();

//Seed each lane with its element (or init for lanes past the data).
T running = init;
if (lane < n)
{
running = first[lane];
}
g.wait();

//Each pass combines in the value `step` slots behind; the two barriers
//separate the read phase from the write-back phase of a pass.
for (index_type step = 1; step < g.size(); step += step)
{
const bool combines = (lane >= step) && (lane - step < n);
if (combines)
{
running = binary_op(first[lane - step], running);
}
g.wait();
if (lane < n)
{
first[lane] = running;
}
g.wait();
}

T total = binary_op(init, first[n - 1]);
g.wait();
return total;
}
}
}
} //namespace bulk_::detail::accumulate_detail
}
}
}
} //namespace thrust::system::cuda::detail
#endif //THRUST_VERSION < 100900
#endif //CUDA enabled
#endif //vtk_m_exec_cuda_internal_ThrustPatches_h
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment