Commit f05940aa authored by Robert Maynard

ThrustPatches now only applies fixes for relevant CUDA versions

Rather than always patching Thrust functions, we now patch only
when the Thrust version in use actually needs the fix.
parent 20d6201a
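
For illustration, a minimal sketch of the version-guard pattern this commit adopts (the guards mirror the patched header below; the bodies here are placeholders):

#include <thrust/version.h> //defines THRUST_VERSION

#if THRUST_VERSION >= 100900
//fixes that only apply to Thrust 1.9.0+ (CUDA 9.X+)
#endif

#if THRUST_VERSION >= 100903
//fixes that only apply to Thrust 1.9.3+ (CUDA 10.1+)
#endif

#if THRUST_VERSION < 100900
//fixes that only apply to Thrust 1.8.x
#endif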
@@ -24,108 +24,17 @@
#ifdef VTKM_ENABLE_CUDA
//So for thrust 1.8.0 - 1.8.2 the inclusive_scan has a bug when accumulating
//values when the binary operator states it is not commutative.
//For more complex value types, we patch thrust/bulk with the fix found
//in issue: https://github.com/thrust/thrust/issues/692
//
//This specialization needs to be included before ANY thrust includes;
//otherwise, other device code inside thrust that calls it will not see it.
namespace vtkm
{
namespace exec
{
namespace cuda
{
namespace internal
{
//Forward declaration of WrappedBinaryOperator
template <typename T, typename F>
class WrappedBinaryOperator;
}
}
}
} //namespace vtkm::exec::cuda::internal
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
template <typename ConcurrentGroup,
typename RandomAccessIterator,
typename Size,
typename T,
typename F>
__device__ T
destructive_accumulate_n(ConcurrentGroup& g,
RandomAccessIterator first,
Size n,
T init,
vtkm::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
using size_type = typename ConcurrentGroup::size_type;
size_type tid = g.this_exec.index();
T x = init;
if (tid < n)
{
x = first[tid];
}
g.wait();
for (size_type offset = 1; offset < g.size(); offset += offset)
{
if (tid >= offset && tid - offset < n)
{
x = binary_op(first[tid - offset], x);
}
g.wait();
if (tid < n)
{
first[tid] = x;
}
g.wait();
}
T result = binary_op(init, first[n - 1]);
g.wait();
return result;
}
}
}
} //namespace bulk_::detail::accumulate_detail
}
}
}
} //namespace thrust::system::cuda::detail
#endif
// Needed so we can conditionally include components
#include <thrust/version.h>
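//THRUST_VERSION encodes the release as major * 100000 + minor * 100 +
//subminor, so e.g. 100903 corresponds to Thrust 1.9.3.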
//So for thrust 1.9.0+ the aligned_reinterpret_cast has a bug
#if THRUST_VERSION >= 100900
//So for thrust 1.9.0+ ( CUDA 9.X+ ) the aligned_reinterpret_cast has a bug
//where it is not marked as __host__ __device__. To fix this we add a new
//overload for void* with the correct markup (which is what everyone calls).
namespace thrust
{
namespace detail
{
//just in case somebody also has this fix for primitive types
template <typename T, typename U>
T aligned_reinterpret_cast(U u);
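//A hedged sketch (not the verbatim patch) of the shape such a fix takes:
//an overload taking void*, carrying the __host__ __device__ markup that
//upstream thrust omitted, e.g.
//  template <>
//  inline __host__ __device__ vtkm::Int32* aligned_reinterpret_cast(void* u)
//  {
//    return reinterpret_cast<vtkm::Int32*>(u);
//  }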
@@ -265,16 +174,18 @@ ALIGN_RE_PAIR(vtkm::Int64, vtkm::Float64);
#undef ALIGN_RE_PAIR
}
}
//close the THRUST_VERSION >= 1.9.0 guard
#endif //THRUST_VERSION >= 100900
#if THRUST_VERSION >= 100903
//So for thrust 1.9.3+ (CUDA 10.1+) the stateless_resource_allocator has a bug
//where it is not marked as __host__ __device__ && __thrust_exec_check_disable__.
//To fix this we add a new partial specialization on cuda::memory_resource
//with the correct markup (which is what everyone uses).
//See: https://github.com/thrust/thrust/issues/972
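//(__thrust_exec_check_disable__ suppresses nvcc's execution-space check,
//i.e. the warnings about calling __host__ code from a __host__ __device__
//function.)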
VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/mr/allocator.h>
#include <thrust/system/cuda/memory_resource.h>
VTKM_THIRDPARTY_POST_INCLUDE
//So for thrust 1.9.0+ the stateless_resource_allocator has a bug
//where it is not marked as __host__ __device__. To fix this we add a new
//overload for void* with the correct markup (which is what everyone calls).
namespace thrust
{
namespace mr
@@ -329,5 +240,103 @@ public:
};
}
}
#endif //THRUST_VERSION >= 100903
#if THRUST_VERSION < 100900
//So for thrust 1.8.0 - 1.8.2 the inclusive_scan has a bug when accumulating
//values when the binary operator states it is not commutative.
//For more complex value types, we patch thrust/bulk with the fix found
//in issue: https://github.com/thrust/thrust/issues/692
//
//This specialization needs to be included before ANY thrust includes;
//otherwise, other device code inside thrust that calls it will not see it.
namespace vtkm
{
namespace exec
{
namespace cuda
{
namespace internal
{
//Forward declaration of WrappedBinaryOperator
template <typename T, typename F>
class WrappedBinaryOperator;
}
}
}
} //namespace vtkm::exec::cuda::internal
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace bulk_
{
namespace detail
{
namespace accumulate_detail
{
template <typename ConcurrentGroup,
typename RandomAccessIterator,
typename Size,
typename T,
typename F>
__device__ T
destructive_accumulate_n(ConcurrentGroup& g,
RandomAccessIterator first,
Size n,
T init,
vtkm::exec::cuda::internal::WrappedBinaryOperator<T, F> binary_op)
{
using size_type = typename ConcurrentGroup::size_type;
size_type tid = g.this_exec.index();
T x = init;
if (tid < n)
{
x = first[tid];
}
g.wait();
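//In-place inclusive scan over first[0..n): each pass combines every
//element with the one 'offset' positions to its left, and the offset
//doubles each iteration. Passing first[tid - offset] as the LEFT operand
//is what keeps this correct for non-commutative binary operators.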
for (size_type offset = 1; offset < g.size(); offset += offset)
{
if (tid >= offset && tid - offset < n)
{
x = binary_op(first[tid - offset], x);
}
g.wait();
if (tid < n)
{
first[tid] = x;
}
g.wait();
}
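//first[n - 1] now holds the scan total of all n values; fold it into init
//(again keeping operand order) to produce the final accumulated result.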
T result = binary_op(init, first[n - 1]);
g.wait();
return result;
}
}
}
} //namespace bulk_::detail::accumulate_detail
}
}
}
} //namespace thrust::system::cuda::detail
#endif //THRUST_VERSION < 100900
#endif //CUDA enabled
#endif //vtk_m_exec_cuda_internal_ThrustPatches_h