//============================================================================
//  Copyright (c) Kitware, Inc.
//  All rights reserved.
//  See LICENSE.txt for details.
//
//  This software is distributed WITHOUT ANY WARRANTY; without even
//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
//  PURPOSE.  See the above copyright notice for more information.
//============================================================================
#ifndef vtk_m_exec_cuda_internal_ExecutionPolicy_h
#define vtk_m_exec_cuda_internal_ExecutionPolicy_h

#include <vtkm/BinaryPredicates.h>
#include <vtkm/cont/cuda/ErrorCuda.h>

#include <vtkm/exec/cuda/internal/WrappedOperators.h>
#include <vtkm/exec/cuda/internal/ThrustPatches.h>
VTKM_THIRDPARTY_PRE_INCLUDE
#include <thrust/execution_policy.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/system/cuda/memory.h>
VTKM_THIRDPARTY_POST_INCLUDE

#define ThrustCudaPolicyPerThread ::thrust::cuda::par.on(cudaStreamPerThread)
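//Note: cudaStreamPerThread is a per-host-thread implicit stream, so thrust
//algorithms launched through this policy from different host threads can
//overlap instead of serializing on the legacy default stream.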

struct vtkm_cuda_policy : thrust::device_execution_policy<vtkm_cuda_policy>
{
};
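
//A minimal sketch (hypothetical usage, not part of this header) of how the
//policy is consumed: passing a vtkm_cuda_policy to a thrust algorithm makes
//thrust dispatch, through argument-dependent lookup, to the overloads below
//instead of its generic implementations.
//
//  vtkm::Float32* data = ...;  //raw device pointer
//  thrust::sort(vtkm_cuda_policy{}, data, data + numValues,
//               vtkm::exec::cuda::internal::WrappedBinaryPredicate<
//                 vtkm::Float32, vtkm::SortLess>());
//  //-> resolves to the sort() overload below, which re-invokes thrust with
//  //   thrust::less<vtkm::Float32> so the radix-sort path is taken.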

//Specialize the sort call for raw cuda pointers using less/greater operators.
//For 32-bit types (UInt32, Int32, Float32), thrust dispatches to its fast
//radix sort only when the comparison operator is exactly thrust::less or
//thrust::greater; any other comparator makes it fall back to merge sort.
template <typename T>
__host__ __device__ void sort(
  const vtkm_cuda_policy& exec,
  T* first,
  T* last,
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
{ //sort for concrete pointers and less than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
}

template <typename T, typename RandomAccessIterator>
__host__ __device__ void sort_by_key(
  const vtkm_cuda_policy& exec,
  T* first,
  T* last,
  RandomAccessIterator values_first,
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortLess> comp)
{ //sort for concrete pointers and less than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
  return thrust::sort_by_key(
    ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
}

template <typename T>
__host__ __device__ void sort(
  const vtkm_cuda_policy& exec,
  T* first,
  T* last,
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
{ //sort for concrete pointers and less than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::less<T>());
}

template <typename T, typename RandomAccessIterator>
__host__ __device__ void sort_by_key(
  const vtkm_cuda_policy& exec,
  T* first,
  T* last,
  RandomAccessIterator values_first,
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::less<T>> comp)
{ //sort for concrete pointers and less than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
  return thrust::sort_by_key(
    ThrustCudaPolicyPerThread, first, last, values_first, thrust::less<T>());
}

template <typename T>
__host__ __device__ void sort(
  const vtkm_cuda_policy& exec,
  T* first,
  T* last,
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
{ //sort for concrete pointers and greater than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
}

template <typename T, typename RandomAccessIterator>
__host__ __device__ void sort_by_key(
  const vtkm_cuda_policy& exec,
  T* first,
  T* last,
  RandomAccessIterator values_first,
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, vtkm::SortGreater> comp)
{ //sort for concrete pointers and greater than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
  return thrust::sort_by_key(
    ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
}

template <typename T>
__host__ __device__ void sort(
  const vtkm_cuda_policy& exec,
  T* first,
  T* last,
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
{ //sort for concrete pointers and greater than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
  return thrust::sort(ThrustCudaPolicyPerThread, first, last, thrust::greater<T>());
}

template <typename T, typename RandomAccessIterator>
__host__ __device__ void sort_by_key(
  const vtkm_cuda_policy& exec,
  T* first,
  T* last,
  RandomAccessIterator values_first,
  vtkm::exec::cuda::internal::WrappedBinaryPredicate<T, ::thrust::greater<T>> comp)
{ //sort for concrete pointers and greater than op
  //this makes sure that we invoke the thrust radix sort and not merge sort
  return thrust::sort_by_key(
    ThrustCudaPolicyPerThread, first, last, values_first, thrust::greater<T>());
}

template <typename RandomAccessIterator, typename StrictWeakOrdering>
__host__ __device__ void sort(const vtkm_cuda_policy& exec,
                              RandomAccessIterator first,
                              RandomAccessIterator last,
                              StrictWeakOrdering comp)
{
  //At this point the iterator type is not a raw cuda pointer and/or the
  //operator is not an approved less/greater operator. This most likely
  //will cause thrust to internally determine that the best sort
  //implementation is merge sort.
  return thrust::sort(ThrustCudaPolicyPerThread, first, last, comp);
}
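
//For example (hypothetical, not part of this header), a custom comparator
//has no specialization above and lands in the generic overload, so thrust
//will most likely pick merge sort:
//
//  struct CompareMagnitude { ... };  //user-defined strict weak ordering
//  thrust::sort(vtkm_cuda_policy{}, data, data + numValues,
//               vtkm::exec::cuda::internal::WrappedBinaryPredicate<
//                 vtkm::Float32, CompareMagnitude>());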

template <typename RandomAccessIteratorKeys,
          typename RandomAccessIteratorValues,
          typename StrictWeakOrdering>
__host__ __device__ void sort_by_key(const vtkm_cuda_policy& exec,
                                     RandomAccessIteratorKeys first,
                                     RandomAccessIteratorKeys last,
                                     RandomAccessIteratorValues values_first,
                                     StrictWeakOrdering comp)
{
  //At this point the key iterator type is not a raw cuda pointer and/or the
  //operator is not an approved less/greater operator. This most likely
  //will cause thrust to internally determine that the best sort
  //implementation is merge sort.
  return thrust::sort_by_key(ThrustCudaPolicyPerThread, first, last, values_first, comp);
}

template <typename T,
          typename InputIterator2,
          typename OutputIterator1,
          typename OutputIterator2,
          typename BinaryPredicate,
          typename BinaryFunction>
__host__ __device__ ::thrust::pair<OutputIterator1, OutputIterator2> reduce_by_key(
  const vtkm_cuda_policy& exec,
  T* keys_first,
  T* keys_last,
  InputIterator2 values_first,
  OutputIterator1 keys_output,
  OutputIterator2 values_output,
  BinaryPredicate binary_pred,
  BinaryFunction binary_op)
{
#if defined(VTKM_CUDA_VERSION_MAJOR) && (VTKM_CUDA_VERSION_MAJOR == 7) &&                          \
  (VTKM_CUDA_VERSION_MINOR >= 5)
  //On CUDA 7.5 the per-thread stream is synchronized after the reduction
  //(presumably working around an asynchrony issue in that release's thrust)
  //before the result is returned.
  ::thrust::pair<OutputIterator1, OutputIterator2> result =
    thrust::reduce_by_key(ThrustCudaPolicyPerThread,
                          keys_first,
                          keys_last,
                          values_first,
                          keys_output,
                          values_output,
                          binary_pred,
                          binary_op);

//only sync if we are being invoked from the host
#ifndef VTKM_CUDA_DEVICE_PASS
  VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
#endif

  return result;
#else
  return thrust::reduce_by_key(ThrustCudaPolicyPerThread,
                               keys_first,
                               keys_last,
                               values_first,
                               keys_output,
                               values_output,
                               binary_pred,
                               binary_op);
#endif
}
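
//A minimal usage sketch for the overload above (hypothetical, not part of
//this header; keysIn/valsIn/keysOut/valsOut are device pointers). The
//returned pair points one past the last reduced key/value written:
//
//  auto ends = thrust::reduce_by_key(vtkm_cuda_policy{},
//                                    keysIn, keysIn + numValues, valsIn,
//                                    keysOut, valsOut,
//                                    thrust::equal_to<vtkm::Int32>(),
//                                    thrust::plus<vtkm::Float32>());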

#endif