DeviceAdapterAlgorithmCuda.cu
//============================================================================
//  Copyright (c) Kitware, Inc.
//  All rights reserved.
//  See LICENSE.txt for details.
//
//  This software is distributed WITHOUT ANY WARRANTY; without even
//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
//  PURPOSE.  See the above copyright notice for more information.
//============================================================================

#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmCuda.h>

#include <algorithm>
#include <atomic>
#include <cstring>
#include <functional>
#include <map>
#include <mutex>
#include <utility>
#include <vector>

#include <cuda.h>

namespace vtkm
{
namespace cont
{
namespace cuda
{

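//Optional application-supplied hook for computing scheduling parameters;
//when set (via InitScheduleParameters below) it takes precedence over the
//built-in presets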
static vtkm::cont::cuda::ScheduleParameters (
  *ComputeFromEnv)(const char*, int, int, int, int, int) = nullptr;

//Use the provided function as the compute function for ScheduleParameterBuilder
VTKM_CONT_EXPORT void InitScheduleParameters(
  vtkm::cont::cuda::ScheduleParameters (*function)(const char*, int, int, int, int, int))
{
  ComputeFromEnv = function;
}

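// A minimal sketch of how an application could install its own parameters
// before running any VTK-m CUDA algorithm (MyScheduleParameters is a
// hypothetical application-defined function matching the signature above):
//
//   vtkm::cont::cuda::ScheduleParameters MyScheduleParameters(
//     const char* name, int major, int minor,
//     int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock);
//
//   vtkm::cont::cuda::InitScheduleParameters(&MyScheduleParameters);
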
namespace internal
{

//These are the best block-count / threads-per-block pairs for scheduling on
//each GPU, indexed by CUDA device id
static std::vector<std::pair<int, int>> scheduling_1d_parameters;
static std::vector<std::pair<int, dim3>> scheduling_2d_parameters;
static std::vector<std::pair<int, dim3>> scheduling_3d_parameters;

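//Computes the ScheduleParameters for each GPU, using either the
//environment-provided callback (ComputeFromEnv) or the preset table below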
struct VTKM_CONT_EXPORT ScheduleParameterBuilder
{
  //GPU architecture strata used to select the best ScheduleParameters
  //preset. The values track the CUDA compute-capability major version,
  //clamped to [5, 7] in ComputeFromPreset; the *_HPC variants are that
  //value times 1000, applied when the device is a Tesla-branded card.
  enum struct GPU_STRATA
  {
    ENV = 0,
    OLDER = 5,
    PASCAL = 6,
    VOLTA = 7,
    PASCAL_HPC = 6000,
    VOLTA_HPC = 7000
  };

  std::map<GPU_STRATA, vtkm::cont::cuda::ScheduleParameters> Presets;
  std::function<vtkm::cont::cuda::ScheduleParameters(const char*, int, int, int, int, int)> Compute;

  // clang-format off
  // The presets for [one,two,three]_d_blocks are per-SM counts; ComputeFromPreset multiplies them by the number of SMs on the hardware
  ScheduleParameterBuilder()
    : Presets{
      { GPU_STRATA::ENV,        {  0,   0,  0, {  0,  0, 0 },  0, { 0, 0, 0 } } }, //use env settings
      { GPU_STRATA::OLDER,
                                { 32, 128,  8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for less than pascal
      { GPU_STRATA::PASCAL,     { 32, 128,  8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for pascal
      { GPU_STRATA::VOLTA,      { 32, 128,  8, { 16, 16, 1 }, 32, { 8, 8, 4 } } }, //VTK-m default for volta
      { GPU_STRATA::PASCAL_HPC, { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //P100
      { GPU_STRATA::VOLTA_HPC,  { 32, 256, 16, { 16, 16, 1 }, 32, { 8, 8, 8 } } }, //V100
    }
    , Compute(nullptr)
  {
    if (vtkm::cont::cuda::ComputeFromEnv != nullptr)
    {
      this->Compute = vtkm::cont::cuda::ComputeFromEnv;
    }
    else
    {
      this->Compute = [=] (const char* name, int major, int minor,
                          int numSMs, int maxThreadsPerSM, int maxThreadsPerBlock) -> ScheduleParameters  {
        return this->ComputeFromPreset(name, major, minor, numSMs, maxThreadsPerSM, maxThreadsPerBlock); };
    }
  }
  // clang-format on

  vtkm::cont::cuda::ScheduleParameters ComputeFromPreset(const char* name,
                                                         int major,
                                                         int minor,
                                                         int numSMs,
                                                         int maxThreadsPerSM,
                                                         int maxThreadsPerBlock)
  {
    (void)minor;
    (void)maxThreadsPerSM;
    (void)maxThreadsPerBlock;

    constexpr int GPU_STRATA_MAX_GEN = 7;
    constexpr int GPU_STRATA_MIN_GEN = 5;
    int strataAsInt = std::min(major, GPU_STRATA_MAX_GEN);
    strataAsInt = std::max(strataAsInt, GPU_STRATA_MIN_GEN);
    if (strataAsInt > GPU_STRATA_MIN_GEN)
    { //only Pascal and newer have HPC-specific presets

      //Currently the only HPC variants are Tesla-branded cards, so check
      //whether the device name starts with "Tesla" (5 characters)
      bool is_tesla = (0 == std::strncmp("Tesla", name, 5));
      if (is_tesla)
      {
        strataAsInt *= 1000; //tesla modifier
      }
    }

    auto preset = this->Presets.find(static_cast<GPU_STRATA>(strataAsInt));
    ScheduleParameters params = preset->second;
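    //The preset block counts are per-SM values (see the note on the preset
    //table above); scale them by the number of SMs on this device.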
    params.one_d_blocks = params.one_d_blocks * numSMs;
    params.two_d_blocks = params.two_d_blocks * numSMs;
    params.three_d_blocks = params.three_d_blocks * numSMs;
    return params;
  }
};

VTKM_CONT_EXPORT void SetupKernelSchedulingParameters()
{
  //Build the per-device scheduling tables exactly once, even if multiple
  //threads call this concurrently
  static std::once_flag lookupBuiltFlag;

  std::call_once(lookupBuiltFlag, []() {
    ScheduleParameterBuilder builder;

    //iterate over all devices
    int count = 0;
    VTKM_CUDA_CALL(cudaGetDeviceCount(&count));
    for (int deviceId = 0; deviceId < count; ++deviceId)
    {
      cudaDeviceProp deviceProp;
      VTKM_CUDA_CALL(cudaGetDeviceProperties(&deviceProp, deviceId));

      ScheduleParameters params = builder.Compute(deviceProp.name,
                                                  deviceProp.major,
                                                  deviceProp.minor,
                                                  deviceProp.multiProcessorCount,
                                                  deviceProp.maxThreadsPerMultiProcessor,
                                                  deviceProp.maxThreadsPerBlock);
      scheduling_1d_parameters.emplace_back(params.one_d_blocks, params.one_d_threads_per_block);
      scheduling_2d_parameters.emplace_back(params.two_d_blocks, params.two_d_threads_per_block);
      scheduling_3d_parameters.emplace_back(params.three_d_blocks,
                                            params.three_d_threads_per_block);
    }
  });
}

}
} // end namespace cuda::internal

// We use CUDA pinned memory to reduce the amount of synchronization
// and memory copies between the host and device.
auto DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetPinnedErrorArray()
  -> const PinnedErrorArray&
{
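  // The error array lives in mapped, pinned host memory: kernels write
  // through DevicePtr while the host reads HostPtr directly, so checking
  // for errors requires no explicit cudaMemcpy.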
  constexpr vtkm::Id ERROR_ARRAY_SIZE = 1024;
  static thread_local PinnedErrorArray local;

  if (!local.HostPtr)
  {
    VTKM_CUDA_CALL(cudaMallocHost((void**)&local.HostPtr, ERROR_ARRAY_SIZE, cudaHostAllocMapped));
    VTKM_CUDA_CALL(cudaHostGetDevicePointer(&local.DevicePtr, local.HostPtr, 0));
    local.HostPtr[0] = '\0'; // clear
    local.Size = ERROR_ARRAY_SIZE;
  }

  return local;
}

void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::SetupErrorBuffer(
  vtkm::exec::cuda::internal::TaskStrided& functor)
{
  auto pinnedArray = GetPinnedErrorArray();
  vtkm::exec::internal::ErrorMessageBuffer errorMessage(pinnedArray.DevicePtr, pinnedArray.Size);
  functor.SetErrorMessageBuffer(errorMessage);
}

void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::CheckForErrors()
{
  auto pinnedArray = GetPinnedErrorArray();
  if (pinnedArray.HostPtr[0] != '\0')
  {
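    // An error string is present; synchronize the per-thread stream so the
    // kernel that reported it has finished before the message is read.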
    VTKM_CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
    auto excep = vtkm::cont::ErrorExecution(pinnedArray.HostPtr);
    pinnedArray.HostPtr[0] = '\0'; // clear
    throw excep;
  }
}

void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
  vtkm::UInt32& blocks,
  vtkm::UInt32& threadsPerBlock,
  vtkm::Id size)
{
  (void)size;
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();

  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
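  // The scheduling tables are indexed by CUDA device id;
  // SetupKernelSchedulingParameters above ensures an entry exists for
  // every device.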
  const auto& params = cuda::internal::scheduling_1d_parameters[static_cast<size_t>(deviceId)];
  blocks = params.first;
  threadsPerBlock = params.second;
}

void DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>::GetBlocksAndThreads(
  vtkm::UInt32& blocks,
  dim3& threadsPerBlock,
  const dim3& size)
{
  vtkm::cont::cuda::internal::SetupKernelSchedulingParameters();

  int deviceId;
  VTKM_CUDA_CALL(cudaGetDevice(&deviceId)); //get deviceid from cuda
  if (size.z <= 1)
  { //2d images
    const auto& params = cuda::internal::scheduling_2d_parameters[static_cast<size_t>(deviceId)];
    blocks = params.first;
    threadsPerBlock = params.second;
  }
  else
  { //3d images
    const auto& params = cuda::internal::scheduling_3d_parameters[static_cast<size_t>(deviceId)];
    blocks = params.first;
    threadsPerBlock = params.second;
  }
}
}
} // end namespace vtkm::cont