//============================================================================
//  Copyright (c) Kitware, Inc.
//  All rights reserved.
//  See LICENSE.txt for details.
//  This software is distributed WITHOUT ANY WARRANTY; without even
//  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
//  PURPOSE.  See the above copyright notice for more information.
//
//  Copyright 2017 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
//  Copyright 2017 UT-Battelle, LLC.
//  Copyright 2017 Los Alamos National Security.
//
//  Under the terms of Contract DE-NA0003525 with NTESS,
//  the U.S. Government retains certain rights in this software.
//
//  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
//  Laboratory (LANL), the U.S. Government retains certain rights in
//  this software.
//============================================================================

#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>

VTKM_THIRDPARTY_PRE_INCLUDE
#include <cuda_runtime.h>
VTKM_THIRDPARTY_POST_INCLUDE

// These static variables are in an anonymous namespace to work around MSVC linker issues.
namespace
{
// Has CudaAllocator::Initialize been called?
static bool IsInitialized = false;

// True if all devices support concurrent pageable managed memory.
static bool ManagedMemorySupported = false;

// Avoid the overhead of cudaMemAdvise and cudaMemPrefetchAsync for small buffers.
// This value must be > 0, or else these functions will error out.
static std::size_t Threshold = 1 << 20;
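// For example, with this default of 1 MiB (1 << 20 bytes), a 256 KiB buffer skips
// the advise/prefetch calls in the PrepareFor* methods below, while a 4 MiB buffer
// receives them.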
}

namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{

bool CudaAllocator::UsingManagedMemory()
{
  CudaAllocator::Initialize();
  return ManagedMemorySupported;
}

bool CudaAllocator::IsDevicePointer(const void* ptr)
{
  CudaAllocator::Initialize();
  if (!ptr)
  {
    return false;
  }

  cudaPointerAttributes attr;
  cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
  // cudaPointerGetAttributes returns cudaErrorInvalidValue if the pointer is
  // unknown to the CUDA runtime. Catch that case manually since it is not
  // really an error.
  if (err == cudaErrorInvalidValue)
  {
    cudaGetLastError(); // Clear the error so we don't raise it later...
    return false;
  }
  VTKM_CUDA_CALL(err /*= cudaPointerGetAttributes(&attr, ptr)*/);
  return attr.devicePointer == ptr;
}

bool CudaAllocator::IsManagedPointer(const void* ptr)
{
  if (!ptr || !ManagedMemorySupported)
  {
    return false;
  }

  cudaPointerAttributes attr;
  cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
  // cudaPointerGetAttributes returns cudaErrorInvalidValue if the pointer is
  // unknown to the CUDA runtime. Catch that case manually since it is not
  // really an error.
  if (err == cudaErrorInvalidValue)
  {
    cudaGetLastError(); // Clear the error so we don't raise it later...
    return false;
  }
  VTKM_CUDA_CALL(err /*= cudaPointerGetAttributes(&attr, ptr)*/);
  return attr.isManaged != 0;
}
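
// A minimal usage sketch for the two queries above (hypothetical caller, not part
// of this file); `src`, `dst`, and `numBytes` are assumed to be supplied elsewhere:
//
//   if (CudaAllocator::IsDevicePointer(src) || CudaAllocator::IsManagedPointer(src))
//   {
//     VTKM_CUDA_CALL(cudaMemcpy(dst, src, numBytes, cudaMemcpyDefault));
//   }
//   else
//   {
//     std::memcpy(dst, src, numBytes); // plain host memory
//   }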

void* CudaAllocator::Allocate(std::size_t numBytes)
{
  CudaAllocator::Initialize();
  // When numBytes is zero, cudaMallocManaged returns an error and the behavior
  // of cudaMalloc is not documented. Just return nullptr.
  if (numBytes == 0)
  {
    return nullptr;
  }

  void* ptr = nullptr;
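  // Prefer unified (managed) memory when every visible device supports concurrent
  // managed access; otherwise fall back to plain device memory.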
  if (ManagedMemorySupported)
  {
    VTKM_CUDA_CALL(cudaMallocManaged(&ptr, numBytes));
  }
  else
  {
    VTKM_CUDA_CALL(cudaMalloc(&ptr, numBytes));
  }

  return ptr;
}

void CudaAllocator::Free(void* ptr)
{
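  // cudaFree releases memory from either cudaMalloc or cudaMallocManaged;
  // passing nullptr is a documented no-op.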
  VTKM_CUDA_CALL(cudaFree(ptr));
}
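
// A minimal allocation lifecycle sketch (hypothetical caller, not part of this
// file); the 4 MiB size is arbitrary but large enough to exceed Threshold:
//
//   std::size_t numBytes = std::size_t(1) << 22; // 4 MiB
//   void* buffer = CudaAllocator::Allocate(numBytes);
//   CudaAllocator::PrepareForOutput(buffer, numBytes);  // advise/prefetch toward the GPU
//   // launch kernels that write to buffer here, then synchronize
//   CudaAllocator::PrepareForControl(buffer, numBytes); // migrate pages back for host reads
//   CudaAllocator::Free(buffer);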

void CudaAllocator::PrepareForControl(const void* ptr, std::size_t numBytes)
{
  if (IsManagedPointer(ptr) && numBytes >= Threshold)
  {
#if CUDART_VERSION >= 8000
    // TODO these hints need to be benchmarked and adjusted once we start
    // sharing the pointers between cont/exec
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, cudaCpuDeviceId, cudaStreamPerThread));
#endif // CUDA >= 8.0
  }
}
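
// The PrepareForInput/PrepareForOutput/PrepareForInPlace methods below follow the
// same pattern, but advise and prefetch toward the active CUDA device instead of
// the host.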

void CudaAllocator::PrepareForInput(const void* ptr, std::size_t numBytes)
{
  if (IsManagedPointer(ptr) && numBytes >= Threshold)
  {
#if CUDART_VERSION >= 8000
    int dev;
    VTKM_CUDA_CALL(cudaGetDevice(&dev));
    // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
    // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetReadMostly, dev));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
#endif // CUDA >= 8.0
  }
}

void CudaAllocator::PrepareForOutput(const void* ptr, std::size_t numBytes)
{
  if (IsManagedPointer(ptr) && numBytes >= Threshold)
  {
#if CUDART_VERSION >= 8000
    int dev;
    VTKM_CUDA_CALL(cudaGetDevice(&dev));
    // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
    // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
#endif // CUDA >= 8.0
  }
}

void CudaAllocator::PrepareForInPlace(const void* ptr, std::size_t numBytes)
{
  if (IsManagedPointer(ptr) && numBytes >= Threshold)
  {
#if CUDART_VERSION >= 8000
    int dev;
    VTKM_CUDA_CALL(cudaGetDevice(&dev));
    // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetPreferredLocation, dev));
    // VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseUnsetReadMostly, dev));
    VTKM_CUDA_CALL(cudaMemAdvise(ptr, numBytes, cudaMemAdviseSetAccessedBy, dev));
    VTKM_CUDA_CALL(cudaMemPrefetchAsync(ptr, numBytes, dev, cudaStreamPerThread));
#endif // CUDA >= 8.0
  }
}

void CudaAllocator::Initialize()
{
  if (!IsInitialized)
  {
    int numDevices;
    VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));

    if (numDevices == 0)
    {
      ManagedMemorySupported = false;
      IsInitialized = true;
      return;
    }

    // Check all devices and use the feature set supported by all of them.
    bool managed = true;
    cudaDeviceProp prop;
    for (int i = 0; i < numDevices && managed; ++i)
    {
      VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i));
      // We check for concurrentManagedAccess, as devices with only the
      // managedAccess property have extra synchronization requirements.
      managed = managed && prop.concurrentManagedAccess;
    }

    ManagedMemorySupported = managed;
    IsInitialized = true;
  }
}
}
}
}
} // end namespace vtkm::cont::cuda::internal