Commit c4bf46ec authored by Robert Maynard, committed by Kitware Robot

Merge topic 'simplify_unified_memory_example'

8ca24bae Update the UnifiedMemory example to properly disable managed memory
718caaae CudaAllocator allows managed memory to be explicitly disabled
Acked-by: Kitware Robot <kwrobot@kitware.com>
Acked-by: Allison Vacanti <allison.vacanti@kitware.com>
Merge-request: !1492
parents 448ec292 8ca24bae
# CudaAllocator Managed Memory can be disabled from C++
Previously, calling code had no way to explicitly disable
managed memory. Disabling it can be desirable for
performance-critical projects that know they do not need
managed memory.
......@@ -18,107 +18,18 @@
// this software.
//============================================================================
#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_CUDA
#include <vtkm/cont/ArrayHandleStreaming.h>
#include <vtkm/cont/Initialize.h>
#include <vtkm/cont/Logging.h> //for GetHumanReadableSize
#include <vtkm/filter/MarchingCubes.h>
#include <vtkm/worklet/DispatcherMapField.h>
#include <vtkm/worklet/DispatcherStreamingMapField.h>
#include <vtkm/Math.h>
#include <vtkm/cont/ArrayHandleCounting.h>
#include <vtkm/cont/CellSetExplicit.h>
#include <vtkm/cont/DataSet.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/cuda/internal/CudaAllocator.h>
namespace
{
// Worklet that evaluates the "tangle" scalar field used as input data.
//
// Each invocation receives a flat vertex id, decomposes it into (x, y, z)
// grid indices (x varies fastest), maps every index into the
// [min, max] range of its own axis, and writes the field value at that point.
class TangleField : public vtkm::worklet::WorkletMapField
{
public:
  using ControlSignature = void(FieldIn<IdType> vertexId, FieldOut<Scalar> v);
  using ExecutionSignature = void(_1, _2);
  using InputDomain = _1;

  // Number of vertices along each axis.
  const vtkm::Id xdim, ydim, zdim;
  // Lower and upper bounds of the sampled region along each axis.
  const vtkm::Float32 xmin, ymin, zmin, xmax, ymax, zmax;
  // Number of vertices in one z-layer (xdim * ydim).
  const vtkm::Id cellsPerLayer;

  // dims: vertex counts per axis; mins / maxs: per-axis bounds (3 entries each).
  VTKM_CONT
  TangleField(const vtkm::Id3 dims, const vtkm::Float32 mins[3], const vtkm::Float32 maxs[3])
    : xdim(dims[0])
    , ydim(dims[1])
    , zdim(dims[2])
    , xmin(mins[0])
    , ymin(mins[1])
    , zmin(mins[2])
    , xmax(maxs[0])
    , ymax(maxs[1])
    , zmax(maxs[2])
    , cellsPerLayer((xdim) * (ydim))
  {
  }

  // vertexId: flat index of the vertex; v: receives the field value.
  VTKM_EXEC
  void operator()(const vtkm::Id& vertexId, vtkm::Float32& v) const
  {
    // Decompose the flat id into per-axis indices.
    const vtkm::Id x = vertexId % (xdim);
    const vtkm::Id y = (vertexId / (xdim)) % (ydim);
    const vtkm::Id z = vertexId / cellsPerLayer;

    // Normalize every index by the extent of its OWN axis. The y and z
    // fractions were previously divided by (xdim - 1), which is only correct
    // for cubic grids.
    // NOTE(review): an axis with a single vertex makes this a 0/0 division.
    const vtkm::Float32 fx = static_cast<vtkm::Float32>(x) / static_cast<vtkm::Float32>(xdim - 1);
    const vtkm::Float32 fy = static_cast<vtkm::Float32>(y) / static_cast<vtkm::Float32>(ydim - 1);
    const vtkm::Float32 fz = static_cast<vtkm::Float32>(z) / static_cast<vtkm::Float32>(zdim - 1);

    // Map the fractions into the requested bounds, scaled by 3.
    const vtkm::Float32 xx = 3.0f * (xmin + (xmax - xmin) * (fx));
    const vtkm::Float32 yy = 3.0f * (ymin + (ymax - ymin) * (fy));
    const vtkm::Float32 zz = 3.0f * (zmin + (zmax - zmin) * (fz));

    // Sum of (t^4 - 5 t^2) over the three axes, then offset and rescaled.
    v = (xx * xx * xx * xx - 5.0f * xx * xx + yy * yy * yy * yy - 5.0f * yy * yy +
         zz * zz * zz * zz - 5.0f * zz * zz + 11.8f) *
        0.2f +
      0.5f;
  }
};
// Construct an input data set using the tangle field worklet.
//
// dims is the number of cells along each axis; the grid therefore has
// dims + 1 points per axis. The returned data set carries a uniform
// coordinate system named "coordinates", a point field named "nodevar"
// holding the tangle field values, and a 3D structured cell set named "cells".
vtkm::cont::DataSet MakeIsosurfaceTestDataSet(vtkm::Id3 dims)
{
  vtkm::cont::DataSet dataSet;

  // Point dimensions are one larger than the cell dimensions on every axis.
  const vtkm::Id3 vdims(dims[0] + 1, dims[1] + 1, dims[2] + 1);

  vtkm::Float32 mins[3] = { -1.0f, -1.0f, -1.0f };
  vtkm::Float32 maxs[3] = { 1.0f, 1.0f, 1.0f };

  // Evaluate the tangle field once per point, driven by an implicit 0..N-1
  // array of vertex ids.
  vtkm::cont::ArrayHandle<vtkm::Float32> fieldArray;
  vtkm::cont::ArrayHandleCounting<vtkm::Id> vertexCountImplicitArray(
    0, 1, vdims[0] * vdims[1] * vdims[2]);
  vtkm::worklet::DispatcherMapField<TangleField> tangleFieldDispatcher(
    TangleField(vdims, mins, maxs));
  tangleFieldDispatcher.Invoke(vertexCountImplicitArray, fieldArray);

  // Spacing of 1 / cell-count per axis. The y and z components were
  // previously swapped (dims[2] for y, dims[1] for z), which is only
  // harmless when the grid is cubic.
  vtkm::Vec<vtkm::FloatDefault, 3> origin(0.0f, 0.0f, 0.0f);
  vtkm::Vec<vtkm::FloatDefault, 3> spacing(1.0f / static_cast<vtkm::FloatDefault>(dims[0]),
                                           1.0f / static_cast<vtkm::FloatDefault>(dims[1]),
                                           1.0f / static_cast<vtkm::FloatDefault>(dims[2]));

  vtkm::cont::ArrayHandleUniformPointCoordinates coordinates(vdims, origin, spacing);
  dataSet.AddCoordinateSystem(vtkm::cont::CoordinateSystem("coordinates", coordinates));

  dataSet.AddField(
    vtkm::cont::Field("nodevar", vtkm::cont::Field::Association::POINTS, fieldArray));

  static const vtkm::IdComponent ndim = 3;
  vtkm::cont::CellSetStructured<ndim> cellSet("cells");
  cellSet.SetPointDimensions(vdims);
  dataSet.AddCellSet(cellSet);

  return dataSet;
}
}
namespace vtkm
{
namespace worklet
......@@ -141,94 +52,82 @@ public:
// Run a simple worklet, and compute an isosurface
// NOTE(review): this span appears to interleave removed and added diff lines
// (e.g. N and usingManagedMemory are each declared twice, and the braces do
// not balance) -- confirm against the real file before relying on it.
int main(int argc, char* argv[])
{
vtkm::Int64 N = 1024 * 1024 * 1024;
vtkm::cont::Initialize(argc, argv);
// Default element count: 4 * 512^3 values.
vtkm::Int64 N = 4 * 512 * 512 * 512;
// An optional first command-line argument overrides / scales N.
if (argc > 1)
N = N * atoi(argv[1]);
else
N = N * 4;
std::cout << "Testing streaming worklet with size " << N << std::endl;
{
N = atoi(argv[1]);
}
// Report the input size in bytes (N values of type vtkm::Int64).
std::cout << "Testing streaming worklet on "
<< vtkm::cont::GetHumanReadableSize(N * sizeof(vtkm::Int64)) << std::endl;
vtkm::cont::ArrayHandle<vtkm::Int64> input;
vtkm::cont::ArrayHandle<vtkm::Float32> output;
// Host-side input: the values 0 .. N-1.
std::vector<vtkm::Int64> data(N);
for (vtkm::Int64 i = 0; i < N; i++)
data[i] = i;
input = vtkm::cont::make_ArrayHandle(data);
using DeviceAlgorithms = vtkm::cont::DeviceAdapterAlgorithm<VTKM_DEFAULT_DEVICE_ADAPTER_TAG>;
using DeviceTag = vtkm::cont::DeviceAdapterTagCuda;
// Remember whether the allocator is currently using managed memory.
const bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
vtkm::worklet::SineWorklet sineWorklet;
bool usingManagedMemory = vtkm::cont::cuda::internal::CudaAllocator::UsingManagedMemory();
if (usingManagedMemory)
{
std::cout << "Testing with unified memory" << std::endl;
vtkm::cont::ArrayHandle<vtkm::Int64> input = vtkm::cont::make_ArrayHandle(data);
vtkm::cont::ArrayHandle<vtkm::Float32> output;
std::cout << "Testing with unified memory" << std::endl;
vtkm::worklet::DispatcherMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
dispatcher.SetDevice(DeviceTag{});
vtkm::cont::Timer<> timer;
//run once to get the CUDA code warmed up
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
// Time three invocations; the timer is created after the warm-up run above.
vtkm::cont::Timer<DeviceTag> timer;
for (int i = 0; i < 3; ++i)
{
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
}
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
std::cout << "Time for 3 iterations with managed memory: " << elapsedTime << std::endl;
}
else
{
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
vtkm::Id NBlocks = N / (1024 * 1024 * 1024);
NBlocks *= 2;
dispatcher.SetNumberOfBlocks(NBlocks);
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
vtkm::cont::Timer<> timer;
if (usingManagedMemory)
{ //disable managed memory if it is enabled to get
//the correct performance numbers on GPU's that support managed memory
vtkm::cont::cuda::internal::CudaAllocator::ForceManagedMemoryOff();
}
// One streaming block per 2^25 bytes (32 MiB) of input data ...
vtkm::Id NBlocks = (N * sizeof(vtkm::Int64)) / (1 << 25);
// ... but never fewer than one block.
NBlocks = std::max(vtkm::Id(1), NBlocks);
vtkm::worklet::DispatcherStreamingMapField<vtkm::worklet::SineWorklet> dispatcher(sineWorklet);
dispatcher.SetNumberOfBlocks(NBlocks);
vtkm::cont::ArrayHandle<vtkm::Int64> input = vtkm::cont::make_ArrayHandle(data);
vtkm::cont::ArrayHandle<vtkm::Float32> output;
std::cout << "Testing with streaming (without unified memory) with " << NBlocks << " blocks"
<< std::endl;
//run once to get the CUDA code warmed up
dispatcher.Invoke(input, output);
// Time three streamed invocations after the warm-up run above.
vtkm::cont::Timer<DeviceTag> timer;
for (int i = 0; i < 3; ++i)
{
dispatcher.Invoke(input, output);
std::cout << output.GetPortalConstControl().Get(output.GetNumberOfValues() - 1) << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time: " << elapsedTime << std::endl;
}
// Marching Cubes on a dim^3 tangle-field data set; an optional second
// command-line argument overrides the default dim of 128.
int dim = 128;
if (argc > 2)
dim = atoi(argv[2]);
std::cout << "Testing Marching Cubes with size " << dim << "x" << dim << "x" << dim << std::endl;
vtkm::Id3 dims(dim, dim, dim);
vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 3>> verticesArray, normalsArray;
vtkm::cont::ArrayHandle<vtkm::Float32> scalarsArray;
vtkm::cont::DataSet dataSet = MakeIsosurfaceTestDataSet(dims);
// Contour the "nodevar" field at iso-value 0.5, generating normals and
// keeping duplicate points unmerged.
vtkm::filter::MarchingCubes filter;
filter.SetGenerateNormals(true);
filter.SetMergeDuplicatePoints(false);
filter.SetActiveField("nodevar");
filter.SetIsoValue(0.5);
auto outputData = filter.Execute(dataSet);
//need to extract vertices, normals, and scalars
using VertType = vtkm::cont::ArrayHandle<vtkm::Vec<vtkm::Float32, 3>>;
vtkm::cont::CoordinateSystem coords = outputData.GetCoordinateSystem();
verticesArray = coords.GetData().Cast<VertType>();
normalsArray = outputData.GetField("normals").GetData().Cast<VertType>();
scalarsArray =
outputData.GetField("nodevar").GetData().Cast<vtkm::cont::ArrayHandle<vtkm::Float32>>();
// Print a summary of each extracted output array.
std::cout << "Number of output vertices: " << verticesArray.GetNumberOfValues() << std::endl;
std::cout << "vertices: ";
vtkm::cont::printSummary_ArrayHandle(verticesArray, std::cout);
std::cout << std::endl;
std::cout << "normals: ";
vtkm::cont::printSummary_ArrayHandle(normalsArray, std::cout);
std::cout << std::endl;
std::cout << "scalars: ";
vtkm::cont::printSummary_ArrayHandle(scalarsArray, std::cout);
std::cout << std::endl;
vtkm::Float64 elapsedTime = timer.GetElapsedTime();
std::cout << "Time for 3 iterations: " << elapsedTime << std::endl;
return 0;
}
......@@ -40,10 +40,17 @@ namespace
static std::once_flag IsInitialized;
#endif
// Current allocation mode: true while VTK-m allocates with managed memory.
// It is true only when concurrent pageable managed memory has not been
// disabled by the user and all devices support it.
// Initialization seeds this from HardwareSupportsManagedMemory. The user can
// explicitly turn managed memory off through an environment variable or by
// calling a function on the CudaAllocator. It can be turned back on through
// the CudaAllocator as well, if and only if the underlying hardware supports
// pageable managed memory.
static bool ManagedMemoryEnabled{ false };

// True if the machine's hardware supports concurrent pageable managed memory.
static bool HardwareSupportsManagedMemory{ false };

// Size cut-off used to skip the overhead of cudaMemAdvise and
// cudaMemPrefetchAsync for small buffers.
// Must stay > 0, otherwise those functions will error out.
static std::size_t Threshold{ std::size_t{ 1 } << 20 };
......@@ -64,6 +71,35 @@ bool CudaAllocator::UsingManagedMemory()
return ManagedMemoryEnabled;
}
/// Force CUDA allocations to occur with unmanaged memory.
///
/// The request is honored only when the hardware supports managed memory;
/// otherwise a warning is logged and the allocation mode is left unchanged.
void CudaAllocator::ForceManagedMemoryOff()
{
  // Make sure the one-time initialization has run before the hardware flag
  // is consulted. Without this, a call made before initialization sees the
  // default (false) hardware flag, is rejected with a warning, and the later
  // initialization sets ManagedMemoryEnabled from the hardware support anyway.
  CudaAllocator::Initialize();

  if (!HardwareSupportsManagedMemory)
  {
    VTKM_LOG_F(
      vtkm::cont::LogLevel::Warn,
      "CudaAllocator trying to disable managed memory on hardware that doesn't support it");
    return;
  }

  ManagedMemoryEnabled = false;
  VTKM_LOG_F(vtkm::cont::LogLevel::Info, "CudaAllocator disabling managed memory");
}
/// Force CUDA allocations to occur with managed memory.
///
/// The request is honored only when the hardware supports managed memory;
/// otherwise a warning is logged and the allocation mode is left unchanged.
void CudaAllocator::ForceManagedMemoryOn()
{
  // Make sure the one-time initialization has run before the hardware flag
  // is consulted. Without this, a call made before initialization sees the
  // default (false) hardware flag and is rejected with a warning even on
  // hardware that does support managed memory.
  CudaAllocator::Initialize();

  if (!HardwareSupportsManagedMemory)
  {
    VTKM_LOG_F(vtkm::cont::LogLevel::Warn,
               "CudaAllocator trying to enable managed memory on hardware that doesn't support it");
    return;
  }

  ManagedMemoryEnabled = true;
  VTKM_LOG_F(vtkm::cont::LogLevel::Info, "CudaAllocator enabling managed memory");
}
bool CudaAllocator::IsDevicePointer(const void* ptr)
{
CudaAllocator::Initialize();
......@@ -273,6 +309,13 @@ void CudaAllocator::Initialize()
managedMemorySupported = managedMemorySupported && prop.concurrentManagedAccess;
}
HardwareSupportsManagedMemory = managedMemorySupported;
ManagedMemoryEnabled = managedMemorySupported;
VTKM_LOG_F(vtkm::cont::LogLevel::Info,
"CudaAllocator hardware %s managed memory",
HardwareSupportsManagedMemory ? "supports" : "doesn't support");
// Check if users want to disable managed memory
#pragma warning(push)
// getenv is not thread safe on windows but since it's inside a call_once block so
......@@ -283,9 +326,11 @@ void CudaAllocator::Initialize()
if (managedMemorySupported && buf != nullptr)
{ //only makes sense to disable managed memory if the hardware supports it
//in the first place
managedMemorySupported = (std::string(buf) != "1");
ManagedMemoryEnabled = false;
VTKM_LOG_F(
vtkm::cont::LogLevel::Info,
"CudaAllocator disabling managed memory due to NO_VTKM_MANAGED_MEMORY env variable");
}
ManagedMemoryEnabled = managedMemorySupported;
});
#endif
}
......
......@@ -42,6 +42,14 @@ struct VTKM_CONT_EXPORT CudaAllocator
/// that can be accessed concurrently by the CPU and GPUs.
static VTKM_CONT bool UsingManagedMemory();
/// Force CUDA allocations to occur with unmanaged memory (aka cudaMalloc).
static VTKM_CONT void ForceManagedMemoryOff();
/// Force CUDA allocations to occur with pageable managed memory.
/// If the current hardware doesn't support pageable managed memory
/// VTK-m will ignore the request and continue to use unmanaged memory (aka cudaMalloc).
static VTKM_CONT void ForceManagedMemoryOn();
/// Returns true if the pointer is accessible from a CUDA device.
static VTKM_CONT bool IsDevicePointer(const void* ptr);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment