Commit 571556d9 authored by Robert Maynard's avatar Robert Maynard
Browse files

CUDA's RuntimeDeviceTracker and Timer are now built as part of vtkm_cont

This is done to not only reduce the amount of code that users need
to generate but to reduce the amount of errors when using
the RuntimeDeviceTracker. If the runtime device tracker is initially
used in a library by a c++ file it will never properly detect the
cuda backend. By moving the code into vtkm_cont we can make sure
this problem doesn't occur.
parent 2a75dd50
......@@ -24,7 +24,9 @@ set(headers
CudaAllocator.h
DeviceAdapterAlgorithmCuda.h
DeviceAdapterAlgorithmThrust.h
DeviceAdapterRuntimeDetectorCuda.h
DeviceAdapterTagCuda.h
DeviceAdapterTimerImplementationCuda.h
ExecutionArrayInterfaceBasicCuda.h
MakeThrustIterator.h
TaskTuner.h
......@@ -41,7 +43,9 @@ endif()
target_sources(vtkm_cont PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/ArrayManagerExecutionCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/CudaAllocator.cu
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterAlgorithmThrust.cu
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterRuntimeDetectorCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterTimerImplementationCuda.cu
${CMAKE_CURRENT_SOURCE_DIR}/ExecutionArrayInterfaceBasicCuda.cu
)
......@@ -31,29 +31,14 @@
// Here are the actual implementation of the algorithms.
#include <vtkm/cont/cuda/internal/DeviceAdapterAlgorithmThrust.h>
// Here are the implementations of device adapter specific classes
#include <vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.h>
#include <vtkm/exec/cuda/internal/TaskStrided.h>
#include <cuda.h>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
static __global__ void DetermineIfValidCudaDevice()
{
//used only to see if we can launch kernels. It is possible to have a
//CUDA capable device, but still fail to have CUDA support.
}
}
}
}
}
namespace vtkm
{
namespace cont
......@@ -71,129 +56,6 @@ struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
}
};
/// CUDA contains its own high resolution timer.
///
template <>
class DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT DeviceAdapterTimerImplementation()
{
VTKM_CUDA_CALL(cudaEventCreate(&this->StartEvent));
VTKM_CUDA_CALL(cudaEventCreate(&this->EndEvent));
this->Reset();
}
VTKM_CONT ~DeviceAdapterTimerImplementation()
{
// These aren't wrapped in VTKM_CUDA_CALL because we can't throw errors
// from destructors. We're relying on cudaGetLastError in the
// VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR catching any issues from these calls
// later.
cudaEventDestroy(this->StartEvent);
cudaEventDestroy(this->EndEvent);
}
VTKM_CONT void Reset()
{
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
}
VTKM_CONT vtkm::Float64 GetElapsedTime()
{
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(
cudaEventElapsedTime(&elapsedTimeMilliseconds, this->StartEvent, this->EndEvent));
return static_cast<vtkm::Float64>(0.001f * elapsedTimeMilliseconds);
}
private:
// Copying CUDA events is problematic.
DeviceAdapterTimerImplementation(
const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) = delete;
void operator=(const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) =
delete;
cudaEvent_t StartEvent;
cudaEvent_t EndEvent;
};
/// \brief Class providing a CUDA runtime support detector.
///
/// The class provide the actual implementation used by
/// vtkm::cont::RuntimeDeviceInformation for the CUDA backend.
///
/// We will verify at runtime that the machine has at least one CUDA
/// capable device, and said device is from the 'fermi' (SM_20) generation
/// or newer.
///
template <>
class DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT DeviceAdapterRuntimeDetector()
: NumberOfDevices(0)
, HighestArchSupported(0)
{
static bool deviceQueryInit = false;
static int numDevices = 0;
static int archVersion = 0;
if (!deviceQueryInit)
{
deviceQueryInit = true;
//first query for the number of devices
VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
for (vtkm::Int32 i = 0; i < numDevices; i++)
{
cudaDeviceProp prop;
VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i));
const vtkm::Int32 arch = (prop.major * 10) + prop.minor;
archVersion = vtkm::Max(arch, archVersion);
}
//Make sure we can actually launch a kernel. This could fail for any
//of the following reasons:
//
// 1. cudaErrorInsufficientDriver, caused by out of data drives
// 2. cudaErrorDevicesUnavailable, caused by another process locking the
// device or somebody disabling cuda support on the device
// 3. cudaErrorNoKernelImageForDevice we built for a compute version
// greater than the device we are running on
// Most likely others that I don't even know about
vtkm::cont::cuda::internal::DetermineIfValidCudaDevice<<<1, 1, 0, cudaStreamPerThread>>>();
if (cudaSuccess != cudaGetLastError())
{
numDevices = 0;
archVersion = 0;
}
}
this->NumberOfDevices = numDevices;
this->HighestArchSupported = archVersion;
}
/// Returns true if the given device adapter is supported on the current
/// machine.
///
/// Only returns true if we have at-least one CUDA capable device of SM_20 or
/// greater ( fermi ).
///
VTKM_CONT bool Exists() const
{
//
return this->NumberOfDevices > 0 && this->HighestArchSupported >= 20;
}
private:
vtkm::Int32 NumberOfDevices;
vtkm::Int32 HighestArchSupported;
};
/// CUDA contains its own atomic operations
///
template <typename T>
......
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.h>
#include <vtkm/Math.h>
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <cuda.h>
namespace vtkm
{
namespace cont
{
namespace cuda
{
namespace internal
{
static __global__ void DetermineIfValidCudaDevice()
{
//used only to see if we can launch kernels. It is possible to have a
//CUDA capable device, but still fail to have CUDA support.
}
}
}
}
}
namespace vtkm
{
namespace cont
{
DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>::DeviceAdapterRuntimeDetector()
: NumberOfDevices(0)
, HighestArchSupported(0)
{
static bool deviceQueryInit = false;
static int numDevices = 0;
static int archVersion = 0;
if (!deviceQueryInit)
{
deviceQueryInit = true;
//first query for the number of devices
VTKM_CUDA_CALL(cudaGetDeviceCount(&numDevices));
for (vtkm::Int32 i = 0; i < numDevices; i++)
{
cudaDeviceProp prop;
VTKM_CUDA_CALL(cudaGetDeviceProperties(&prop, i));
const vtkm::Int32 arch = (prop.major * 10) + prop.minor;
archVersion = vtkm::Max(arch, archVersion);
}
//Make sure we can actually launch a kernel. This could fail for any
//of the following reasons:
//
// 1. cudaErrorInsufficientDriver, caused by out of data drives
// 2. cudaErrorDevicesUnavailable, caused by another process locking the
// device or somebody disabling cuda support on the device
// 3. cudaErrorNoKernelImageForDevice we built for a compute version
// greater than the device we are running on
// Most likely others that I don't even know about
vtkm::cont::cuda::internal::DetermineIfValidCudaDevice<<<1, 1, 0, cudaStreamPerThread>>>();
if (cudaSuccess != cudaGetLastError())
{
numDevices = 0;
archVersion = 0;
}
}
this->NumberOfDevices = numDevices;
this->HighestArchSupported = archVersion;
}
bool DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>::Exists() const
{
return this->NumberOfDevices > 0 && this->HighestArchSupported >= 20;
}
}
} // namespace vtkm::cont
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterRuntimeDetectorCuda_h
#define vtk_m_cont_cuda_internal_DeviceAdapterRuntimeDetectorCuda_h
#include <vtkm/cont/vtkm_cont_export.h>
#include <vtkm/Types.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
namespace vtkm
{
namespace cont
{
/// \brief Class providing a CUDA runtime support detector.
///
/// The class provide the actual implementation used by
/// vtkm::cont::RuntimeDeviceInformation for the CUDA backend.
///
/// We will verify at runtime that the machine has at least one CUDA
/// capable device, and said device is from the 'fermi' (SM_20) generation
/// or newer.
///
template <>
class VTKM_CONT_EXPORT DeviceAdapterRuntimeDetector<vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT DeviceAdapterRuntimeDetector();
/// Returns true if the given device adapter is supported on the current
/// machine.
///
/// Only returns true if we have at-least one CUDA capable device of SM_20 or
/// greater ( fermi ).
///
VTKM_CONT bool Exists() const;
private:
vtkm::Int32 NumberOfDevices;
vtkm::Int32 HighestArchSupported;
};
}
} // namespace vtkm::cont
#endif
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#include <vtkm/cont/cuda/internal/DeviceAdapterTimerImplementationCuda.h>
#include <vtkm/Types.h>
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <cuda.h>
namespace vtkm
{
namespace cont
{
DeviceAdapterTimerImplementation<
vtkm::cont::DeviceAdapterTagCuda>::DeviceAdapterTimerImplementation()
{
VTKM_CUDA_CALL(cudaEventCreate(&this->StartEvent));
VTKM_CUDA_CALL(cudaEventCreate(&this->EndEvent));
this->Reset();
}
DeviceAdapterTimerImplementation<
vtkm::cont::DeviceAdapterTagCuda>::~DeviceAdapterTimerImplementation()
{
// These aren't wrapped in VTKM_CUDA_CALL because we can't throw errors
// from destructors. We're relying on cudaGetLastError in the
// VTKM_CUDA_CHECK_ASYNCHRONOUS_ERROR catching any issues from these calls
// later.
cudaEventDestroy(this->StartEvent);
cudaEventDestroy(this->EndEvent);
}
void DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>::Reset()
{
VTKM_CUDA_CALL(cudaEventRecord(this->StartEvent, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(this->StartEvent));
}
vtkm::Float64 DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>::GetElapsedTime()
{
VTKM_CUDA_CALL(cudaEventRecord(this->EndEvent, cudaStreamPerThread));
VTKM_CUDA_CALL(cudaEventSynchronize(this->EndEvent));
float elapsedTimeMilliseconds;
VTKM_CUDA_CALL(cudaEventElapsedTime(&elapsedTimeMilliseconds, this->StartEvent, this->EndEvent));
return static_cast<vtkm::Float64>(0.001f * elapsedTimeMilliseconds);
}
}
} // namespace vtkm::cont
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2014 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2014 UT-Battelle, LLC.
// Copyright 2014 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_DeviceAdapterRuntimeDetectorCuda_h
#define vtk_m_cont_cuda_internal_DeviceAdapterRuntimeDetectorCuda_h
#include <vtkm/cont/vtkm_cont_export.h>
#include <vtkm/Types.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
#include <cuda.h>
namespace vtkm
{
namespace cont
{
///
/// Specialization of DeviceAdapterTimerImplementation for CUDA
/// CUDA contains its own high resolution timer that are able
/// to track how long it takes to execute async kernels.
/// If we simply measured time on the CPU it would incorrectly
/// just capture how long it takes to launch a kernel.
template <>
class VTKM_CONT_EXPORT DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>
{
public:
VTKM_CONT DeviceAdapterTimerImplementation();
VTKM_CONT ~DeviceAdapterTimerImplementation();
VTKM_CONT void Reset();
VTKM_CONT vtkm::Float64 GetElapsedTime();
private:
// Copying CUDA events is problematic.
DeviceAdapterTimerImplementation(
const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) = delete;
void operator=(const DeviceAdapterTimerImplementation<vtkm::cont::DeviceAdapterTagCuda>&) =
delete;
cudaEvent_t StartEvent;
cudaEvent_t EndEvent;
};
}
} // namespace vtkm::cont
#endif
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment