Commit 634f523d authored by Haocheng LIU's avatar Haocheng LIU

Merge benchmark executables into a device dependent shared library

VTK-m has been updated to replace old per device benchmark executables with a device
dependent shared library so that it's able to accept a device adapter at runtime through
the "--device=" argument.
parent c27a3366
......@@ -22,7 +22,9 @@
#include <vtkm/TypeTraits.h>
#include <vtkm/cont/Algorithm.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/DeviceAdapter.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/Timer.h>
......@@ -40,10 +42,9 @@ namespace vtkm
namespace benchmarking
{
template <typename DeviceAdapter>
struct BenchmarkArrayTransfer
{
using Algo = vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapter>;
using Algo = vtkm::cont::Algorithm;
using StorageTag = vtkm::cont::StorageTagBasic;
using Timer = vtkm::cont::Timer;
......@@ -139,11 +140,10 @@ struct BenchmarkArrayTransfer
// Copies NumValues from control environment to execution environment and
// accesses them as read-only.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchContToExecRead
{
using ArrayType = vtkm::cont::ArrayHandle<ValueType, StorageTag>;
using PortalType = typename ArrayType::template ExecutionTypes<DeviceAdapter>::PortalConst;
using ValueTypeTraits = vtkm::TypeTraits<ValueType>;
vtkm::Id NumValues;
......@@ -164,7 +164,7 @@ struct BenchmarkArrayTransfer
}
VTKM_CONT
vtkm::Float64 operator()()
vtkm::Float64 operator()() const
{
std::vector<ValueType> vec(static_cast<std::size_t>(this->NumValues),
ValueTypeTraits::ZeroInitialization());
......@@ -173,8 +173,8 @@ struct BenchmarkArrayTransfer
// Time the copy:
Timer timer{ DeviceAdapter() };
timer.Start();
ReadValues<PortalType> functor(array.PrepareForInput(DeviceAdapter()),
ValueTypeTraits::ZeroInitialization());
auto portal = array.PrepareForInput(DeviceAdapter());
ReadValues<decltype(portal)> functor(portal, ValueTypeTraits::ZeroInitialization());
Algo::Schedule(functor, this->NumValues);
return timer.GetElapsedTime();
}
......@@ -183,11 +183,10 @@ struct BenchmarkArrayTransfer
// Writes values to ArrayHandle in execution environment. There is no actual
// copy between control/execution in this case.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchContToExecWrite
{
using ArrayType = vtkm::cont::ArrayHandle<ValueType, StorageTag>;
using PortalType = typename ArrayType::template ExecutionTypes<DeviceAdapter>::Portal;
using ValueTypeTraits = vtkm::TypeTraits<ValueType>;
vtkm::Id NumValues;
......@@ -208,14 +207,15 @@ struct BenchmarkArrayTransfer
}
VTKM_CONT
vtkm::Float64 operator()()
vtkm::Float64 operator()() const
{
ArrayType array;
// Time the write:
Timer timer{ DeviceAdapter() };
timer.Start();
WriteValues<PortalType> functor(array.PrepareForOutput(this->NumValues, DeviceAdapter()));
auto portal = array.PrepareForOutput(this->NumValues, DeviceAdapter());
WriteValues<decltype(portal)> functor(portal);
Algo::Schedule(functor, this->NumValues);
return timer.GetElapsedTime();
......@@ -225,11 +225,10 @@ struct BenchmarkArrayTransfer
// Copies NumValues from control environment to execution environment and
// both reads and writes them.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchContToExecReadWrite
{
using ArrayType = vtkm::cont::ArrayHandle<ValueType, StorageTag>;
using PortalType = typename ArrayType::template ExecutionTypes<DeviceAdapter>::Portal;
using ValueTypeTraits = vtkm::TypeTraits<ValueType>;
vtkm::Id NumValues;
......@@ -250,7 +249,7 @@ struct BenchmarkArrayTransfer
}
VTKM_CONT
vtkm::Float64 operator()()
vtkm::Float64 operator()() const
{
std::vector<ValueType> vec(static_cast<std::size_t>(this->NumValues),
ValueTypeTraits::ZeroInitialization());
......@@ -259,7 +258,8 @@ struct BenchmarkArrayTransfer
// Time the copy:
Timer timer{ DeviceAdapter() };
timer.Start();
ReadWriteValues<PortalType> functor(array.PrepareForInPlace(DeviceAdapter()));
auto portal = array.PrepareForInPlace(DeviceAdapter());
ReadWriteValues<decltype(portal)> functor(portal);
Algo::Schedule(functor, this->NumValues);
return timer.GetElapsedTime();
}
......@@ -268,7 +268,7 @@ struct BenchmarkArrayTransfer
// Copies NumValues from control environment to execution environment and
// back, then accesses them as read-only.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchRoundTripRead
{
using ArrayType = vtkm::cont::ArrayHandle<ValueType, StorageTag>;
......@@ -295,7 +295,7 @@ struct BenchmarkArrayTransfer
}
VTKM_CONT
vtkm::Float64 operator()()
vtkm::Float64 operator()() const
{
std::vector<ValueType> vec(static_cast<std::size_t>(this->NumValues),
ValueTypeTraits::ZeroInitialization());
......@@ -309,8 +309,8 @@ struct BenchmarkArrayTransfer
timer.Start();
// Copy to device:
ReadValues<PortalExecType> functor(array.PrepareForInput(DeviceAdapter()),
ValueTypeTraits::ZeroInitialization());
auto portal = array.PrepareForInput(DeviceAdapter());
ReadValues<PortalExecType> functor(portal, ValueTypeTraits::ZeroInitialization());
Algo::Schedule(functor, this->NumValues);
// Copy back to host and read:
......@@ -328,7 +328,7 @@ struct BenchmarkArrayTransfer
// Copies NumValues from control environment to execution environment and
// back, then reads and writes them in-place.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchRoundTripReadWrite
{
using ArrayType = vtkm::cont::ArrayHandle<ValueType, StorageTag>;
......@@ -355,7 +355,7 @@ struct BenchmarkArrayTransfer
}
VTKM_CONT
vtkm::Float64 operator()()
vtkm::Float64 operator()() const
{
std::vector<ValueType> vec(static_cast<std::size_t>(this->NumValues),
ValueTypeTraits::ZeroInitialization());
......@@ -369,7 +369,8 @@ struct BenchmarkArrayTransfer
timer.Start();
// Do work on device:
ReadWriteValues<PortalExecType> functor(array.PrepareForInPlace(DeviceAdapter()));
auto portal = array.PrepareForInPlace(DeviceAdapter());
ReadWriteValues<PortalExecType> functor(portal);
Algo::Schedule(functor, this->NumValues);
ReadWriteValues<PortalContType> cFunctor(array.GetPortalControl());
......@@ -385,7 +386,7 @@ struct BenchmarkArrayTransfer
// Write NumValues to device allocated memory and copies them back to control
// for reading.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchExecToContRead
{
using ArrayType = vtkm::cont::ArrayHandle<ValueType, StorageTag>;
......@@ -412,7 +413,7 @@ struct BenchmarkArrayTransfer
}
VTKM_CONT
vtkm::Float64 operator()()
vtkm::Float64 operator()() const
{
ArrayType array;
......@@ -421,7 +422,8 @@ struct BenchmarkArrayTransfer
timer.Start();
// Allocate/write data on device
WriteValues<PortalExecType> functor(array.PrepareForOutput(this->NumValues, DeviceAdapter()));
auto portal = array.PrepareForOutput(this->NumValues, DeviceAdapter());
WriteValues<PortalExecType> functor(portal);
Algo::Schedule(functor, this->NumValues);
// Read back on host:
......@@ -439,7 +441,7 @@ struct BenchmarkArrayTransfer
// Write NumValues to device allocated memory and copies them back to control
// and overwrites them.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchExecToContWrite
{
using ArrayType = vtkm::cont::ArrayHandle<ValueType, StorageTag>;
......@@ -475,7 +477,8 @@ struct BenchmarkArrayTransfer
timer.Start();
// Allocate/write data on device
WriteValues<PortalExecType> functor(array.PrepareForOutput(this->NumValues, DeviceAdapter()));
auto portal = array.PrepareForOutput(this->NumValues, DeviceAdapter());
WriteValues<PortalExecType> functor(portal);
Algo::Schedule(functor, this->NumValues);
// Read back on host:
......@@ -492,7 +495,7 @@ struct BenchmarkArrayTransfer
// Write NumValues to device allocated memory and copies them back to control
// for reading and writing.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchExecToContReadWrite
{
using ArrayType = vtkm::cont::ArrayHandle<ValueType, StorageTag>;
......@@ -528,7 +531,8 @@ struct BenchmarkArrayTransfer
timer.Start();
// Allocate/write data on device
WriteValues<PortalExecType> functor(array.PrepareForOutput(this->NumValues, DeviceAdapter()));
auto portal = array.PrepareForOutput(this->NumValues, DeviceAdapter());
WriteValues<PortalExecType> functor(portal);
Algo::Schedule(functor, this->NumValues);
// Read back on host:
......@@ -547,17 +551,16 @@ struct BenchmarkArrayTransfer
using TestTypes = vtkm::ListTagBase<vtkm::Float32>;
static VTKM_CONT bool Run()
static VTKM_CONT bool Run(vtkm::cont::DeviceAdapterId id)
{
VTKM_RUN_BENCHMARK(ContToExecRead, TestTypes());
VTKM_RUN_BENCHMARK(ContToExecWrite, TestTypes());
VTKM_RUN_BENCHMARK(ContToExecReadWrite, TestTypes());
VTKM_RUN_BENCHMARK(RoundTripRead, TestTypes());
VTKM_RUN_BENCHMARK(RoundTripReadWrite, TestTypes());
VTKM_RUN_BENCHMARK(ExecToContRead, TestTypes());
VTKM_RUN_BENCHMARK(ExecToContWrite, TestTypes());
VTKM_RUN_BENCHMARK(ExecToContReadWrite, TestTypes());
VTKM_RUN_BENCHMARK(ContToExecRead, TestTypes(), id);
VTKM_RUN_BENCHMARK(ContToExecWrite, TestTypes(), id);
VTKM_RUN_BENCHMARK(ContToExecReadWrite, TestTypes(), id);
VTKM_RUN_BENCHMARK(RoundTripRead, TestTypes(), id);
VTKM_RUN_BENCHMARK(RoundTripReadWrite, TestTypes(), id);
VTKM_RUN_BENCHMARK(ExecToContRead, TestTypes(), id);
VTKM_RUN_BENCHMARK(ExecToContWrite, TestTypes(), id);
VTKM_RUN_BENCHMARK(ExecToContReadWrite, TestTypes(), id);
return true;
}
};
......@@ -566,14 +569,11 @@ struct BenchmarkArrayTransfer
int main(int argc, char* argv[])
{
vtkm::cont::InitLogging(argc, argv);
using DeviceAdapter = VTKM_DEFAULT_DEVICE_ADAPTER_TAG;
using Benchmarks = vtkm::benchmarking::BenchmarkArrayTransfer<DeviceAdapter>;
auto opts = vtkm::cont::InitializeOptions::RequireDevice;
auto config = vtkm::cont::Initialize(argc, argv, opts);
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(DeviceAdapter{});
using Benchmarks = vtkm::benchmarking::BenchmarkArrayTransfer;
bool result = Benchmarks::Run();
bool result = Benchmarks::Run(config.Device);
return result ? EXIT_SUCCESS : EXIT_FAILURE;
}
......@@ -24,6 +24,7 @@
#include <vtkm/cont/AtomicArray.h>
#include <vtkm/cont/RuntimeDeviceTracker.h>
#include <vtkm/cont/Timer.h>
#include <vtkm/cont/internal/DeviceAdapterTag.h>
#include <vtkm/exec/FunctorBase.h>
......@@ -48,25 +49,24 @@ static constexpr vtkm::Id NumWrites = 33554432; // 2^25
VTKM_MAKE_BENCHMARK(Name##32768, Class, 32768); \
VTKM_MAKE_BENCHMARK(Name##1048576, Class, 1048576)
#define RUN_ATOMIC_BENCHMARKS(Name) \
VTKM_RUN_BENCHMARK(Name##1, vtkm::cont::AtomicArrayTypeListTag{}); \
VTKM_RUN_BENCHMARK(Name##8, vtkm::cont::AtomicArrayTypeListTag{}); \
VTKM_RUN_BENCHMARK(Name##32, vtkm::cont::AtomicArrayTypeListTag{}); \
VTKM_RUN_BENCHMARK(Name##512, vtkm::cont::AtomicArrayTypeListTag{}); \
VTKM_RUN_BENCHMARK(Name##2048, vtkm::cont::AtomicArrayTypeListTag{}); \
VTKM_RUN_BENCHMARK(Name##32768, vtkm::cont::AtomicArrayTypeListTag{}); \
VTKM_RUN_BENCHMARK(Name##1048576, vtkm::cont::AtomicArrayTypeListTag{})
#define RUN_ATOMIC_BENCHMARKS(Name, id) \
VTKM_RUN_BENCHMARK(Name##1, vtkm::cont::AtomicArrayTypeListTag{}, id); \
VTKM_RUN_BENCHMARK(Name##8, vtkm::cont::AtomicArrayTypeListTag{}, id); \
VTKM_RUN_BENCHMARK(Name##32, vtkm::cont::AtomicArrayTypeListTag{}, id); \
VTKM_RUN_BENCHMARK(Name##512, vtkm::cont::AtomicArrayTypeListTag{}, id); \
VTKM_RUN_BENCHMARK(Name##2048, vtkm::cont::AtomicArrayTypeListTag{}, id); \
VTKM_RUN_BENCHMARK(Name##32768, vtkm::cont::AtomicArrayTypeListTag{}, id); \
VTKM_RUN_BENCHMARK(Name##1048576, vtkm::cont::AtomicArrayTypeListTag{}, id)
template <class Device>
class BenchmarkAtomicArray
{
public:
using Algo = vtkm::cont::DeviceAdapterAlgorithm<Device>;
using Algo = vtkm::cont::Algorithm;
using Timer = vtkm::cont::Timer;
// Benchmarks AtomicArray::Add such that each work index writes to adjacent
// indices.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchAddSeq
{
vtkm::Id ArraySize;
......@@ -92,17 +92,17 @@ public:
BenchAddSeq(vtkm::Id arraySize)
: ArraySize(arraySize)
{
this->Data.PrepareForOutput(this->ArraySize, Device{});
this->Data.PrepareForOutput(this->ArraySize, DeviceAdapter());
}
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::AtomicArray<ValueType> array(this->Data);
auto portal = array.PrepareForExecution(Device{});
auto portal = array.PrepareForExecution(DeviceAdapter());
Worker<decltype(portal)> worker{ this->ArraySize, portal };
Timer timer{ Device() };
Timer timer{ DeviceAdapter() };
timer.Start();
Algo::Schedule(worker, NumWrites);
......@@ -120,7 +120,7 @@ public:
MAKE_ATOMIC_BENCHMARKS(AddSeq, BenchAddSeq);
// Provides a non-atomic baseline for BenchAddSeq
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchAddSeqBaseline
{
vtkm::Id ArraySize;
......@@ -155,10 +155,10 @@ public:
VTKM_CONT
vtkm::Float64 operator()()
{
auto portal = this->Data.PrepareForOutput(this->ArraySize, Device{});
auto portal = this->Data.PrepareForOutput(this->ArraySize, DeviceAdapter());
Worker<decltype(portal)> worker{ this->ArraySize, portal };
Timer timer{ Device() };
Timer timer{ DeviceAdapter() };
timer.Start();
Algo::Schedule(worker, NumWrites);
......@@ -177,7 +177,7 @@ public:
// Benchmarks AtomicArray::Add such that each work index writes to a strided
// index ( floor(i / stride) + stride * (i % stride)
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchAddStride
{
vtkm::Id ArraySize;
......@@ -211,17 +211,17 @@ public:
: ArraySize(arraySize)
, Stride(stride)
{
this->Data.PrepareForOutput(this->ArraySize, Device{});
this->Data.PrepareForOutput(this->ArraySize, DeviceAdapter());
}
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::AtomicArray<ValueType> array(this->Data);
auto portal = array.PrepareForExecution(Device{});
auto portal = array.PrepareForExecution(DeviceAdapter());
Worker<decltype(portal)> worker{ this->ArraySize, this->Stride, portal };
Timer timer{ Device() };
Timer timer{ DeviceAdapter() };
timer.Start();
Algo::Schedule(worker, NumWrites);
......@@ -240,7 +240,7 @@ public:
MAKE_ATOMIC_BENCHMARKS(AddStride, BenchAddStride);
// Non-atomic baseline for AddStride
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchAddStrideBaseline
{
vtkm::Id ArraySize;
......@@ -279,10 +279,10 @@ public:
VTKM_CONT
vtkm::Float64 operator()()
{
auto portal = this->Data.PrepareForOutput(this->ArraySize, Device{});
auto portal = this->Data.PrepareForOutput(this->ArraySize, DeviceAdapter());
Worker<decltype(portal)> worker{ this->ArraySize, this->Stride, portal };
Timer timer{ Device() };
Timer timer{ DeviceAdapter() };
timer.Start();
Algo::Schedule(worker, NumWrites);
......@@ -302,7 +302,7 @@ public:
// Benchmarks AtomicArray::CompareAndSwap such that each work index writes to adjacent
// indices.
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchCASSeq
{
vtkm::Id ArraySize;
......@@ -340,17 +340,17 @@ public:
BenchCASSeq(vtkm::Id arraySize)
: ArraySize(arraySize)
{
this->Data.PrepareForOutput(this->ArraySize, Device{});
this->Data.PrepareForOutput(this->ArraySize, DeviceAdapter());
}
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::AtomicArray<ValueType> array(this->Data);
auto portal = array.PrepareForExecution(Device{});
auto portal = array.PrepareForExecution(DeviceAdapter());
Worker<decltype(portal)> worker{ this->ArraySize, portal };
Timer timer{ Device() };
Timer timer{ DeviceAdapter() };
timer.Start();
Algo::Schedule(worker, NumWrites);
......@@ -368,7 +368,7 @@ public:
MAKE_ATOMIC_BENCHMARKS(CASSeq, BenchCASSeq);
// Provides a non-atomic baseline for BenchCASSeq
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchCASSeqBaseline
{
vtkm::Id ArraySize;
......@@ -405,10 +405,10 @@ public:
VTKM_CONT
vtkm::Float64 operator()()
{
auto portal = this->Data.PrepareForOutput(this->ArraySize, Device{});
auto portal = this->Data.PrepareForOutput(this->ArraySize, DeviceAdapter());
Worker<decltype(portal)> worker{ this->ArraySize, portal };
Timer timer{ Device() };
Timer timer{ DeviceAdapter() };
timer.Start();
Algo::Schedule(worker, NumWrites);
return timer.GetElapsedTime();
......@@ -427,7 +427,7 @@ public:
// Benchmarks AtomicArray::CompareAndSwap such that each work index writes to
// a strided index:
// ( floor(i / stride) + stride * (i % stride)
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchCASStride
{
vtkm::Id ArraySize;
......@@ -469,17 +469,17 @@ public:
: ArraySize(arraySize)
, Stride(stride)
{
this->Data.PrepareForOutput(this->ArraySize, Device{});
this->Data.PrepareForOutput(this->ArraySize, DeviceAdapter());
}
VTKM_CONT
vtkm::Float64 operator()()
{
vtkm::cont::AtomicArray<ValueType> array(this->Data);
auto portal = array.PrepareForExecution(Device{});
auto portal = array.PrepareForExecution(DeviceAdapter());
Worker<decltype(portal)> worker{ this->ArraySize, this->Stride, portal };
Timer timer{ Device() };
Timer timer{ DeviceAdapter() };
timer.Start();
Algo::Schedule(worker, NumWrites);
......@@ -498,7 +498,7 @@ public:
MAKE_ATOMIC_BENCHMARKS(CASStride, BenchCASStride);
// Non-atomic baseline for CASStride
template <typename ValueType>
template <typename ValueType, typename DeviceAdapter>
struct BenchCASStrideBaseline
{
vtkm::Id ArraySize;
......@@ -539,10 +539,10 @@ public:
VTKM_CONT
vtkm::Float64 operator()()
{
auto portal = this->Data.PrepareForOutput(this->ArraySize, Device{});
auto portal = this->Data.PrepareForOutput(this->ArraySize, DeviceAdapter());
Worker<decltype(portal)> worker{ this->ArraySize, this->Stride, portal };
Timer timer{ Device() };
Timer timer{ DeviceAdapter() };
timer.Start();
Algo::Schedule(worker, NumWrites);
......@@ -560,17 +560,17 @@ public:
};
MAKE_ATOMIC_BENCHMARKS(CASStrideBase, BenchCASStrideBaseline);
static void Run()
static void Run(vtkm::cont::DeviceAdapterId id)
{
RUN_ATOMIC_BENCHMARKS(AddSeq);
RUN_ATOMIC_BENCHMARKS(AddSeqBase);
RUN_ATOMIC_BENCHMARKS(AddStride);
RUN_ATOMIC_BENCHMARKS(AddStrideBase);
RUN_ATOMIC_BENCHMARKS(CASSeq);
RUN_ATOMIC_BENCHMARKS(CASSeqBase);
RUN_ATOMIC_BENCHMARKS(CASStride);
RUN_ATOMIC_BENCHMARKS(CASStrideBase);
RUN_ATOMIC_BENCHMARKS(AddSeq, id);
RUN_ATOMIC_BENCHMARKS(AddSeqBase, id);
RUN_ATOMIC_BENCHMARKS(AddStride, id);
RUN_ATOMIC_BENCHMARKS(AddStrideBase, id);
RUN_ATOMIC_BENCHMARKS(CASSeq, id);
RUN_ATOMIC_BENCHMARKS(CASSeqBase, id);
RUN_ATOMIC_BENCHMARKS(CASStride, id);
RUN_ATOMIC_BENCHMARKS(CASStrideBase, id);
}
};
}
......@@ -578,15 +578,12 @@ public:
int main(int argc, char* argv[])
{
vtkm::cont::InitLogging(argc, argv);
using Device = VTKM_DEFAULT_DEVICE_ADAPTER_TAG;
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(Device{});
auto opts = vtkm::cont::InitializeOptions::RequireDevice;
auto config = vtkm::cont::Initialize(argc, argv, opts);
try
{
vtkm::benchmarking::BenchmarkAtomicArray<Device>::Run();
vtkm::benchmarking::BenchmarkAtomicArray::Run(config.Device);
}
catch (std::exception& e)
{
......
......@@ -37,7 +37,7 @@
#include <iostream>
#include <sstream>
#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB
#ifdef VTKM_ENABLE_TBB
#include <tbb/task_scheduler_init.h>
#endif // TBB
......@@ -58,7 +58,7 @@ const size_t COL_WIDTH = 32;
template <typename ValueType, typename DeviceAdapter>
struct MeasureCopySpeed
{
using Algo = vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapter>;
using Algo = vtkm::cont::Algorithm;
vtkm::cont::ArrayHandle<ValueType> Source;
vtkm::cont::ArrayHandle<ValueType> Destination;
......@@ -106,19 +106,17 @@ void PrintDivider(std::ostream& out)
out << "|-" << fillStr << "-|-" << fillStr << "-|" << std::endl;
}
template <typename ValueType>
void BenchmarkValueType()
template <typename ValueType, typename DeviceAdapter>
void BenchmarkValueType(vtkm::cont::DeviceAdapterId id)
{
PrintRow(std::cout,
vtkm::testing::TypeName<ValueType>::Name(),
vtkm::cont::DeviceAdapterTraits<VTKM_DEFAULT_DEVICE_ADAPTER_TAG>::GetName());
PrintRow(std::cout, vtkm::testing::TypeName<ValueType>::Name(), id.GetName());
PrintDivider(std::cout);
Benchmarker bench(15, 100);
for (vtkm::UInt64 size = COPY_SIZE_MIN; size <= COPY_SIZE_MAX; size <<= COPY_SIZE_INC)
{
MeasureCopySpeed<ValueType, VTKM_DEFAULT_DEVICE_ADAPTER_TAG> functor(size);
MeasureCopySpeed<ValueType, DeviceAdapter> functor(size);
bench.Reset();
std::string speedStr;
......@@ -142,22 +140,58 @@ void BenchmarkValueType()
}
} // end namespace vtkm::benchmarking
namespace
{
using namespace vtkm::benchmarking;
struct BenchmarkValueTypeFunctor
{
template <typename DeviceAdapter>
bool operator()(DeviceAdapter id)
{
BenchmarkValueType<vtkm::UInt8, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec<vtkm::UInt8, 2>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec<vtkm::UInt8, 3>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec<vtkm::UInt8, 4>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::UInt32, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec<vtkm::UInt32, 2>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::UInt64, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec<vtkm::UInt64, 2>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Float32, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec<vtkm::Float32, 2>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Float64, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Vec<vtkm::Float64, 2>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Pair<vtkm::UInt32, vtkm::Float32>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Pair<vtkm::UInt32, vtkm::Float64>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Pair<vtkm::UInt64, vtkm::Float32>, DeviceAdapter>(id);
BenchmarkValueType<vtkm::Pair<vtkm::UInt64, vtkm::Float64>, DeviceAdapter>(id);
return true;
}
};
}
int main(int argc, char* argv[])
{
vtkm::cont::InitLogging(argc, argv);
auto opts = vtkm::cont::InitializeOptions::RequireDevice;
auto config = vtkm::cont::Initialize(argc, argv, opts);
using namespace vtkm::benchmarking;
#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB
#ifdef VTKM_ENABLE_TBB
int numThreads = tbb::task_scheduler_init::automatic;
#endif // TBB
if (argc == 3)
if (config.Arguments.size() == 2)
{
if (std::string(argv[1]) == "NumThreads")
if (std::string(config.Arguments[0]) == "NumThreads")
{
#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB
std::istringstream parse(argv[2]);
#ifdef VTKM_ENABLE_TBB
std::istringstream parse(config.Arguments[1]);
parse >> numThreads;
std::cout << "Selected " << numThreads << " TBB threads." << std::endl;
#else
......@@ -166,35 +200,11 @@ int main(int argc, char* argv[])
}
}
#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB
#ifdef VTKM_ENABLE_TBB
// Must not be destroyed as long as benchmarks are running:
tbb::task_scheduler_init init(numThreads);
#endif // TBB
using Device = VTKM_DEFAULT_DEVICE_ADAPTER_TAG;
auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
tracker.ForceDevice(Device{});
BenchmarkValueType<vtkm::UInt8>();
BenchmarkValueType<vtkm::Vec<vtkm::UInt8, 2>>();
BenchmarkValueType<vtkm::Vec<vtkm::UInt8, 3>>();
BenchmarkValueType<vtkm::Vec<vtkm::UInt8, 4>>();
BenchmarkValueType<vtkm::UInt32>();
BenchmarkValueType<vtkm::Vec<vtkm::UInt32, 2>>();
BenchmarkValueType<vtkm::UInt64>();
BenchmarkValueType<vtkm::Vec<vtkm::UInt64, 2>>();
BenchmarkValueType<vtkm::Float32>();
BenchmarkValueType<vtkm::Vec<vtkm::Float32, 2>>();
BenchmarkValueType<vtkm::Float64>();
BenchmarkValueType<vtkm::Vec<vtkm::Float64, 2>>();
BenchmarkValueType<vtkm::Pair<vtkm::UInt32, vtkm::Float32>>();
BenchmarkValueType<vtkm::Pair<vtkm::UInt32, vtkm::Float64>>();
BenchmarkValueType<vtkm::Pair<vtkm::UInt64, vtkm::Float32>>();
BenchmarkValueType<vtkm::Pair<vtkm::UInt64, vtkm::Float64>>();
BenchmarkValueTypeFunctor functor;
vtkm::cont::TryExecuteOnDevice(config.Device, functor);
}
......@@ -44,9 +44,10 @@
#include <vtkm/internal/Windows.h>
#if VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB
#ifdef VTKM_ENABLE_TBB
#include <tbb/task_scheduler_init.h>
#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP
#endif
#ifdef VTKM_ENABLE_OPENMP
#include <omp.h>
#endif
......@@ -168,25 +169,23 @@ static const std::string DIVIDER(40, '-');