Commit a5dbe1ec authored by Robert Maynard's avatar Robert Maynard Committed by Kitware Robot

Merge topic 'bitfields'

661fb64d AtomicInterfaceControl functions are marked with VTKM_SUPPRESS_EXEC_WARNINGS
0c70f9b9 Add BitFieldIn/Out/InOut worklet signature tags.
a66510e8 Add ArrayHandleBitField, a boolean-valued AH backed by a BitField.
56cc5c3d Add support for BitFields.
d01b9738 Allow VTKM_SUPPRESS_EXEC_WARNINGS to be used inside macros.
2f2ca937 Add bit operations FindFirstSetBit and CountSetBits to Math.h.
Acked-by: Kitware Robot's avatarKitware Robot <kwrobot@kitware.com>
Merge-request: !1629
parents 0e847f2c 661fb64d
......@@ -26,6 +26,7 @@
#include <vtkm/cont/ArrayHandlePermutation.h>
#include <vtkm/cont/ArrayHandleZip.h>
#include <vtkm/cont/ArrayPortalToIterators.h>
#include <vtkm/cont/BitField.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/ErrorExecution.h>
#include <vtkm/cont/StorageBasic.h>
......@@ -63,20 +64,24 @@ namespace benchmarking
enum BenchmarkName
{
COPY = 1,
COPY_IF = 1 << 1,
LOWER_BOUNDS = 1 << 2,
REDUCE = 1 << 3,
REDUCE_BY_KEY = 1 << 4,
SCAN_INCLUSIVE = 1 << 5,
SCAN_EXCLUSIVE = 1 << 6,
SORT = 1 << 7,
SORT_BY_KEY = 1 << 8,
STABLE_SORT_INDICES = 1 << 9,
STABLE_SORT_INDICES_UNIQUE = 1 << 10,
UNIQUE = 1 << 11,
UPPER_BOUNDS = 1 << 12,
ALL = COPY | COPY_IF | LOWER_BOUNDS | REDUCE | REDUCE_BY_KEY | SCAN_INCLUSIVE | SCAN_EXCLUSIVE |
BITFIELD_TO_UNORDERED_SET = 1 << 0,
COPY = 1 << 1,
COPY_IF = 1 << 2,
LOWER_BOUNDS = 1 << 3,
REDUCE = 1 << 4,
REDUCE_BY_KEY = 1 << 5,
SCAN_INCLUSIVE = 1 << 6,
SCAN_EXCLUSIVE = 1 << 7,
SORT = 1 << 8,
SORT_BY_KEY = 1 << 9,
STABLE_SORT_INDICES = 1 << 10,
STABLE_SORT_INDICES_UNIQUE = 1 << 11,
UNIQUE = 1 << 12,
UPPER_BOUNDS = 1 << 13,
ALL = BITFIELD_TO_UNORDERED_SET | COPY | COPY_IF | LOWER_BOUNDS | REDUCE | REDUCE_BY_KEY |
SCAN_INCLUSIVE |
SCAN_EXCLUSIVE |
SORT |
SORT_BY_KEY |
STABLE_SORT_INDICES |
......@@ -132,6 +137,20 @@ struct BenchDevAlgoConfig
? static_cast<vtkm::Id>(this->ArraySizeBytes / static_cast<vtkm::UInt64>(sizeof(T)))
: static_cast<vtkm::Id>(this->ArraySizeValues);
}
// Compute the number of words in a bit field with the given type.
// If DoByteSizes is true, the specified buffer is rounded down to the nearest
// number of words that fit into the byte limit. Otherwise, ArraySizeValues
// is used to indicate the number of bits.
template <typename WordType>
VTKM_CONT vtkm::Id ComputeNumberOfWords()
{
static constexpr vtkm::UInt64 BytesPerWord = static_cast<vtkm::UInt64>(sizeof(WordType));
static constexpr vtkm::UInt64 BitsPerWord = BytesPerWord * 8;
return this->DoByteSizes ? static_cast<vtkm::Id>(this->ArraySizeBytes / BytesPerWord)
: static_cast<vtkm::Id>(this->ArraySizeValues / BitsPerWord);
}
};
// Share a global instance of the config (only way to get it into the benchmark
......@@ -255,7 +274,170 @@ public:
}
};
template <typename WordType, typename BitFieldPortal>
struct GenerateBitFieldFunctor : public vtkm::exec::FunctorBase
{
WordType Exemplar;
vtkm::Id Stride;
vtkm::Id MaxMaskedWord;
BitFieldPortal Portal;
VTKM_EXEC_CONT
GenerateBitFieldFunctor(WordType exemplar,
vtkm::Id stride,
vtkm::Id maxMaskedWord,
const BitFieldPortal& portal)
: Exemplar(exemplar)
, Stride(stride)
, MaxMaskedWord(maxMaskedWord)
, Portal(portal)
{
}
VTKM_EXEC
void operator()(vtkm::Id wordIdx) const
{
if (wordIdx <= this->MaxMaskedWord && (wordIdx % this->Stride) == 0)
{
this->Portal.SetWord(wordIdx, this->Exemplar);
}
else
{
this->Portal.SetWord(wordIdx, static_cast<WordType>(0));
}
}
};
// Create a bit field for testing. The bit array will contain numWords words.
// The exemplar word is used to set bits in the array. Stride indicates how
// many words will be set to 0 between words initialized to the exemplar.
// Words with indices higher than maxMaskedWord will be set to 0.
// Stride and maxMaskedWord may be used to test different types of imbalanced
// loads.
template <typename WordType, typename DeviceAdapterTag>
static VTKM_CONT vtkm::cont::BitField GenerateBitField(WordType exemplar,
vtkm::Id stride,
vtkm::Id maxMaskedWord,
vtkm::Id numWords)
{
using Algo = vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapterTag>;
if (stride == 0)
{
stride = 1;
}
vtkm::cont::BitField bits;
auto portal = bits.PrepareForOutput(numWords, DeviceAdapterTag{});
using Functor = GenerateBitFieldFunctor<WordType, decltype(portal)>;
Algo::Schedule(Functor{ exemplar, stride, maxMaskedWord, portal }, numWords);
Algo::Synchronize();
return bits;
}
private:
template <typename WordType, typename DeviceAdapter>
struct BenchBitFieldToUnorderedSet
{
using IndicesArray = vtkm::cont::ArrayHandle<vtkm::Id>;
vtkm::Id NumWords;
vtkm::Id NumBits;
WordType Exemplar;
vtkm::Id Stride;
vtkm::Float32 FillRatio;
vtkm::Id MaxMaskedIndex;
std::string Name;
vtkm::cont::BitField Bits;
IndicesArray Indices;
// See GenerateBitField for details. fillRatio is used to compute
// maxMaskedWord.
VTKM_CONT
BenchBitFieldToUnorderedSet(WordType exemplar,
vtkm::Id stride,
vtkm::Float32 fillRatio,
const std::string& name)
: NumWords(Config.ComputeNumberOfWords<WordType>())
, NumBits(this->NumWords * static_cast<vtkm::Id>(sizeof(WordType) * CHAR_BIT))
, Exemplar(exemplar)
, Stride(stride)
, FillRatio(fillRatio)
, MaxMaskedIndex(this->NumWords / static_cast<vtkm::Id>(1. / this->FillRatio))
, Name(name)
, Bits(GenerateBitField<WordType, DeviceAdapter>(this->Exemplar,
this->Stride,
this->MaxMaskedIndex,
this->NumWords))
{
}
VTKM_CONT
vtkm::Float64 operator()()
{
Timer timer(DeviceAdapter{});
timer.Start();
Algorithm::BitFieldToUnorderedSet(DeviceAdapter{}, this->Bits, this->Indices);
return timer.GetElapsedTime();
}
VTKM_CONT
std::string Description() const
{
const vtkm::Id numFilledWords = this->MaxMaskedIndex / this->Stride;
const vtkm::Id numSetBits = numFilledWords * vtkm::CountSetBits(this->Exemplar);
std::stringstream description;
description << "BitFieldToUnorderedSet" << this->Name << " ( "
<< "NumWords: " << this->NumWords << " "
<< "Exemplar: " << std::hex << this->Exemplar << std::dec << " "
<< "FillRatio: " << this->FillRatio << " "
<< "Stride: " << this->Stride << " "
<< "NumSetBits: " << numSetBits << " )";
return description.str();
}
};
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetNull,
BenchBitFieldToUnorderedSet,
0x00000000,
1,
0.f,
"Null");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetFull,
BenchBitFieldToUnorderedSet,
0xffffffff,
1,
1.f,
"Full");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetHalfWord,
BenchBitFieldToUnorderedSet,
0xffff0000,
1,
1.f,
"HalfWord");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetHalfField,
BenchBitFieldToUnorderedSet,
0xffffffff,
1,
0.5f,
"HalfField");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetAlternateWords,
BenchBitFieldToUnorderedSet,
0xffffffff,
2,
1.f,
"AlternateWords");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetAlternateBits,
BenchBitFieldToUnorderedSet,
0x55555555,
1,
1.f,
"AlternateBits");
template <typename Value, typename DeviceAdapter>
struct BenchCopy
{
......@@ -982,6 +1164,19 @@ public:
template <typename ValueTypes>
static VTKM_CONT void RunInternal(vtkm::cont::DeviceAdapterId id)
{
using BitFieldWordTypes = vtkm::ListTagBase<vtkm::UInt32>;
if (Config.BenchmarkFlags & BITFIELD_TO_UNORDERED_SET)
{
std::cout << DIVIDER << "\nBenchmarking BitFieldToUnorderedSet\n";
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetNull, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetFull, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetHalfWord, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetHalfField, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetAlternateWords, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetAlternateBits, BitFieldWordTypes{}, id);
}
if (Config.BenchmarkFlags & COPY)
{
std::cout << DIVIDER << "\nBenchmarking Copy\n";
......@@ -1434,7 +1629,11 @@ int main(int argc, char* argv[])
std::transform(arg.begin(), arg.end(), arg.begin(), [](char c) {
return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
});
if (arg == "copy")
if (arg == "bitfieldtounorderedset")
{
config.BenchmarkFlags |= vtkm::benchmarking::BITFIELD_TO_UNORDERED_SET;
}
else if (arg == "copy")
{
config.BenchmarkFlags |= vtkm::benchmarking::COPY;
}
......
# Add support for BitFields.
BitFields are:
- Stored in memory using a contiguous buffer of bits.
- Accessible via portals, a la ArrayHandle.
- Portals operate on individual bits or words.
- Operations may be atomic for safe use from concurrent kernels.
The new BitFieldToUnorderedSet device algorithm produces an
ArrayHandle containing the indices of all set bits, in no particular
order.
The new AtomicInterface classes provide an abstraction into bitwise
atomic operations across control and execution environments and are
used to implement the BitPortals.
BitFields may be used as boolean-typed ArrayHandles using the
ArrayHandleBitField adapter. ArrayHandleBitField uses atomic operations to read
and write bits in the BitField, and is safe to use in concurrent code.
For example, a simple worklet that merges two arrays based on a boolean
condition is tested in TestingBitField:
```
class ConditionalMergeWorklet : public vtkm::worklet::WorkletMapField
{
public:
using ControlSignature = void(FieldIn cond,
FieldIn trueVals,
FieldIn falseVals,
FieldOut result);
using ExecutionSignature = _4(_1, _2, _3);
template <typename T>
VTKM_EXEC T operator()(bool cond, const T& trueVal, const T& falseVal) const
{
return cond ? trueVal : falseVal;
}
};
BitField bits = ...;
auto condArray = vtkm::cont::make_ArrayHandleBitField(bits);
auto trueArray = vtkm::cont::make_ArrayHandleCounting<vtkm::Id>(20, 2, NUM_BITS);
auto falseArray = vtkm::cont::make_ArrayHandleCounting<vtkm::Id>(13, 2, NUM_BITS);
vtkm::cont::ArrayHandle<vtkm::Id> output;
vtkm::worklet::DispatcherMapField<ConditionalMergeWorklet> dispatcher;
dispatcher.Invoke(condArray, trueArray, falseArray, output);
```
......@@ -41,9 +41,13 @@
#include <algorithm>
#endif
#if defined(VTKM_MSVC) && !defined(VTKM_CUDA)
#ifdef VTKM_MSVC
#include <intrin.h> // For bitwise intrinsics (__popcnt, etc)
#include <vtkm/internal/Windows.h> // for types used by MSVC intrinsics.
#ifndef VTKM_CUDA
#include <math.h>
#endif
#endif // VTKM_CUDA
#endif // VTKM_MSVC
#define VTKM_CUDA_MATH_FUNCTION_32(func) func##f
#define VTKM_CUDA_MATH_FUNCTION_64(func) func
......@@ -2592,6 +2596,191 @@ inline VTKM_EXEC_CONT vtkm::Float64 Ldexp(vtkm::Float64 x, vtkm::Int32 exponent)
#endif
}
/// Bitwise operations
///
/// Find the first set bit in @a word, and return its position (1-32). If no
/// bits are set, returns 0.
#ifdef VTKM_CUDA_DEVICE_PASS
// Need to explicitly mark this as __device__ since __ffs is device only.
inline __device__
vtkm::Int32 FindFirstSetBit(vtkm::UInt32 word)
{
// Output is [0,32], with ffs(0) == 0
return __ffs(static_cast<int>(word));
}
#else // CUDA_DEVICE_PASS
inline VTKM_EXEC_CONT
vtkm::Int32 FindFirstSetBit(vtkm::UInt32 word)
{
# if defined(VTKM_GCC) || defined(VTKM_CLANG)
// Output is [0,32], with ffs(0) == 0
return __builtin_ffs(static_cast<int>(word));
# elif defined(VTKM_MSVC)
// Output is [0, 31], check return code to see if bits are set:
vtkm::UInt32 firstSet;
return _BitScanForward(reinterpret_cast<DWORD*>(&firstSet), word) != 0
? static_cast<vtkm::Int32>(firstSet + 1) : 0;
# elif defined(VTKM_ICC)
// Output is [0, 31], undefined if word is 0.
return word != 0 ? _bit_scan_forward(word) + 1 : 0;
# else
// Naive implementation:
if (word == 0)
{
return 0;
}
vtkm::Int32 bit = 1;
while ((word & 0x1) == 0)
{
word >>= 1;
++bit;
}
return bit;
# endif
}
#endif // CUDA_DEVICE_PASS
/// Find the first set bit in @a word, and return its position (1-64). If no
/// bits are set, returns 0.
#ifdef VTKM_CUDA_DEVICE_PASS
// Need to explicitly mark this as __device__ since __ffsll is device only.
inline __device__
vtkm::Int32 FindFirstSetBit(vtkm::UInt64 word)
{
// Output is [0,64], with ffs(0) == 0
return __ffsll(static_cast<long long int>(word));
}
#else // CUDA_DEVICE_PASS
inline VTKM_EXEC_CONT
vtkm::Int32 FindFirstSetBit(vtkm::UInt64 word)
{
# if defined(VTKM_GCC) || defined(VTKM_CLANG)
// Output is [0,64], with ffs(0) == 0
return __builtin_ffsll(static_cast<long long int>(word));
# elif defined(VTKM_MSVC) || defined(VTKM_ICC)
// Output is [0, 63], check return code to see if bits are set:
vtkm::UInt32 firstSet;
return _BitScanForward64(reinterpret_cast<DWORD*>(&firstSet), word) != 0
? static_cast<vtkm::Int32>(firstSet + 1) : 0;
# else
// Naive implementation:
if (word == 0)
{
return 0;
}
vtkm::Int32 bit = 1;
while ((word & 0x1) == 0)
{
word >>= 1;
++bit;
}
return bit;
# endif
}
#endif // CUDA_DEVICE_PASS
/// Count the total number of bits set in @a word.
#ifdef VTKM_CUDA_DEVICE_PASS
// Need to explicitly mark this as __device__ since __popc is device only.
inline __device__
vtkm::Int32 CountSetBits(vtkm::UInt32 word)
{
return __popc(word);
}
#else // CUDA_DEVICE_PASS
inline VTKM_EXEC_CONT
vtkm::Int32 CountSetBits(vtkm::UInt32 word)
{
# if defined(VTKM_GCC) || defined(VTKM_CLANG)
return __builtin_popcount(word);
# elif defined(VTKM_MSVC)
return static_cast<vtkm::Int32>(__popcnt(word));
# elif defined(VTKM_ICC)
return _popcnt32(static_cast<int>(word));
# else
// Naive implementation:
vtkm::Int32 bits = 0;
while (word)
{
if (word & 0x1)
{
++bits;
}
word >>= 1;
}
return bits;
# endif
}
#endif // CUDA_DEVICE_PASS
/// Count the total number of bits set in @a word.
#ifdef VTKM_CUDA_DEVICE_PASS
// Need to explicitly mark this as __device__ since __popcll is device only.
inline __device__
vtkm::Int32 CountSetBits(vtkm::UInt64 word)
{
return __popcll(word);
}
#else // CUDA_DEVICE_PASS
inline VTKM_EXEC_CONT
vtkm::Int32 CountSetBits(vtkm::UInt64 word)
{
# if defined(VTKM_GCC) || defined(VTKM_CLANG)
return __builtin_popcountll(word);
# elif defined(VTKM_MSVC)
return static_cast<vtkm::Int32>(__popcnt64(word));
# elif defined(VTKM_ICC)
return _popcnt64(static_cast<vtkm::Int64>(word));
# else
// Naive implementation:
vtkm::Int32 bits = 0;
while (word)
{
if (word & 0x1)
{
++bits;
}
word >>= 1;
}
return bits;
# endif
}
#endif // CUDA_DEVICE_PASS
} // namespace vtkm
// clang-format on
......
......@@ -53,9 +53,14 @@ $# Ignore the following comment. It is meant for the generated file.
#include <algorithm>
#endif
#if defined(VTKM_MSVC) && !defined(VTKM_CUDA)
#ifdef VTKM_MSVC
#include <intrin.h> // For bitwise intrinsics (__popcnt, etc)
#include <vtkm/internal/Windows.h> // for types used by MSVC intrinsics.
#ifndef VTKM_CUDA
#include <math.h>
#endif
#endif // VTKM_CUDA
#endif // VTKM_MSVC
#define VTKM_CUDA_MATH_FUNCTION_32(func) func##f
#define VTKM_CUDA_MATH_FUNCTION_64(func) func
......@@ -1194,6 +1199,191 @@ inline VTKM_EXEC_CONT vtkm::Float64 Ldexp(vtkm::Float64 x, vtkm::Int32 exponent)
#endif
}
/// Bitwise operations
///
/// Find the first set bit in @a word, and return its position (1-32). If no
/// bits are set, returns 0.
#ifdef VTKM_CUDA_DEVICE_PASS
// Need to explicitly mark this as __device__ since __ffs is device only.
inline __device__
vtkm::Int32 FindFirstSetBit(vtkm::UInt32 word)
{
// Output is [0,32], with ffs(0) == 0
return __ffs(static_cast<int>(word));
}
#else // CUDA_DEVICE_PASS
inline VTKM_EXEC_CONT
vtkm::Int32 FindFirstSetBit(vtkm::UInt32 word)
{
# if defined(VTKM_GCC) || defined(VTKM_CLANG)
// Output is [0,32], with ffs(0) == 0
return __builtin_ffs(static_cast<int>(word));
# elif defined(VTKM_MSVC)
// Output is [0, 31], check return code to see if bits are set:
vtkm::UInt32 firstSet;
return _BitScanForward(reinterpret_cast<DWORD*>(&firstSet), word) != 0
? static_cast<vtkm::Int32>(firstSet + 1) : 0;
# elif defined(VTKM_ICC)
// Output is [0, 31], undefined if word is 0.
return word != 0 ? _bit_scan_forward(word) + 1 : 0;
# else
// Naive implementation:
if (word == 0)
{
return 0;
}
vtkm::Int32 bit = 1;
while ((word & 0x1) == 0)
{
word >>= 1;
++bit;
}
return bit;
# endif
}
#endif // CUDA_DEVICE_PASS
/// Find the first set bit in @a word, and return its position (1-64). If no
/// bits are set, returns 0.
#ifdef VTKM_CUDA_DEVICE_PASS
// Need to explicitly mark this as __device__ since __ffsll is device only.
inline __device__
vtkm::Int32 FindFirstSetBit(vtkm::UInt64 word)
{
// Output is [0,64], with ffs(0) == 0
return __ffsll(static_cast<long long int>(word));
}
#else // CUDA_DEVICE_PASS
inline VTKM_EXEC_CONT
vtkm::Int32 FindFirstSetBit(vtkm::UInt64 word)
{
# if defined(VTKM_GCC) || defined(VTKM_CLANG)
// Output is [0,64], with ffs(0) == 0
return __builtin_ffsll(static_cast<long long int>(word));
# elif defined(VTKM_MSVC) || defined(VTKM_ICC)
// Output is [0, 63], check return code to see if bits are set:
vtkm::UInt32 firstSet;
return _BitScanForward64(reinterpret_cast<DWORD*>(&firstSet), word) != 0
? static_cast<vtkm::Int32>(firstSet + 1) : 0;
# else
// Naive implementation:
if (word == 0)
{
return 0;
}
vtkm::Int32 bit = 1;
while ((word & 0x1) == 0)
{
word >>= 1;
++bit;
}
return bit;
# endif
}
#endif // CUDA_DEVICE_PASS
/// Count the total number of bits set in @a word.
#ifdef VTKM_CUDA_DEVICE_PASS
// Need to explicitly mark this as __device__ since __popc is device only.
inline __device__
vtkm::Int32 CountSetBits(vtkm::UInt32 word)
{
return __popc(word);
}
#else // CUDA_DEVICE_PASS
inline VTKM_EXEC_CONT
vtkm::Int32 CountSetBits(vtkm::UInt32 word)
{
# if defined(VTKM_GCC) || defined(VTKM_CLANG)
return __builtin_popcount(word);
# elif defined(VTKM_MSVC)
return static_cast<vtkm::Int32>(__popcnt(word));
# elif defined(VTKM_ICC)
return _popcnt32(static_cast<int>(word));
# else
// Naive implementation:
vtkm::Int32 bits = 0;
while (word)
{
if (word & 0x1)
{
++bits;
}
word >>= 1;
}
return bits;
# endif
}
#endif // CUDA_DEVICE_PASS
/// Count the total number of bits set in @a word.
#ifdef VTKM_CUDA_DEVICE_PASS
// Need to explicitly mark this as __device__ since __popcll is device only.
inline __device__
vtkm::Int32 CountSetBits(vtkm::UInt64 word)
{
return __popcll(word);
}
#else // CUDA_DEVICE_PASS
inline VTKM_EXEC_CONT
vtkm::Int32 CountSetBits(vtkm::UInt64 word)
{
# if defined(VTKM_GCC) || defined(VTKM_CLANG)
return __builtin_popcountll(word);
# elif defined(VTKM_MSVC)
return static_cast<vtkm::Int32>(__popcnt64(word));
# elif defined(VTKM_ICC)
return _popcnt64(static_cast<vtkm::Int64>(word));
# else
// Naive implementation:
vtkm::Int32 bits = 0;
while (word)
{
if (word & 0x1)
{
++bits;
}
word >>= 1;
}
return bits;