Commit 56cc5c3d authored by Allison Vacanti, committed by Robert Maynard

Add support for BitFields.

BitFields are:
- Stored in memory using a contiguous buffer of bits.
- Accessible via portals, a la ArrayHandle.
- Portals operate on individual bits or words.
- Operations may be atomic for safe use from concurrent kernels.

The new BitFieldToUnorderedSet device algorithm produces an ArrayHandle
containing the indices of all set bits, in no particular order.

The new AtomicInterface classes provide an abstraction over bitwise atomic
operations across the control and execution environments and are used to
implement the BitPortals.
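
A minimal control-side usage sketch of the new interfaces follows. BitField::Allocate, GetPortalControl, and the control portal's SetWord/GetNumberOfWords are assumed here (they are not part of this diff), so read it as an illustration of intent rather than the exact API:

#include <vtkm/cont/Algorithm.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/BitField.h>

void Example()
{
  vtkm::cont::BitField bits;
  bits.Allocate(128); // assumed: allocate 128 bits (4 x 32-bit default words)

  // Assumed control-environment portal; set bit 0 of every word.
  auto portal = bits.GetPortalControl();
  for (vtkm::Id word = 0; word < portal.GetNumberOfWords(); ++word)
  {
    portal.SetWord(word, vtkm::WordTypeDefault{ 0x1 });
  }

  // Gather the indices of all set bits (order is unspecified). The return
  // value is the total number of set bits -- here, 4.
  vtkm::cont::ArrayHandle<vtkm::Id> indices;
  vtkm::Id numSet = vtkm::cont::Algorithm::BitFieldToUnorderedSet(bits, indices);
  (void)numSet;
}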
parent d01b9738
......@@ -26,6 +26,7 @@
#include <vtkm/cont/ArrayHandlePermutation.h>
#include <vtkm/cont/ArrayHandleZip.h>
#include <vtkm/cont/ArrayPortalToIterators.h>
#include <vtkm/cont/BitField.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/ErrorExecution.h>
#include <vtkm/cont/StorageBasic.h>
......@@ -63,20 +64,24 @@ namespace benchmarking
enum BenchmarkName
{
COPY = 1,
COPY_IF = 1 << 1,
LOWER_BOUNDS = 1 << 2,
REDUCE = 1 << 3,
REDUCE_BY_KEY = 1 << 4,
SCAN_INCLUSIVE = 1 << 5,
SCAN_EXCLUSIVE = 1 << 6,
SORT = 1 << 7,
SORT_BY_KEY = 1 << 8,
STABLE_SORT_INDICES = 1 << 9,
STABLE_SORT_INDICES_UNIQUE = 1 << 10,
UNIQUE = 1 << 11,
UPPER_BOUNDS = 1 << 12,
ALL = COPY | COPY_IF | LOWER_BOUNDS | REDUCE | REDUCE_BY_KEY | SCAN_INCLUSIVE | SCAN_EXCLUSIVE |
BITFIELD_TO_UNORDERED_SET = 1 << 0,
COPY = 1 << 1,
COPY_IF = 1 << 2,
LOWER_BOUNDS = 1 << 3,
REDUCE = 1 << 4,
REDUCE_BY_KEY = 1 << 5,
SCAN_INCLUSIVE = 1 << 6,
SCAN_EXCLUSIVE = 1 << 7,
SORT = 1 << 8,
SORT_BY_KEY = 1 << 9,
STABLE_SORT_INDICES = 1 << 10,
STABLE_SORT_INDICES_UNIQUE = 1 << 11,
UNIQUE = 1 << 12,
UPPER_BOUNDS = 1 << 13,
ALL = BITFIELD_TO_UNORDERED_SET | COPY | COPY_IF | LOWER_BOUNDS | REDUCE | REDUCE_BY_KEY |
SCAN_INCLUSIVE |
SCAN_EXCLUSIVE |
SORT |
SORT_BY_KEY |
STABLE_SORT_INDICES |
......@@ -132,6 +137,20 @@ struct BenchDevAlgoConfig
? static_cast<vtkm::Id>(this->ArraySizeBytes / static_cast<vtkm::UInt64>(sizeof(T)))
: static_cast<vtkm::Id>(this->ArraySizeValues);
}
// Compute the number of words in a bit field with the given type.
// If DoByteSizes is true, the specified buffer is rounded down to the nearest
// number of words that fit into the byte limit. Otherwise, ArraySizeValues
// is used to indicate the number of bits.
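// For example, with WordType = vtkm::UInt32 and ArraySizeValues = 256 bits,
// this returns 256 / 32 = 8 words.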
template <typename WordType>
VTKM_CONT vtkm::Id ComputeNumberOfWords()
{
static constexpr vtkm::UInt64 BytesPerWord = static_cast<vtkm::UInt64>(sizeof(WordType));
static constexpr vtkm::UInt64 BitsPerWord = BytesPerWord * 8;
return this->DoByteSizes ? static_cast<vtkm::Id>(this->ArraySizeBytes / BytesPerWord)
: static_cast<vtkm::Id>(this->ArraySizeValues / BitsPerWord);
}
};
// Share a global instance of the config (only way to get it into the benchmark
......@@ -255,7 +274,170 @@ public:
}
};
template <typename WordType, typename BitFieldPortal>
struct GenerateBitFieldFunctor : public vtkm::exec::FunctorBase
{
WordType Exemplar;
vtkm::Id Stride;
vtkm::Id MaxMaskedWord;
BitFieldPortal Portal;
VTKM_EXEC_CONT
GenerateBitFieldFunctor(WordType exemplar,
vtkm::Id stride,
vtkm::Id maxMaskedWord,
const BitFieldPortal& portal)
: Exemplar(exemplar)
, Stride(stride)
, MaxMaskedWord(maxMaskedWord)
, Portal(portal)
{
}
VTKM_EXEC
void operator()(vtkm::Id wordIdx) const
{
if (wordIdx <= this->MaxMaskedWord && (wordIdx % this->Stride) == 0)
{
this->Portal.SetWord(wordIdx, this->Exemplar);
}
else
{
this->Portal.SetWord(wordIdx, static_cast<WordType>(0));
}
}
};
// Create a bit field for testing. The bit array will contain numWords words.
// The exemplar word is used to set bits in the array: every stride-th word
// is initialized to the exemplar, and the words in between are set to 0.
// Words with indices higher than maxMaskedWord are also set to 0.
// Stride and maxMaskedWord may be used to test different types of imbalanced
// loads.
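// For example, exemplar = 0xffffffff, stride = 2, maxMaskedWord = 9, and
// numWords = 20 sets words 0, 2, 4, 6, and 8 to the exemplar and zeroes the
// other 15 words.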
template <typename WordType, typename DeviceAdapterTag>
static VTKM_CONT vtkm::cont::BitField GenerateBitField(WordType exemplar,
vtkm::Id stride,
vtkm::Id maxMaskedWord,
vtkm::Id numWords)
{
using Algo = vtkm::cont::DeviceAdapterAlgorithm<DeviceAdapterTag>;
if (stride == 0)
{
stride = 1;
}
vtkm::cont::BitField bits;
auto portal = bits.PrepareForOutput(numWords, DeviceAdapterTag{});
using Functor = GenerateBitFieldFunctor<WordType, decltype(portal)>;
Algo::Schedule(Functor{ exemplar, stride, maxMaskedWord, portal }, numWords);
Algo::Synchronize();
return bits;
}
private:
template <typename WordType, typename DeviceAdapter>
struct BenchBitFieldToUnorderedSet
{
using IndicesArray = vtkm::cont::ArrayHandle<vtkm::Id>;
vtkm::Id NumWords;
vtkm::Id NumBits;
WordType Exemplar;
vtkm::Id Stride;
vtkm::Float32 FillRatio;
vtkm::Id MaxMaskedIndex;
std::string Name;
vtkm::cont::BitField Bits;
IndicesArray Indices;
// See GenerateBitField for details. fillRatio is used to compute
// maxMaskedWord.
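// For example, fillRatio = 0.5 computes MaxMaskedIndex = NumWords / 2, so
// only the lower half of the field can contain set words.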
VTKM_CONT
BenchBitFieldToUnorderedSet(WordType exemplar,
vtkm::Id stride,
vtkm::Float32 fillRatio,
const std::string& name)
: NumWords(Config.ComputeNumberOfWords<WordType>())
, NumBits(this->NumWords * static_cast<vtkm::Id>(sizeof(WordType) * CHAR_BIT))
, Exemplar(exemplar)
, Stride(stride)
, FillRatio(fillRatio)
, MaxMaskedIndex(this->NumWords / static_cast<vtkm::Id>(1. / this->FillRatio))
, Name(name)
, Bits(GenerateBitField<WordType, DeviceAdapter>(this->Exemplar,
this->Stride,
this->MaxMaskedIndex,
this->NumWords))
{
}
VTKM_CONT
vtkm::Float64 operator()()
{
Timer timer(DeviceAdapter{});
timer.Start();
Algorithm::BitFieldToUnorderedSet(DeviceAdapter{}, this->Bits, this->Indices);
return timer.GetElapsedTime();
}
VTKM_CONT
std::string Description() const
{
const vtkm::Id numFilledWords = this->MaxMaskedIndex / this->Stride;
const vtkm::Id numSetBits = numFilledWords * vtkm::CountSetBits(this->Exemplar);
std::stringstream description;
description << "BitFieldToUnorderedSet" << this->Name << " ( "
<< "NumWords: " << this->NumWords << " "
<< "Exemplar: " << std::hex << this->Exemplar << std::dec << " "
<< "FillRatio: " << this->FillRatio << " "
<< "Stride: " << this->Stride << " "
<< "NumSetBits: " << numSetBits << " )";
return description.str();
}
};
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetNull,
BenchBitFieldToUnorderedSet,
0x00000000,
1,
0.f,
"Null");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetFull,
BenchBitFieldToUnorderedSet,
0xffffffff,
1,
1.f,
"Full");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetHalfWord,
BenchBitFieldToUnorderedSet,
0xffff0000,
1,
1.f,
"HalfWord");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetHalfField,
BenchBitFieldToUnorderedSet,
0xffffffff,
1,
0.5f,
"HalfField");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetAlternateWords,
BenchBitFieldToUnorderedSet,
0xffffffff,
2,
1.f,
"AlternateWords");
VTKM_MAKE_BENCHMARK(BitFieldToUnorderedSetAlternateBits,
BenchBitFieldToUnorderedSet,
0x55555555,
1,
1.f,
"AlternateBits");
template <typename Value, typename DeviceAdapter>
struct BenchCopy
{
......@@ -982,6 +1164,19 @@ public:
template <typename ValueTypes>
static VTKM_CONT void RunInternal(vtkm::cont::DeviceAdapterId id)
{
using BitFieldWordTypes = vtkm::ListTagBase<vtkm::UInt32>;
if (Config.BenchmarkFlags & BITFIELD_TO_UNORDERED_SET)
{
std::cout << DIVIDER << "\nBenchmarking BitFieldToUnorderedSet\n";
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetNull, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetFull, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetHalfWord, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetHalfField, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetAlternateWords, BitFieldWordTypes{}, id);
VTKM_RUN_BENCHMARK(BitFieldToUnorderedSetAlternateBits, BitFieldWordTypes{}, id);
}
if (Config.BenchmarkFlags & COPY)
{
std::cout << DIVIDER << "\nBenchmarking Copy\n";
......@@ -1434,7 +1629,11 @@ int main(int argc, char* argv[])
std::transform(arg.begin(), arg.end(), arg.begin(), [](char c) {
return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
});
if (arg == "copy")
if (arg == "bitfieldtounorderedset")
{
config.BenchmarkFlags |= vtkm::benchmarking::BITFIELD_TO_UNORDERED_SET;
}
else if (arg == "copy")
{
config.BenchmarkFlags |= vtkm::benchmarking::COPY;
}
......
# Add support for BitFields.
BitFields are:
- Stored in memory using a contiguous buffer of bits.
- Accessible via portals, a la ArrayHandle.
- Portals operate on individual bits or words.
- Operations may be atomic for safe use from concurrent kernels.
The new BitFieldToUnorderedSet device algorithm produces an
ArrayHandle containing the indices of all set bits, in no particular
order.
The new AtomicInterface classes provide an abstraction over bitwise
atomic operations across the control and execution environments and are
used to implement the BitPortals.
......@@ -159,6 +159,10 @@ using UInt32 = unsigned int;
/// than smaller widths.
using IdComponent = vtkm::Int32;
/// The default word size used for atomic bitwise operations. Supported on
/// all devices.
using WordTypeDefault = vtkm::UInt32;
//In this order so that we exactly match the logic that exists in VTK
#if VTKM_SIZE_LONG_LONG == 8
using Int64 = long long;
......
......@@ -60,6 +60,20 @@ auto PrepareArgForExec(T&& object)
vtkm::cont::internal::IsExecutionObjectBase<T>{});
}
struct BitFieldToUnorderedSetFunctor
{
vtkm::Id Result{ 0 };
template <typename Device, typename... Args>
VTKM_CONT bool operator()(Device, Args&&... args)
{
VTKM_IS_DEVICE_ADAPTER_TAG(Device);
this->Result = vtkm::cont::DeviceAdapterAlgorithm<Device>::BitFieldToUnorderedSet(
PrepareArgForExec<Device>(std::forward<Args>(args))...);
return true;
}
};
struct CopyFunctor
{
template <typename Device, typename... Args>
......@@ -374,6 +388,27 @@ struct UpperBoundsFunctor
struct Algorithm
{
template <typename IndicesStorage>
VTKM_CONT static vtkm::Id BitFieldToUnorderedSet(
vtkm::cont::DeviceAdapterId devId,
const vtkm::cont::BitField& bits,
vtkm::cont::ArrayHandle<Id, IndicesStorage>& indices)
{
detail::BitFieldToUnorderedSetFunctor functor;
vtkm::cont::TryExecuteOnDevice(devId, functor, bits, indices);
return functor.Result;
}
template <typename IndicesStorage>
VTKM_CONT static vtkm::Id BitFieldToUnorderedSet(
const vtkm::cont::BitField& bits,
vtkm::cont::ArrayHandle<Id, IndicesStorage>& indices)
{
detail::BitFieldToUnorderedSetFunctor functor;
vtkm::cont::TryExecute(functor, bits, indices);
return functor.Result;
}
template <typename T, typename U, class CIn, class COut>
VTKM_CONT static bool Copy(vtkm::cont::DeviceAdapterId devId,
const vtkm::cont::ArrayHandle<T, CIn>& input,
......
This diff is collapsed.
......@@ -49,6 +49,7 @@ set(headers
ArrayRangeCompute.h
AssignerMultiBlock.h
AtomicArray.h
BitField.h
BoundsCompute.h
BoundsGlobalCompute.h
CastAndCall.h
......
......@@ -52,6 +52,15 @@ template <class DeviceAdapterTag>
struct DeviceAdapterAlgorithm
#ifdef VTKM_DOXYGEN_ONLY
{
/// \brief Create a unique, unsorted list of indices denoting which bits are
/// set in a bitfield.
///
/// Returns the total number of set bits.
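///
/// For example, if only bits 0 and 40 are set, \c indices will hold { 0, 40 }
/// in an unspecified order and the return value will be 2.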
template <typename IndicesStorage>
VTKM_CONT static vtkm::Id BitFieldToUnorderedSet(
const vtkm::cont::BitField& bits,
vtkm::cont::ArrayHandle<Id, IndicesStorage>& indices);
/// \brief Copy the contents of one ArrayHandle to another
///
/// Copies the contents of \c input to \c output. The array \c output will be
......@@ -660,9 +669,18 @@ public:
/// The class provides the actual implementation used by
/// vtkm::cont::DeviceAdapterAtomicArrayImplementation.
///
/// TODO combine this with AtomicInterfaceExecution.
template <typename T, typename DeviceTag>
class DeviceAdapterAtomicArrayImplementation;
/// \brief Class providing device-specific support for atomic operations.
///
/// AtomicInterfaceExecution provides atomic operations for the execution
/// environment of the given device. Its control-environment counterpart,
/// AtomicInterfaceControl, may be subclassed to implement this interface
/// when appropriate for a CPU-based device.
template <typename DeviceTag>
class AtomicInterfaceExecution;
/// \brief Class providing device-specific support for selecting the optimal
/// Task type for a given worklet.
///
......
//============================================================================
// Copyright (c) Kitware, Inc.
// All rights reserved.
// See LICENSE.txt for details.
// This software is distributed WITHOUT ANY WARRANTY; without even
// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the above copyright notice for more information.
//
// Copyright 2019 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
// Copyright 2019 UT-Battelle, LLC.
// Copyright 2019 Los Alamos National Security.
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
// Laboratory (LANL), the U.S. Government retains certain rights in
// this software.
//============================================================================
#ifndef vtk_m_cont_cuda_internal_AtomicInterfaceExecutionCuda_h
#define vtk_m_cont_cuda_internal_AtomicInterfaceExecutionCuda_h
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
#include <vtkm/cont/internal/AtomicInterfaceExecution.h>
#include <vtkm/ListTag.h>
#include <vtkm/Types.h>
namespace vtkm
{
namespace cont
{
namespace internal
{
template <>
class AtomicInterfaceExecution<DeviceAdapterTagCuda>
{
public:
// Note: There are 64-bit atomics available, but not on all devices. Stick
// with 32-bit only until we require compute capability 3.5+
using WordTypes = vtkm::ListTagBase<vtkm::UInt32>;
using WordTypePreferred = vtkm::UInt32;
#define VTKM_ATOMIC_OPS_FOR_TYPE(type) \
VTKM_SUPPRESS_EXEC_WARNINGS __device__ static type Load(const type* addr) \
{ \
const volatile type* vaddr = addr; /* volatile to bypass cache*/ \
const type value = *vaddr; \
/* fence to ensure that dependent reads are correctly ordered */ \
__threadfence(); \
return value; \
} \
VTKM_SUPPRESS_EXEC_WARNINGS __device__ static void Store(type* addr, type value) \
{ \
volatile type* vaddr = addr; /* volatile to bypass cache */ \
/* fence to ensure that previous non-atomic stores are visible to other threads */ \
__threadfence(); \
*vaddr = value; \
} \
VTKM_SUPPRESS_EXEC_WARNINGS __device__ static type Not(type* addr) \
{ \
return AtomicInterfaceExecution::Xor(addr, static_cast<type>(~type{ 0u })); \
} \
VTKM_SUPPRESS_EXEC_WARNINGS __device__ static type And(type* addr, type mask) \
{ \
__threadfence(); \
auto result = atomicAnd(addr, mask); \
__threadfence(); \
return result; \
} \
VTKM_SUPPRESS_EXEC_WARNINGS __device__ static type Or(type* addr, type mask) \
{ \
__threadfence(); \
auto result = atomicOr(addr, mask); \
__threadfence(); \
return result; \
} \
VTKM_SUPPRESS_EXEC_WARNINGS __device__ static type Xor(type* addr, type mask) \
{ \
__threadfence(); \
auto result = atomicXor(addr, mask); \
__threadfence(); \
return result; \
} \
VTKM_SUPPRESS_EXEC_WARNINGS __device__ static type CompareAndSwap( \
type* addr, type newWord, type expected) \
{ \
__threadfence(); \
auto result = atomicCAS(addr, expected, newWord); \
__threadfence(); \
return result; \
}
VTKM_ATOMIC_OPS_FOR_TYPE(vtkm::UInt32)
#undef VTKM_ATOMIC_OPS_FOR_TYPE
};
}
}
} // end namespace vtkm::cont::internal
#endif // vtk_m_cont_cuda_internal_AtomicInterfaceExecutionCuda_h
......@@ -20,6 +20,7 @@
set(headers
ArrayManagerExecutionCuda.h
AtomicInterfaceExecutionCuda.h
CudaAllocator.h
DeviceAdapterAlgorithmCuda.h
DeviceAdapterAtomicArrayImplementationCuda.h
......
......@@ -26,6 +26,7 @@
#include <vtkm/UnaryPredicates.h>
#include <vtkm/cont/ArrayHandle.h>
#include <vtkm/cont/BitField.h>
#include <vtkm/cont/DeviceAdapterAlgorithm.h>
#include <vtkm/cont/ErrorExecution.h>
#include <vtkm/cont/Logging.h>
......@@ -35,6 +36,7 @@
#include <vtkm/cont/cuda/ErrorCuda.h>
#include <vtkm/cont/cuda/internal/ArrayManagerExecutionCuda.h>
#include <vtkm/cont/cuda/internal/AtomicInterfaceExecutionCuda.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterAtomicArrayImplementationCuda.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterRuntimeDetectorCuda.h>
#include <vtkm/cont/cuda/internal/DeviceAdapterTagCuda.h>
......@@ -54,8 +56,7 @@
// Disable warnings we check vtkm for but Thrust does not.
VTKM_THIRDPARTY_PRE_INCLUDE
//This is required to be first so that we get patches for thrust included
//in the correct order
#include <cooperative_groups.h>
#include <cuda.h>
#include <thrust/advance.h>
#include <thrust/binary_search.h>
......@@ -71,6 +72,9 @@ VTKM_THIRDPARTY_PRE_INCLUDE
#include <vtkm/exec/cuda/internal/ThrustPatches.h>
VTKM_THIRDPARTY_POST_INCLUDE
#include <limits>
#include <memory>
namespace vtkm
{
namespace cont
......@@ -145,6 +149,22 @@ struct CastPortal
VTKM_EXEC
ValueType Get(vtkm::Id index) const { return static_cast<OutValueType>(this->Portal.Get(index)); }
};
struct CudaFreeFunctor
{
void operator()(void* ptr) const { VTKM_CUDA_CALL(cudaFree(ptr)); }
};
template <typename T>
using CudaUniquePtr = std::unique_ptr<T, CudaFreeFunctor>;
template <typename T>
CudaUniquePtr<T> make_CudaUniquePtr(std::size_t numElements)
{
T* ptr;
VTKM_CUDA_CALL(cudaMalloc(&ptr, sizeof(T) * numElements));
return CudaUniquePtr<T>(ptr);
}
}
} // end namespace cuda::internal
......@@ -159,6 +179,132 @@ struct DeviceAdapterAlgorithm<vtkm::cont::DeviceAdapterTagCuda>
#ifndef VTKM_CUDA
private:
#endif
template <typename BitsPortal, typename IndicesPortal, typename GlobalPopCountType>
struct BitFieldToUnorderedSetFunctor : public vtkm::exec::FunctorBase
{
VTKM_STATIC_ASSERT_MSG(VTKM_PASS_COMMAS(std::is_same<GlobalPopCountType, vtkm::Int32>::value ||
std::is_same<GlobalPopCountType, vtkm::UInt32>::value ||
std::is_same<GlobalPopCountType, vtkm::UInt64>::value),
"Unsupported GlobalPopCountType. Must support CUDA atomicAdd.");
using Word = typename BitsPortal::WordTypePreferred;
VTKM_STATIC_ASSERT(
VTKM_PASS_COMMAS(std::is_same<typename IndicesPortal::ValueType, vtkm::Id>::value));
VTKM_CONT
BitFieldToUnorderedSetFunctor(const BitsPortal& input,
const IndicesPortal& output,
GlobalPopCountType* globalPopCount)
: Input{ input }
, Output{ output }
, GlobalPopCount{ globalPopCount }
, FinalWordIndex{ input.GetNumberOfWords() - 1 }
, FinalWordMask(input.GetFinalWordMask())
{
}
~BitFieldToUnorderedSetFunctor() {}
VTKM_CONT void Initialize()
{
assert(this->GlobalPopCount != nullptr);
VTKM_CUDA_CALL(cudaMemset(this->GlobalPopCount, 0, sizeof(GlobalPopCountType)));
}
VTKM_SUPPRESS_EXEC_WARNINGS
__device__ void operator()(vtkm::Id wordIdx) const
{
Word word = this->Input.GetWord(wordIdx);
// The last word may be partial -- mask out trailing bits if needed.
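// For example, a 70-bit field stored in 32-bit words has a final-word mask of
// 0x3f, which keeps only the 6 valid bits of the third word.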
const Word mask = wordIdx == this->FinalWordIndex ? this->FinalWordMask : ~Word{ 0 };
word &= mask;
if (word != 0)
{
this->LocalPopCount = vtkm::CountSetBits(word);
this->ReduceAllocate();
vtkm::Id firstBitIdx = wordIdx * sizeof(Word) * CHAR_BIT;
do
{
// Find next bit. FindFirstSetBit's result is indexed starting at 1.
vtkm::Int32 bit = vtkm::FindFirstSetBit(word) - 1;
vtkm::Id outIdx = this->GetNextOutputIndex();
// Write index of bit
this->Output.Set(outIdx, firstBitIdx + bit);
word ^= (Word{ 1 } << bit); // clear bit
} while (word != 0); // have bits
}
}
VTKM_CONT vtkm::Id Finalize() const
{
assert(this->GlobalPopCount != nullptr);
GlobalPopCountType result;
VTKM_CUDA_CALL(cudaMemcpy(
&result, this->GlobalPopCount, sizeof(GlobalPopCountType), cudaMemcpyDeviceToHost));
return static_cast<vtkm::Id>(result);
}
private:
// Every thread with a non-zero local popcount calls this function, which
// computes the total popcount for the coalesced threads and allocates
// a contiguous block in the output by atomically increasing the global
// popcount.
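// For example, if lanes 0, 5, and 9 are active with LocalPopCounts 3, 1, and
// 4, the shuffle reduction yields 8; lane 0 advances GlobalPopCount by 8 and
// the previous value is broadcast to all three lanes as AllocationHead.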
VTKM_SUPPRESS_EXEC_WARNINGS
__device__ void ReduceAllocate() const
{
const auto activeLanes = cooperative_groups::coalesced_threads();
const int activeRank = activeLanes.thread_rank();
const int activeSize = activeLanes.size();
// Reduction value:
vtkm::Int32 rVal = this->LocalPopCount;
for (int delta = 1; delta < activeSize; delta *= 2)
{
rVal += activeLanes.shfl_down(rVal, delta);
}
if (activeRank == 0)
{
this->AllocationHead =
atomicAdd(this->GlobalPopCount, static_cast<GlobalPopCountType>(rVal));
}
this->AllocationHead = activeLanes.shfl(this->AllocationHead, 0);
}
// The global output allocation is written to by striding the writes across
// the warp lanes, allowing the writes to global memory to be coalesced.
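// For example, three active lanes with AllocationHead = 100 write to output
// indices 100, 101, and 102 on this call, and AllocationHead advances to 103
// for each lane's next call.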
VTKM_SUPPRESS_EXEC_WARNINGS
__device__ vtkm::Id GetNextOutputIndex() const
{
// Only lanes with unwritten output indices left will call this method,
// so just check the coalesced threads:
const auto activeLanes = cooperative_groups::coalesced_threads();
const int activeRank = activeLanes.thread_rank();
const int activeSize = activeLanes.size();
vtkm::Id nextIdx = static_cast<vtkm::Id>(this->AllocationHead + activeRank);
this->AllocationHead += activeSize;
return nextIdx;
}
const BitsPortal Input;
const IndicesPortal Output;
GlobalPopCountType* GlobalPopCount;
mutable vtkm::UInt64 AllocationHead{ 0 };
mutable vtkm::Int32 LocalPopCount{ 0 };
// Used to mask trailing bits in the last word.
vtkm::Id FinalWordIndex{ 0 };
Word FinalWordMask{ 0 };
};
template <class InputPortal, class OutputPortal>
VTKM_CONT static void CopyPortal(const InputPortal& input, const OutputPortal& output)
{
......@@ -742,9 +888,43 @@ private:
}
}
template <typename GlobalPopCountType, typename BitsPortal, typename IndicesPortal>
VTKM_CONT static vtkm::Id BitFieldToUnorderedSetPortal(const BitsPortal& bits,
const IndicesPortal& indices)
{
using Functor = BitFieldToUnorderedSetFunctor<BitsPortal, IndicesPortal, GlobalPopCountType>;