Commit cad967ee authored by Ben Boeckel's avatar Ben Boeckel

Merge branch 'upstream-diy2' into update-diy-20180918

* upstream-diy2:
  diy2 2018-09-18 (f402e419)
parents ad9f7b29 5789e33e
project(diy2)
if (NOT VTK_INSTALL_NO_DEVELOPMENT)
install(
DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include/vtkdiy"
DESTINATION "${VTK_INSTALL_INCLUDE_DIR}"
COMPONENT Development)
endif ()
install(DIRECTORY include
DESTINATION "${VTK_INSTALL_INCLUDE_DIR}/vtkdiy2"
COMPONENT Development)
endif()
## DIY2 is a block-parallel library
## DIY is a block-parallel library
DIY2 is a block-parallel library for implementing scalable algorithms that can execute both
DIY is a block-parallel library for implementing scalable algorithms that can execute both
in-core and out-of-core. The same program can be executed with one or more threads per MPI
process, seamlessly combining distributed-memory message passing with shared-memory thread
parallelism. The abstraction enabling these capabilities is block parallelism; blocks
and their message queues are mapped onto processing elements (MPI processes or threads) and are
migrated between memory and storage by the DIY2 runtime. Complex communication patterns,
migrated between memory and storage by the DIY runtime. Complex communication patterns,
including neighbor exchange, merge reduction, swap reduction, and all-to-all exchange, are
possible in- and out-of-core in DIY2.
possible in- and out-of-core in DIY.
## Licensing
DIY2 is released as open source software under a BSD style [license](./LICENSE.txt).
DIY is released as open source software under a BSD-style [license](./LICENSE.txt).
## Dependencies
DIY2 requires an MPI installation. We recommend [MPICH](http://www.mpich.org/).
DIY requires an MPI installation. We recommend [MPICH](http://www.mpich.org/).
## Download, build, install
......@@ -24,25 +24,25 @@ DIY2 requires an MPI installation. We recommend [MPICH](http://www.mpich.org/).
- You can download the [latest tarball](https://github.com/diatomic/diy2/archive/master.tar.gz).
DIY2 is a header-only library. It does not need to be built; you can simply
DIY is a header-only library. It does not need to be built; you can simply
include it in your project. The examples can be built using `cmake` from the
top level directory.
top-level directory.
## Documentation
[Doxygen pages](https://diatomic.github.io/diy2)
[Doxygen pages](https://diatomic.github.io/diy)
## Example
A simple DIY2 program, shown below, consists of the following components:
A simple DIY program, shown below, consists of the following components:
- `struct`s called blocks,
- a diy object called the `master`,
- a set of callback functions performed on each block by `master.foreach()`,
- optionally one or more message exchanges between the blocks by `master.exchange()`, and
- optionally, one or more message exchanges between the blocks by `master.exchange()`, and
- there may be other collectives and global reductions not shown below.
The callback functions (`enqueue_block()` and `average()` in the example below) are given the block
The callback functions (`enqueue_local()` and `average()` in the example below) receive the block
pointer and a communication proxy for the message exchange between blocks. It is usual for the
callback functions to enqueue or dequeue messages from the proxy, so that information can be
received and sent during rounds of message exchange.
......@@ -54,31 +54,24 @@ received and sent during rounds of message exchange.
Master master(world); // world = MPI_Comm
... // populate master with blocks
master.foreach<Block>(&enqueue_local); // call enqueue_local() for each block
master.foreach(&enqueue_local); // call enqueue_local() for each block
master.exchange(); // exchange enqueued data between blocks
master.foreach<Block>(&average); // call average() for each block
master.foreach(&average); // call average() for each block
// --- callback functions --- //
// enqueue block data prior to exchanging it
void enqueue_local(Block* b, // one block
const Proxy& cp, // communication proxy
// i.e., the neighbor blocks with which
// this block communicates
void* aux) // user-defined additional arguments
void enqueue_local(Block* b, // current block
const Proxy& cp) // communication proxy provides access to the neighbor blocks
{
for (size_t i = 0; i < cp.link()->size(); i++) // for all neighbor blocks
cp.enqueue(cp.link()->target(i), b->local); // enqueue the data to be sent
// to this neighbor block in the next
// exchange
cp.enqueue(cp.link()->target(i), b->local); // enqueue the data to be sent to this neighbor
// block in the next exchange
}
// use the received data after exchanging it, in this case compute its average
void average(Block* b, // one block
const Proxy& cp, // communication proxy
// i.e., the neighbor blocks with which
// this block communicates
void* aux) // user-defined additional arguments
void average(Block* b, // current block
const Proxy& cp) // communication proxy provides access to the neighbor blocks
{
float x, average = 0;
for (size_t i = 0; i < cp.link()->size(); i++) // for all neighbor blocks
......
......@@ -13,6 +13,8 @@
#include "detail/algorithms/kdtree.hpp"
#include "detail/algorithms/kdtree-sampling.hpp"
#include "log.hpp"
namespace diy
{
/**
......@@ -80,10 +82,7 @@ namespace diy
bool wrap = false)//!< periodic boundaries in all dimensions
{
if (assigner.nblocks() & (assigner.nblocks() - 1))
{
fprintf(stderr, "KD-tree requires a number of blocks that's a power of 2, got %d\n", assigner.nblocks());
std::abort();
}
throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
typedef diy::RegularContinuousLink RCLink;
......@@ -100,14 +99,14 @@ namespace diy
diy::Direction dir, wrap_dir;
// left
dir.x[j] = -1; wrap_dir.x[j] = -1;
dir[j] = -1; wrap_dir[j] = -1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
// right
dir.x[j] = 1; wrap_dir.x[j] = 1;
dir[j] = 1; wrap_dir[j] = 1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
......@@ -143,10 +142,7 @@ namespace diy
bool wrap = false)//!< periodic boundaries in all dimensions
{
if (assigner.nblocks() & (assigner.nblocks() - 1))
{
fprintf(stderr, "KD-tree requires a number of blocks that's a power of 2, got %d\n", assigner.nblocks());
std::abort();
}
throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
typedef diy::RegularContinuousLink RCLink;
......@@ -163,14 +159,14 @@ namespace diy
diy::Direction dir, wrap_dir;
// left
dir.x[j] = -1; wrap_dir.x[j] = -1;
dir[j] = -1; wrap_dir[j] = -1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
// right
dir.x[j] = 1; wrap_dir.x[j] = 1;
dir[j] = 1; wrap_dir[j] = 1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
......
......@@ -2,6 +2,9 @@
#define DIY_ASSIGNER_HPP
#include <vector>
#include <tuple>
#include "mpi.hpp" // needed for DynamicAssigner
namespace diy
{
......@@ -15,43 +18,55 @@ namespace diy
* \ingroup Assignment
* \brief Manages how blocks are assigned to processes
*/
Assigner(int size, //!< total number of processes
int nblocks //!< total (global) number of blocks
Assigner(int size__, //!< total number of processes
int nblocks__ //!< total (global) number of blocks
):
size_(size), nblocks_(nblocks) {}
size_(size__), nblocks_(nblocks__) {}
//! returns the total number of process ranks
int size() const { return size_; }
//! returns the total number of global blocks
int nblocks() const { return nblocks_; }
//! sets the total number of global blocks
void set_nblocks(int nblocks) { nblocks_ = nblocks; }
//! gets the local gids for a given process rank
virtual void local_gids(int rank, std::vector<int>& gids) const =0;
virtual void set_nblocks(int nblocks__) { nblocks_ = nblocks__; }
//! returns the process rank of the block with global id gid (need not be local)
virtual int rank(int gid) const =0;
//! batch lookup; returns the process ranks of the blocks with global id in the vector gids
inline
virtual std::vector<int>
ranks(const std::vector<int>& gids) const;
private:
int size_; // total number of ranks
int nblocks_; // total number of blocks
};
class ContiguousAssigner: public Assigner
class StaticAssigner: public Assigner
{
public:
/**
* \ingroup Assignment
* \brief Intermediate type to express assignment that cannot change; adds `local_gids` query method
*/
using Assigner::Assigner;
//! gets the local gids for a given process rank
virtual void local_gids(int rank, std::vector<int>& gids) const =0;
};
class ContiguousAssigner: public StaticAssigner
{
public:
/**
* \ingroup Assignment
* \brief Assigns blocks to processes in contiguous gid (block global id) order
*/
ContiguousAssigner(int size, //!< total number of processes
int nblocks //!< total (global) number of blocks
):
Assigner(size, nblocks) {}
using StaticAssigner::StaticAssigner;
using Assigner::size;
using Assigner::nblocks;
using StaticAssigner::size;
using StaticAssigner::nblocks;
int rank(int gid) const
int rank(int gid) const override
{
int div = nblocks() / size();
int mod = nblocks() % size();
......@@ -65,47 +80,87 @@ namespace diy
}
}
inline
void local_gids(int rank, std::vector<int>& gids) const;
void local_gids(int rank, std::vector<int>& gids) const override;
};
class RoundRobinAssigner: public Assigner
class RoundRobinAssigner: public StaticAssigner
{
public:
/**
* \ingroup Assignment
* \brief Assigns blocks to processes in cyclic or round-robin gid (block global id) order
*/
RoundRobinAssigner(int size, //!< total number of processes
int nblocks //!< total (global) number of blocks
):
Assigner(size, nblocks) {}
using StaticAssigner::StaticAssigner;
using StaticAssigner::size;
using StaticAssigner::nblocks;
int rank(int gid) const override { return gid % size(); }
inline
void local_gids(int rank, std::vector<int>& gids) const override;
};
using Assigner::size;
using Assigner::nblocks;
class DynamicAssigner: public Assigner
{
public:
DynamicAssigner(const mpi::communicator& comm, int size__, int nblocks__):
Assigner(size__, nblocks__),
comm_(comm),
div_(nblocks__ / size__ + ((nblocks__ % size__) == 0 ? 0 : 1)), // NB: same size window everywhere means the last rank may allocate extra space
rank_map_(comm_, div_) { rank_map_.lock_all(MPI_MODE_NOCHECK); }
~DynamicAssigner() { rank_map_.unlock_all(); }
int rank(int gid) const { return gid % size(); }
inline
void local_gids(int rank, std::vector<int>& gids) const;
virtual void set_nblocks(int nblocks__) override;
inline
virtual int rank(int gid) const override;
inline
virtual std::vector<int>
ranks(const std::vector<int>& gids) const override;
inline std::tuple<bool,int>
get_rank(int& rk, int gid) const;
inline void set_rank(const int& rk, int gid, bool flush = true);
inline void set_ranks(const std::vector<std::tuple<int,int>>& rank_gids);
std::tuple<int,int>
rank_offset(int gid) const { return std::make_tuple(gid / div_, gid % div_); }
private:
mpi::communicator comm_;
int div_;
mutable mpi::window<int> rank_map_;
};
}
// Batch lookup: map each block gid in `gids` to its process rank using the
// virtual rank() query. Results are returned in the same order as the input.
std::vector<int>
diy::Assigner::
ranks(const std::vector<int>& gids) const
{
    std::vector<int> result;
    result.reserve(gids.size());
    for (int gid : gids)
        result.push_back(rank(gid));
    return result;
}
void
diy::ContiguousAssigner::
local_gids(int rank, std::vector<int>& gids) const
local_gids(int rank_, std::vector<int>& gids) const
{
int div = nblocks() / size();
int mod = nblocks() % size();
int from, to;
if (rank < mod)
from = rank * (div + 1);
if (rank_ < mod)
from = rank_ * (div + 1);
else
from = mod * (div + 1) + (rank - mod) * div;
from = mod * (div + 1) + (rank_ - mod) * div;
if (rank + 1 < mod)
to = (rank + 1) * (div + 1);
if (rank_ + 1 < mod)
to = (rank_ + 1) * (div + 1);
else
to = mod * (div + 1) + (rank + 1 - mod) * div;
to = mod * (div + 1) + (rank_ + 1 - mod) * div;
for (int gid = from; gid < to; ++gid)
gids.push_back(gid);
......@@ -113,9 +168,9 @@ local_gids(int rank, std::vector<int>& gids) const
void
diy::RoundRobinAssigner::
local_gids(int rank, std::vector<int>& gids) const
local_gids(int rank_, std::vector<int>& gids) const
{
int cur = rank;
int cur = rank_;
while (cur < nblocks())
{
gids.push_back(cur);
......@@ -123,4 +178,87 @@ local_gids(int rank, std::vector<int>& gids) const
}
}
// Change the global number of blocks and rebuild the RMA window backing the
// gid -> rank map, since the per-rank window size (div_) depends on nblocks.
// NOTE(review): mpi::window construction is presumably collective over comm_,
// so every rank must call set_nblocks with the same value -- confirm.
void
diy::DynamicAssigner::
set_nblocks(int nblocks__)
{
Assigner::set_nblocks(nblocks__);
// ceil(nblocks / size): every rank allocates the same window size, so the
// last rank may hold unused slots (see the matching note in the constructor).
div_ = nblocks() / size() + ((nblocks() % size()) == 0 ? 0 : 1);
// The passive-target access epoch must end before the old window is
// destroyed by the assignment below; then re-lock the fresh window.
rank_map_.unlock_all();
rank_map_ = mpi::window<int>(comm_, div_);
rank_map_.lock_all(MPI_MODE_NOCHECK);
}
// Issue a one-sided read of the rank assigned to block `gid` into `rk`.
// Returns (cached, target_rank): `cached` says whether the value came from a
// local cache (currently always false -- see TODO), and `target_rank` is the
// process whose window was read. The caller must flush that rank (or flush
// all) before `rk` is guaranteed to hold the fetched value.
std::tuple<bool,int>
diy::DynamicAssigner::
get_rank(int& rk, int gid) const
{
// TODO: check if the gid is in cache
int r,offset;
// rank_offset() maps gid to (owning process, index within its window).
std::tie(r,offset) = rank_offset(gid);
rank_map_.get(rk, r, offset);
return std::make_tuple(false, r); // false indicates that the data wasn't read from cache
}
// Returns the process rank of the block with global id `gid` (need not be
// local); performs a remote read on the distributed rank map.
int
diy::DynamicAssigner::
rank(int gid) const
{
int rk;
auto cached_gidrk = get_rank(rk, gid);
// get<1> is the rank targeted by the one-sided read issued in get_rank().
int gidrk = std::get<1>(cached_gidrk);
// Complete the pending get so that rk is valid before returning it.
rank_map_.flush_local(gidrk);
return rk;
}
// Batch lookup: returns the process ranks of all blocks in `gids`.
// Issues one one-sided read per gid and completes them with a single flush,
// which is cheaper than flushing after every individual get.
std::vector<int>
diy::DynamicAssigner::
ranks(const std::vector<int>& gids) const
{
bool all_cached = true;
std::vector<int> result(gids.size(), -1);
for (size_t i = 0; i < gids.size(); ++i)
{
auto cached_gidrk = get_rank(result[i], gids[i]);
bool cached = std::get<0>(cached_gidrk);
all_cached &= cached;
}
// get_rank() currently never reports a cache hit, so this flush always runs
// (unless gids is empty); it completes all outstanding gets at once.
if (!all_cached)
rank_map_.flush_local_all();
return result;
}
// Record that block `gid` is assigned to process `rk` by writing into the
// distributed rank map. When `flush` is false the put may remain pending;
// callers batching several updates flush once at the end (see set_ranks()).
void
diy::DynamicAssigner::
set_rank(const int& rk, int gid, bool flush)
{
// TODO: update cache
int r,offset;
// rank_offset() maps gid to (owning process, index within its window).
std::tie(r,offset) = rank_offset(gid);
rank_map_.put(rk, r, offset);
if (flush)
rank_map_.flush(r);
}
void
diy::DynamicAssigner::
set_ranks(const std::vector<std::tuple<int,int>>& rank_gids)
{
for (auto& rg : rank_gids)
set_rank(std::get<0>(rg), std::get<1>(rg), false);
rank_map_.flush_all();
}
#endif
......@@ -23,26 +23,26 @@ namespace diy
typedef detail::Load Load;
public:
Collection(Create create,
Destroy destroy,
ExternalStorage* storage,
Save save,
Load load):
create_(create),
destroy_(destroy),
storage_(storage),
save_(save),
load_(load),
Collection(Create create__,
Destroy destroy__,
ExternalStorage* storage__,
Save save__,
Load load__):
create_(create__),
destroy_(destroy__),
storage_(storage__),
save_(save__),
load_(load__),
in_memory_(0) {}
size_t size() const { return elements_.size(); }
const CInt& in_memory() const { return in_memory_; }
inline void clear();
int add(Element e) { elements_.push_back(e); external_.push_back(-1); ++(*in_memory_.access()); return elements_.size() - 1; }
void* release(int i) { void* e = get(i); elements_[i] = 0; return e; }
int add(Element e) { elements_.push_back(e); external_.push_back(-1); ++(*in_memory_.access()); return static_cast<int>(elements_.size()) - 1; }
void* release(int i) { void* e = get(i); elements_[static_cast<size_t>(i)] = 0; return e; }
void* find(int i) const { return elements_[i]; } // possibly returns 0, if the element is unloaded
void* find(int i) const { return elements_[static_cast<size_t>(i)]; } // possibly returns 0, if the element is unloaded
void* get(int i) { if (!find(i)) load(i); return find(i); } // loads the element first, and then returns its address
int available() const { int i = 0; for (; i < (int)size(); ++i) if (find(i) != 0) break; return i; }
......@@ -56,7 +56,7 @@ namespace diy
Save saver() const { return save_; }
void* create() const { return create_(); }
void destroy(int i) { if (find(i)) { destroy_(find(i)); elements_[i] = 0; } else if (external_[i] != -1) storage_->destroy(external_[i]); }
void destroy(int i) { if (find(i)) { destroy_(find(i)); elements_[static_cast<size_t>(i)] = 0; } else if (external_[static_cast<size_t>(i)] != -1) storage_->destroy(external_[static_cast<size_t>(i)]); }
bool own() const { return destroy_ != 0; }
......@@ -81,7 +81,7 @@ clear()
{
if (own())
for (size_t i = 0; i < size(); ++i)
destroy(i);
destroy(static_cast<int>(i));
elements_.clear();
external_.clear();
*in_memory_.access() = 0;
......@@ -95,10 +95,10 @@ unload(int i)
void* e = find(i);
//save_(e, bb);
//external_[i] = storage_->put(bb);
external_[i] = storage_->put(e, save_);
external_[static_cast<size_t>(i)] = storage_->put(e, save_);
destroy_(e);
elements_[i] = 0;
elements_[static_cast<size_t>(i)] = 0;
--(*in_memory_.access());
}
......@@ -111,9 +111,9 @@ load(int i)
//storage_->get(external_[i], bb);
void* e = create_();
//load_(e, bb);
storage_->get(external_[i], e, load_);
elements_[i] = e;
external_[i] = -1;
storage_->get(external_[static_cast<size_t>(i)], e, load_);
elements_[static_cast<size_t>(i)] = e;
external_[static_cast<size_t>(i)] = -1;
++(*in_memory_.access());
}
......
......@@ -2,16 +2,11 @@
#define DIY_CONSTANTS_H
// Default DIY_MAX_DIM to 4, unless provided by the user
// (used for static array size in various Bounds (bb_t in types.h))
// (used for static min/max size in various Bounds)
#ifndef DIY_MAX_DIM
#define DIY_MAX_DIM 4
#endif
struct dir_t
{
int x[DIY_MAX_DIM];
};
enum
{
DIY_X0 = 0x01, /* minimum-side x (left) neighbor */
......@@ -24,4 +19,6 @@ enum
DIY_T1 = 0x80 /* maximum-side t (later) neighbor */
};
#define DIY_UNUSED(expr) do { (void)(expr); } while (0)
#endif
......@@ -3,20 +3,14 @@
namespace diy
{
// TODO: when not running under C++11, i.e., when lock_guard is TinyThread's
// lock_guard, and not C++11's unique_lock, this implementation might
// be buggy since the copy constructor is invoked when
// critical_resource::access() returns an instance of this class. Once
// the temporary is destroyed the mutex is unlocked. I'm not 100%
// certain of this because I'd expect a deadlock on copy constructor,
// but it's clearly not happening -- so I may be missing something.
// (This issue will take care of itself in DIY3 once we switch to C++11 completely.)
template<class T, class Mutex>
class resource_accessor
{
public:
resource_accessor(T& x, Mutex& m):
x_(x), lock_(m) {}
resource_accessor(resource_accessor&&) = default;
resource_accessor(const resource_accessor&) = delete;
T& operator*() { return x_; }
T* operator->() { return &x_; }
......
......@@ -4,6 +4,7 @@
#include <vector>
#include <cassert>
#include "../../partners/all-reduce.hpp"
#include "../../log.hpp"
// TODO: technically, what's done now is not a perfect subsample:
// we take the same number of samples from every block, in reality this number should be selected at random,
......@@ -32,7 +33,7 @@ struct KDTreeSamplingPartition
size_t samples):
dim_(dim), points_(points), samples_(samples) {}
void operator()(void* b_, const diy::ReduceProxy& srp, const KDTreePartners& partners) const;
void operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const;
int divide_gid(int gid, bool lower, int round, int rounds) const;
void update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const;
......@@ -64,10 +65,8 @@ struct KDTreeSamplingPartition
template<class Block, class Point>
void
diy::detail::KDTreeSamplingPartition<Block,Point>::
operator()(void* b_, const diy::ReduceProxy& srp, const KDTreePartners& partners) const
operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners) const
{
Block* b = static_cast<Block*>(b_);
int dim;
if (srp.round() < partners.rounds())
dim = partners.dim(srp.round());
......@@ -137,6 +136,7 @@ void
diy::detail::KDTreeSamplingPartition<Block,Point>::
update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
{
auto log = get_logger();
int gid = srp.gid();
int lid = srp.master()->lid(gid);
RCLink* link = static_cast<RCLink*>(srp.master()->link(lid));
......@@ -163,7 +163,7 @@ update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int roun
dir[j] = -dir[j];
int k = link_map[std::make_pair(in_gid, dir)];
//printf("%d %d %f -> %d\n", in_gid, dir, split, k);
log->trace("{} {} {} -> {}", in_gid, dir, split, k);
splits[k] = split;
}
}
......@@ -299,8 +299,8 @@ add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
Samples smpls;
srp.dequeue(nbr_gid, smpls);
for (size_t i = 0; i < smpls.size(); ++i)
samples.push_back(smpls[i]);
for (size_t j = 0; j < smpls.size(); ++j)
samples.push_back(smpls[j]);
}
}
......@@ -381,11 +381,8 @@ dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const
for (size_t j = 0; j < in_points.size(); ++j)
{
if (in_points[j][dim] < link->core().min[dim] || in_points[j][dim] > link->core().max[dim])
{
fprintf(stderr, "Warning: dequeued %f outside [%f,%f] (%d)\n",
in_points[j][dim], link->core().min[dim], link->core().max[dim], dim);
std::abort();
}
throw std::runtime_error(fmt::format("Dequeued {} outside [{},{}] ({})",
in_points[j][dim], link->core().min[dim], link->core().max[dim], dim));
(b->*points_).push_back(in_points[j]);
}
}
......