Commit f1962396 authored by Dmitriy Morozov's avatar Dmitriy Morozov

Update DIY

parent e8c83483
......@@ -137,7 +137,7 @@ void analysis_final(size_t k_max, size_t nblocks)
diy::mpi::io::file out(master->communicator(), "buffer.npy", diy::mpi::io::file::wronly | diy::mpi::io::file::create);
diy::io::NumPy writer(out);
writer.write_header<float>(buffer_shape);
master->foreach<Autocorrelation>([&writer](Autocorrelation* b, const diy::Master::ProxyWithLink, void*)
master->foreach([&writer](Autocorrelation* b, const diy::Master::ProxyWithLink)
{
diy::DiscreteBounds bounds;
for (size_t i = 0; i < 3; ++i)
......@@ -152,7 +152,7 @@ void analysis_final(size_t k_max, size_t nblocks)
#endif
// add up the autocorrellations
master->foreach<Autocorrelation>([](Autocorrelation* b, const diy::Master::ProxyWithLink& cp, void*)
master->foreach([](Autocorrelation* b, const diy::Master::ProxyWithLink& cp)
{
std::vector<float> sums(b->window, 0);
grid::for_each(b->corr.shape(), [&](const Vertex4D& v)
......@@ -174,14 +174,16 @@ void analysis_final(size_t k_max, size_t nblocks)
std::cout << ' ' << result[i];
std::cout << std::endl;
}
master->foreach<Autocorrelation>([](Autocorrelation*, const diy::Master::ProxyWithLink& cp, void*)
master->foreach([](Autocorrelation*, const diy::Master::ProxyWithLink& cp)
{
cp.collectives()->clear();
});
// select k strongest autocorrelations for each shift
diy::ContiguousAssigner assigner(master->communicator().size(), nblocks); // NB: this is coupled to main(...) in oscillator.cpp
diy::RegularMergePartners partners(3, nblocks, 2, true);
//diy::RegularMergePartners partners(3, nblocks, 2, true);
diy::RegularDecomposer<diy::DiscreteBounds> decomposer(1, diy::interval(0, nblocks-1), nblocks);
diy::RegularMergePartners partners(decomposer, 2);
diy::reduce(*master, assigner, partners,
[k_max](void* b_, const diy::ReduceProxy& rp, const diy::RegularMergePartners&)
{
......
......@@ -51,7 +51,7 @@ struct Block
velocity_scale(velocity_scale_),
bounds(bounds_),
domain(domain_),
grid(Vertex(bounds.max) - Vertex(bounds.min) + Vertex::one()),
grid(Vertex(&bounds.max[0]) - Vertex(&bounds.min[0]) + Vertex::one()),
oscillators(oscillators_)
{
}
......@@ -62,7 +62,7 @@ struct Block
{
auto& gv = grid(v);
gv = 0;
auto v_global = v + bounds.min;
auto v_global = v + Vertex(&bounds.min[0]);
for (auto& o : oscillators)
gv += o.evaluate(v_global, t);
......@@ -362,14 +362,14 @@ int main(int argc, char** argv)
{
timer::MarkStartTimeStep(t_count, t);
timer::MarkStartEvent("oscillators::advance");
master.foreach<Block>([=](Block* b, const Proxy&, void*)
master.foreach([=](Block* b, const Proxy&)
{
b->advance(t);
});
timer::MarkEndEvent("oscillators::advance");
timer::MarkStartEvent("oscillators::move_particles");
master.foreach<Block>([=](Block* b, const Proxy& p, void*)
master.foreach([=](Block* b, const Proxy& p)
{
b->move_particles(dt, p);
});
......@@ -380,14 +380,14 @@ int main(int argc, char** argv)
timer::MarkEndEvent("oscillators::master.exchange");
timer::MarkStartEvent("oscillators::handle_incoming_particles");
master.foreach<Block>([=](Block* b, const Proxy& p, void*)
master.foreach([=](Block* b, const Proxy& p)
{
b->handle_incoming_particles(p);
});
timer::MarkEndEvent("oscillators::handle_incoming_particles");
timer::MarkStartEvent("oscillators::analysis");
master.foreach<Block>([=](Block* b, const Proxy&, void*)
master.foreach([=](Block* b, const Proxy&)
{
b->analyze_block();
});
......@@ -405,7 +405,7 @@ int main(int argc, char** argv)
std::string outfn = fmt::format("{}-{}.bin", out_prefix, t);
diy::mpi::io::file out(world, outfn, diy::mpi::io::file::wronly | diy::mpi::io::file::create);
diy::io::BOV writer(out, shape);
master.foreach<Block>([&writer](Block* b, const diy::Master::ProxyWithLink& cp, void*)
master.foreach([&writer](Block* b, const diy::Master::ProxyWithLink& cp)
{
auto link = static_cast<Link*>(cp.link());
writer.write(link->bounds(), b->grid.data(), true);
......
......@@ -351,7 +351,7 @@ void Autocorrelation::PrintResults(size_t k_max)
size_t nblocks = internals.NumberOfBlocks;
// add up the autocorrellations
internals.Master->foreach<AutocorrelationImpl>([](AutocorrelationImpl* b, const diy::Master::ProxyWithLink& cp, void*)
internals.Master->foreach([](AutocorrelationImpl* b, const diy::Master::ProxyWithLink& cp)
{
std::vector<float> sums(b->window, 0);
grid::for_each(b->corr.shape(), [&](const Vertex4D& v)
......@@ -373,15 +373,17 @@ void Autocorrelation::PrintResults(size_t k_max)
std::cout << std::endl;
}
internals.Master->foreach<AutocorrelationImpl>(
[](AutocorrelationImpl*, const diy::Master::ProxyWithLink& cp, void*)
internals.Master->foreach(
[](AutocorrelationImpl*, const diy::Master::ProxyWithLink& cp)
{
cp.collectives()->clear();
});
// select k strongest autocorrelations for each shift
diy::ContiguousAssigner assigner(internals.Master->communicator().size(), nblocks); // NB: this is coupled to main(...) in oscillator.cpp
diy::RegularMergePartners partners(3, nblocks, 2, true);
diy::RegularDecomposer<diy::DiscreteBounds> decomposer(1, diy::interval(0, nblocks-1), nblocks);
diy::RegularMergePartners partners(decomposer, 2);
//diy::RegularMergePartners partners(3, nblocks, 2, true);
diy::reduce(*internals.Master, assigner, partners,
[k_max](void* b_, const diy::ReduceProxy& rp, const diy::RegularMergePartners&)
{
......
......@@ -11,6 +11,9 @@
#include "detail/algorithms/sort.hpp"
#include "detail/algorithms/kdtree.hpp"
#include "detail/algorithms/kdtree-sampling.hpp"
#include "log.hpp"
namespace diy
{
......@@ -36,13 +39,13 @@ namespace diy
detail::SampleSort<Block,T,Cmp> sorter(values, samples, cmp, num_samples);
// swap-reduce to all-gather samples
RegularSwapPartners partners(1, assigner.nblocks(), k);
RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()), assigner.nblocks());
RegularSwapPartners partners(decomposer, k);
reduce(master, assigner, partners, sorter.sample(), detail::SkipIntermediate(partners.rounds()));
// swap-reduce to all-to-all exchange the values
RegularSwapPartners exchange_partners(1, assigner.nblocks(), k, false);
// all_to_all to exchange the values
if (!samples_only)
reduce(master, assigner, exchange_partners, sorter.exchange());
all_to_all(master, assigner, sorter.exchange(), k);
master.set_immediate(immediate);
}
......@@ -67,7 +70,7 @@ namespace diy
/**
* \ingroup Algorithms
* \brief build a kd-tree and sort a set of points into it
* \brief build a kd-tree and sort a set of points into it (use histograms to determine split values)
*/
template<class Block, class Point>
void kdtree(Master& master, //!< master object
......@@ -79,18 +82,37 @@ namespace diy
bool wrap = false)//!< periodic boundaries in all dimensions
{
if (assigner.nblocks() & (assigner.nblocks() - 1))
{
fprintf(stderr, "KD-tree requires a number of blocks that's a power of 2, got %d\n", assigner.nblocks());
std::abort();
}
throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
typedef diy::RegularContinuousLink RCLink;
for (int i = 0; i < master.size(); ++i)
for (size_t i = 0; i < master.size(); ++i)
{
RCLink* link = static_cast<RCLink*>(master.link(i));
link->core() = domain;
link->bounds() = domain;
*link = RCLink(dim, domain, domain);
if (wrap) // set up the links to self
{
diy::BlockID self = { master.gid(i), master.communicator().rank() };
for (int j = 0; j < dim; ++j)
{
diy::Direction dir, wrap_dir;
// left
dir[j] = -1; wrap_dir[j] = -1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
// right
dir[j] = 1; wrap_dir[j] = 1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
}
}
}
detail::KDTreePartition<Block,Point> kdtree_partition(dim, points, bins);
......@@ -100,11 +122,70 @@ namespace diy
// update master.expected to match the links
int expected = 0;
for (int i = 0; i < master.size(); ++i)
for (size_t i = 0; i < master.size(); ++i)
expected += master.link(i)->size_unique();
master.set_expected(expected);
}
/**
* \ingroup Algorithms
* \brief build a kd-tree and sort a set of points into it (use sampling to determine split values)
*/
template<class Block, class Point>
void kdtree_sampling
(Master& master, //!< master object
const Assigner& assigner, //!< assigner object
int dim, //!< dimensionality
const ContinuousBounds& domain, //!< global data extents
std::vector<Point> Block::* points, //!< input points to sort into kd-tree
size_t samples, //!< number of samples to take in each block
bool wrap = false)//!< periodic boundaries in all dimensions
{
if (assigner.nblocks() & (assigner.nblocks() - 1))
throw std::runtime_error(fmt::format("KD-tree requires a number of blocks that's a power of 2, got {}", assigner.nblocks()));
typedef diy::RegularContinuousLink RCLink;
for (size_t i = 0; i < master.size(); ++i)
{
RCLink* link = static_cast<RCLink*>(master.link(i));
*link = RCLink(dim, domain, domain);
if (wrap) // set up the links to self
{
diy::BlockID self = { master.gid(i), master.communicator().rank() };
for (int j = 0; j < dim; ++j)
{
diy::Direction dir, wrap_dir;
// left
dir[j] = -1; wrap_dir[j] = -1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
// right
dir[j] = 1; wrap_dir[j] = 1;
link->add_neighbor(self);
link->add_bounds(domain);
link->add_direction(dir);
link->add_wrap(wrap_dir);
}
}
}
detail::KDTreeSamplingPartition<Block,Point> kdtree_partition(dim, points, samples);
detail::KDTreePartners partners(dim, assigner.nblocks(), wrap, domain);
reduce(master, assigner, partners, kdtree_partition);
// update master.expected to match the links
int expected = 0;
for (size_t i = 0; i < master.size(); ++i)
expected += master.link(i)->size_unique();
master.set_expected(expected);
}
}
#endif
......@@ -2,6 +2,9 @@
#define DIY_ASSIGNER_HPP
#include <vector>
#include <tuple>
#include "mpi.hpp" // needed for DynamicAssigner
namespace diy
{
......@@ -15,43 +18,55 @@ namespace diy
* \ingroup Assignment
* \brief Manages how blocks are assigned to processes
*/
Assigner(int size, //!< total number of processes
int nblocks //!< total (global) number of blocks
Assigner(int size__, //!< total number of processes
int nblocks__ //!< total (global) number of blocks
):
size_(size), nblocks_(nblocks) {}
size_(size__), nblocks_(nblocks__) {}
//! returns the total number of process ranks
int size() const { return size_; }
//! returns the total number of global blocks
int nblocks() const { return nblocks_; }
//! sets the total number of global blocks
void set_nblocks(int nblocks) { nblocks_ = nblocks; }
//! gets the local gids for a given process rank
virtual void local_gids(int rank, std::vector<int>& gids) const =0;
virtual void set_nblocks(int nblocks__) { nblocks_ = nblocks__; }
//! returns the process rank of the block with global id gid (need not be local)
virtual int rank(int gid) const =0;
//! batch lookup; returns the process ranks of the blocks with global id in the vector gids
inline
virtual std::vector<int>
ranks(const std::vector<int>& gids) const;
private:
int size_; // total number of ranks
int nblocks_; // total number of blocks
};
class ContiguousAssigner: public Assigner
class StaticAssigner: public Assigner
{
public:
/**
* \ingroup Assignment
* \brief Intermediate type to express assignment that cannot change; adds `local_gids` query method
*/
using Assigner::Assigner;
//! gets the local gids for a given process rank
virtual void local_gids(int rank, std::vector<int>& gids) const =0;
};
class ContiguousAssigner: public StaticAssigner
{
public:
/**
* \ingroup Assignment
* \brief Assigns blocks to processes in contiguous gid (block global id) order
*/
ContiguousAssigner(int size, //!< total number of processes
int nblocks //!< total (global) number of blocks
):
Assigner(size, nblocks) {}
using StaticAssigner::StaticAssigner;
using Assigner::size;
using Assigner::nblocks;
using StaticAssigner::size;
using StaticAssigner::nblocks;
int rank(int gid) const
int rank(int gid) const override
{
int div = nblocks() / size();
int mod = nblocks() % size();
......@@ -65,47 +80,87 @@ namespace diy
}
}
inline
void local_gids(int rank, std::vector<int>& gids) const;
void local_gids(int rank, std::vector<int>& gids) const override;
};
class RoundRobinAssigner: public Assigner
class RoundRobinAssigner: public StaticAssigner
{
public:
/**
* \ingroup Assignment
* \brief Assigns blocks to processes in cyclic or round-robin gid (block global id) order
*/
RoundRobinAssigner(int size, //!< total number of processes
int nblocks //!< total (global) number of blocks
):
Assigner(size, nblocks) {}
using StaticAssigner::StaticAssigner;
using StaticAssigner::size;
using StaticAssigner::nblocks;
int rank(int gid) const override { return gid % size(); }
inline
void local_gids(int rank, std::vector<int>& gids) const override;
};
using Assigner::size;
using Assigner::nblocks;
class DynamicAssigner: public Assigner
{
public:
DynamicAssigner(const mpi::communicator& comm, int size__, int nblocks__):
Assigner(size__, nblocks__),
comm_(comm),
div_(nblocks__ / size__ + ((nblocks__ % size__) == 0 ? 0 : 1)), // NB: same size window everywhere means the last rank may allocate extra space
rank_map_(comm_, div_) { rank_map_.lock_all(MPI_MODE_NOCHECK); }
~DynamicAssigner() { rank_map_.unlock_all(); }
int rank(int gid) const { return gid % size(); }
inline
void local_gids(int rank, std::vector<int>& gids) const;
virtual void set_nblocks(int nblocks__) override;
inline
virtual int rank(int gid) const override;
inline
virtual std::vector<int>
ranks(const std::vector<int>& gids) const override;
inline std::tuple<bool,int>
get_rank(int& rk, int gid) const;
inline void set_rank(const int& rk, int gid, bool flush = true);
inline void set_ranks(const std::vector<std::tuple<int,int>>& rank_gids);
std::tuple<int,int>
rank_offset(int gid) const { return std::make_tuple(gid / div_, gid % div_); }
private:
mpi::communicator comm_;
int div_;
mutable mpi::window<int> rank_map_;
};
}
std::vector<int>
diy::Assigner::
ranks(const std::vector<int>& gids) const
{
std::vector<int> result(gids.size());
for (size_t i = 0; i < gids.size(); ++i)
result[i] = rank(gids[i]);
return result;
}
void
diy::ContiguousAssigner::
local_gids(int rank, std::vector<int>& gids) const
local_gids(int rank_, std::vector<int>& gids) const
{
int div = nblocks() / size();
int mod = nblocks() % size();
int from, to;
if (rank < mod)
from = rank * (div + 1);
if (rank_ < mod)
from = rank_ * (div + 1);
else
from = mod * (div + 1) + (rank - mod) * div;
from = mod * (div + 1) + (rank_ - mod) * div;
if (rank + 1 < mod)
to = (rank + 1) * (div + 1);
if (rank_ + 1 < mod)
to = (rank_ + 1) * (div + 1);
else
to = mod * (div + 1) + (rank + 1 - mod) * div;
to = mod * (div + 1) + (rank_ + 1 - mod) * div;
for (int gid = from; gid < to; ++gid)
gids.push_back(gid);
......@@ -113,9 +168,9 @@ local_gids(int rank, std::vector<int>& gids) const
void
diy::RoundRobinAssigner::
local_gids(int rank, std::vector<int>& gids) const
local_gids(int rank_, std::vector<int>& gids) const
{
int cur = rank;
int cur = rank_;
while (cur < nblocks())
{
gids.push_back(cur);
......@@ -123,4 +178,87 @@ local_gids(int rank, std::vector<int>& gids) const
}
}
void
diy::DynamicAssigner::
set_nblocks(int nblocks__)
{
Assigner::set_nblocks(nblocks__);
div_ = nblocks() / size() + ((nblocks() % size()) == 0 ? 0 : 1);
rank_map_.unlock_all();
rank_map_ = mpi::window<int>(comm_, div_);
rank_map_.lock_all(MPI_MODE_NOCHECK);
}
std::tuple<bool,int>
diy::DynamicAssigner::
get_rank(int& rk, int gid) const
{
// TODO: check if the gid is in cache
int r,offset;
std::tie(r,offset) = rank_offset(gid);
rank_map_.get(rk, r, offset);
return std::make_tuple(false, r); // false indicates that the data wasn't read from cache
}
int
diy::DynamicAssigner::
rank(int gid) const
{
int rk;
auto cached_gidrk = get_rank(rk, gid);
int gidrk = std::get<1>(cached_gidrk);
rank_map_.flush_local(gidrk);
return rk;
}
std::vector<int>
diy::DynamicAssigner::
ranks(const std::vector<int>& gids) const
{
bool all_cached = true;
std::vector<int> result(gids.size(), -1);
for (size_t i = 0; i < gids.size(); ++i)
{
auto cached_gidrk = get_rank(result[i], gids[i]);
bool cached = std::get<0>(cached_gidrk);
all_cached &= cached;
}
if (!all_cached)
rank_map_.flush_local_all();
return result;
}
void
diy::DynamicAssigner::
set_rank(const int& rk, int gid, bool flush)
{
// TODO: update cache
int r,offset;
std::tie(r,offset) = rank_offset(gid);
rank_map_.put(rk, r, offset);
if (flush)
rank_map_.flush(r);
}
void
diy::DynamicAssigner::
set_ranks(const std::vector<std::tuple<int,int>>& rank_gids)
{
for (auto& rg : rank_gids)
set_rank(std::get<0>(rg), std::get<1>(rg), false);
rank_map_.flush_all();
}
#endif
......@@ -23,26 +23,26 @@ namespace diy
typedef detail::Load Load;
public:
Collection(Create create,
Destroy destroy,
ExternalStorage* storage,
Save save,
Load load):
create_(create),
destroy_(destroy),
storage_(storage),
save_(save),
load_(load),
Collection(Create create__,
Destroy destroy__,
ExternalStorage* storage__,
Save save__,
Load load__):
create_(create__),
destroy_(destroy__),
storage_(storage__),
save_(save__),
load_(load__),
in_memory_(0) {}
size_t size() const { return elements_.size(); }
const CInt& in_memory() const { return in_memory_; }
inline void clear();
int add(Element e) { elements_.push_back(e); external_.push_back(-1); ++(*in_memory_.access()); return elements_.size() - 1; }
void* release(int i) { void* e = get(i); elements_[i] = 0; return e; }
int add(Element e) { elements_.push_back(e); external_.push_back(-1); ++(*in_memory_.access()); return static_cast<int>(elements_.size()) - 1; }
void* release(int i) { void* e = get(i); elements_[static_cast<size_t>(i)] = 0; return e; }
void* find(int i) const { return elements_[i]; } // possibly returns 0, if the element is unloaded
void* find(int i) const { return elements_[static_cast<size_t>(i)]; } // possibly returns 0, if the element is unloaded
void* get(int i) { if (!find(i)) load(i); return find(i); } // loads the element first, and then returns its address
int available() const { int i = 0; for (; i < (int)size(); ++i) if (find(i) != 0) break; return i; }
......@@ -56,7 +56,7 @@ namespace diy
Save saver() const { return save_; }
void* create() const { return create_(); }
void destroy(int i) { if (find(i)) { destroy_(find(i)); elements_[i] = 0; } else if (external_[i] != -1) storage_->destroy(external_[i]); }
void destroy(int i) { if (find(i)) { destroy_(find(i)); elements_[static_cast<size_t>(i)] = 0; } else if (external_[static_cast<size_t>(i)] != -1) storage_->destroy(external_[static_cast<size_t>(i)]); }
bool own() const { return destroy_ != 0; }
......@@ -81,7 +81,7 @@ clear()
{
if (own())
for (size_t i = 0; i < size(); ++i)
destroy(i);
destroy(static_cast<int>(i));
elements_.clear();
external_.clear();
*in_memory_.access() = 0;
......@@ -95,10 +95,10 @@ unload(int i)
void* e = find(i);
//save_(e, bb);
//external_[i] = storage_->put(bb);
external_[i] = storage_->put(e, save_);
external_[static_cast<size_t>(i)] = storage_->put(e, save_);
destroy_(e);
elements_[i] = 0;
elements_[static_cast<size_t>(i)] = 0;
--(*in_memory_.access());
}
......@@ -111,9 +111,9 @@ load(int i)
//storage_->get(external_[i], bb);
void* e = create_();
//load_(e, bb);
storage_->get(external_[i], e, load_);
elements_[i] = e;
external_[i] = -1;
storage_->get(external_[static_cast<size_t>(i)], e, load_);
elements_[static_cast<size_t>(i)] = e;
external_[static_cast<size_t>(i)] = -1;
++(*in_memory_.access());
}
......
......@@ -2,19 +2,12 @@
#define DIY_CONSTANTS_H
// Default DIY_MAX_DIM to 4, unless provided by the user
// (used for static array size in various Bounds (bb_t in types.h))
// (used for static min/max size in various Bounds)
#ifndef DIY_MAX_DIM
#define DIY_MAX_DIM 4
#endif
/* DIY directions: neighbor direction enumeration
used to identify direction of one neighbor block for both regular and
wrapround neighbors
can be bitwise ORed, eg., maximum-side neighboring block in 3 dimensions
would be DIY_X1 | DIY_Y1 | DIY_Z1
each use identifies exactly one block
eg. DIY_X0 is the (one) left neighbor block, not the entire left plane */
typedef enum
enum
{
DIY_X0 = 0x01, /* minimum-side x (left) neighbor */
DIY_X1 = 0x02, /* maximum-side x (right) neighbor */
......@@ -24,6 +17,8 @@ typedef enum
DIY_Z1 = 0x20, /* maximum-side z (front)neighbor */
DIY_T0 = 0x40, /* minimum-side t (earlier) neighbor */
DIY_T1 = 0x80 /* maximum-side t (later) neighbor */
} dir_t;
};
#define DIY_UNUSED(expr) do { (void)(expr); } while (0)
#endif