diff --git a/README.md b/README.md index bcddcc352e5b39e484905960359a8d9cf812e561..dba302433b5907f4e16ac9a052ad39679c7467c7 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ -## DIY2 is a data-parallel out-of-core library +## DIY2 is a block-parallel library -DIY2 is a data-parallel library for implementing scalable algorithms that can execute both +DIY2 is a block-parallel library for implementing scalable algorithms that can execute both in-core and out-of-core. The same program can be executed with one or more threads per MPI process, seamlessly combining distributed-memory message passing with shared-memory thread -parallelism. The abstraction enabling these capabilities is block-based parallelism; blocks +parallelism. The abstraction enabling these capabilities is block parallelism; blocks and their message queues are mapped onto processing elements (MPI processes or threads) and are migrated between memory and storage by the DIY2 runtime. Complex communication patterns, including neighbor exchange, merge reduction, swap reduction, and all-to-all exchange, are diff --git a/include/vtkdiy/algorithms.hpp b/include/vtkdiy/algorithms.hpp index b620a8f27dc6c8791e2caec81452481451e947a2..6920a5dd4c292b11fa60bd9da54509c696ce69d2 100644 --- a/include/vtkdiy/algorithms.hpp +++ b/include/vtkdiy/algorithms.hpp @@ -11,6 +11,7 @@ #include "detail/algorithms/sort.hpp" #include "detail/algorithms/kdtree.hpp" +#include "detail/algorithms/kdtree-sampling.hpp" namespace diy { @@ -36,7 +37,8 @@ namespace diy detail::SampleSort<Block,T,Cmp> sorter(values, samples, cmp, num_samples); // swap-reduce to all-gather samples - RegularSwapPartners partners(1, assigner.nblocks(), k); + RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()), assigner.nblocks()); + RegularSwapPartners partners(decomposer, k); reduce(master, assigner, partners, sorter.sample(), detail::SkipIntermediate(partners.rounds())); // all_to_all to exchange the values @@ -66,7 +68,7 @@ namespace diy /** * \ingroup Algorithms - * \brief build a kd-tree and sort a set of points into it + * \brief build a kd-tree and sort a set of points into it (use histograms to determine split values) */ template<class Block, class Point> void kdtree(Master& master, //!< master object @@ -85,11 +87,33 @@ namespace diy typedef diy::RegularContinuousLink RCLink; - for (int i = 0; i < master.size(); ++i) + for (size_t i = 0; i < master.size(); ++i) { RCLink* link = static_cast<RCLink*>(master.link(i)); - link->core() = domain; - link->bounds() = domain; + *link = RCLink(dim, domain, domain); + + if (wrap) // set up the links to self + { + diy::BlockID self = { master.gid(i), master.communicator().rank() }; + for (int j = 0; j < dim; ++j) + { + diy::Direction dir, wrap_dir; + + // left + dir.x[j] = -1; wrap_dir.x[j] = -1; + link->add_neighbor(self); + link->add_bounds(domain); + link->add_direction(dir); + link->add_wrap(wrap_dir); + + // right + dir.x[j] = 1; wrap_dir.x[j] = 1; + link->add_neighbor(self); + link->add_bounds(domain); + link->add_direction(dir); + link->add_wrap(wrap_dir); + } + } } detail::KDTreePartition<Block,Point> kdtree_partition(dim, points, bins); @@ -99,11 +123,73 @@ namespace diy // update master.expected to match the links int expected = 0; - for (int i = 0; i < master.size(); ++i) + for (size_t i = 0; i < master.size(); ++i) expected += master.link(i)->size_unique(); master.set_expected(expected); } + /** + * \ingroup Algorithms + * \brief build a kd-tree and sort a set of points into it (use sampling to 
determine split values) + */ + template<class Block, class Point> + void kdtree_sampling + (Master& master, //!< master object + const Assigner& assigner, //!< assigner object + int dim, //!< dimensionality + const ContinuousBounds& domain, //!< global data extents + std::vector<Point> Block::* points, //!< input points to sort into kd-tree + size_t samples, //!< number of samples to take in each block + bool wrap = false)//!< periodic boundaries in all dimensions + { + if (assigner.nblocks() & (assigner.nblocks() - 1)) + { + fprintf(stderr, "KD-tree requires a number of blocks that's a power of 2, got %d\n", assigner.nblocks()); + std::abort(); + } + + typedef diy::RegularContinuousLink RCLink; + + for (size_t i = 0; i < master.size(); ++i) + { + RCLink* link = static_cast<RCLink*>(master.link(i)); + *link = RCLink(dim, domain, domain); + + if (wrap) // set up the links to self + { + diy::BlockID self = { master.gid(i), master.communicator().rank() }; + for (int j = 0; j < dim; ++j) + { + diy::Direction dir, wrap_dir; + + // left + dir.x[j] = -1; wrap_dir.x[j] = -1; + link->add_neighbor(self); + link->add_bounds(domain); + link->add_direction(dir); + link->add_wrap(wrap_dir); + + // right + dir.x[j] = 1; wrap_dir.x[j] = 1; + link->add_neighbor(self); + link->add_bounds(domain); + link->add_direction(dir); + link->add_wrap(wrap_dir); + } + } + } + + detail::KDTreeSamplingPartition<Block,Point> kdtree_partition(dim, points, samples); + + detail::KDTreePartners partners(dim, assigner.nblocks(), wrap, domain); + reduce(master, assigner, partners, kdtree_partition); + + // update master.expected to match the links + int expected = 0; + for (size_t i = 0; i < master.size(); ++i) + expected += master.link(i)->size_unique(); + master.set_expected(expected); + } } #endif diff --git a/include/vtkdiy/constants.h b/include/vtkdiy/constants.h index 5ea7fd7029fbd86fc432303634f2aedd1f6aaca7..cf087a5c94be49788ae4ce9a83e14f569e271cb3 100644 --- a/include/vtkdiy/constants.h +++ b/include/vtkdiy/constants.h @@ -7,23 +7,9 @@ #define DIY_MAX_DIM 4 #endif -/* DIY directions: neighbor direction enumeration - used to identify direction of one neighbor block for both regular and - wrapround neighbors - can be bitwise ORed, eg., maximum-side neighboring block in 3 dimensions - would be DIY_X1 | DIY_Y1 | DIY_Z1 - each use identifies exactly one block - eg. DIY_X0 is the (one) left neighbor block, not the entire left plane */ -typedef enum +struct dir_t { - DIY_X0 = 0x01, /* minimum-side x (left) neighbor */ - DIY_X1 = 0x02, /* maximum-side x (right) neighbor */ - DIY_Y0 = 0x04, /* minimum-side y (bottom) neighbor */ - DIY_Y1 = 0x08, /* maximum-side y (top) neighbor */ - DIY_Z0 = 0x10, /* minimum-side z (back) neighbor */ - DIY_Z1 = 0x20, /* maximum-side z (front)neighbor */ - DIY_T0 = 0x40, /* minimum-side t (earlier) neighbor */ - DIY_T1 = 0x80 /* maximum-side t (later) neighbor */ -} dir_t; + int x[DIY_MAX_DIM]; +}; #endif diff --git a/include/vtkdiy/critical-resource.hpp b/include/vtkdiy/critical-resource.hpp index 0b81e3169d8b5251b55fe459bc29452d89027fee..61a5a4b8a714a988cadc3b095a14ef9cd89284c7 100644 --- a/include/vtkdiy/critical-resource.hpp +++ b/include/vtkdiy/critical-resource.hpp @@ -3,6 +3,14 @@ namespace diy { + // TODO: when not running under C++11, i.e., when lock_guard is TinyThread's + // lock_guard, and not C++11's unique_lock, this implementation might + // be buggy since the copy constructor is invoked when + // critical_resource::access() returns an instance of this class. 
Once + // the temporary is destroyed the mutex is unlocked. I'm not 100% + // certain of this because I'd expect a deadlock on copy constructor, + // but it's clearly not happening -- so I may be missing something. + // (This issue will take care of itself in DIY3 once we switch to C++11 completely.) template<class T, class Mutex> class resource_accessor { diff --git a/include/vtkdiy/decomposition.hpp b/include/vtkdiy/decomposition.hpp index f285f8409d1b2b811b36a121a4a6d90c9f55c68c..a51123c211b5ff3cc862de9c714de92ee2b8c922 100644 --- a/include/vtkdiy/decomposition.hpp +++ b/include/vtkdiy/decomposition.hpp @@ -5,6 +5,8 @@ #include <algorithm> #include <iostream> #include <cmath> +#include <sstream> +#include <stdexcept> #include "link.hpp" #include "assigner.hpp" @@ -74,35 +76,37 @@ namespace detail typedef std::vector<Coordinate> CoordinateVector; typedef std::vector<int> DivisionsVector; - /// @param assigner: decides how processors are assigned to blocks (maps a gid to a rank) - /// also communicates the total number of blocks /// @param wrap: indicates dimensions on which to wrap the boundary /// @param ghosts: indicates how many ghosts to use in each dimension /// @param divisions: indicates how many cuts to make along each dimension /// (0 means "no constraint," i.e., leave it up to the algorithm) RegularDecomposer(int dim_, const Bounds& domain_, - const Assigner& assigner_, + int nblocks_, BoolVector share_face_ = BoolVector(), BoolVector wrap_ = BoolVector(), CoordinateVector ghosts_ = CoordinateVector(), DivisionsVector divisions_ = DivisionsVector()): - dim(dim_), domain(domain_), assigner(assigner_), + dim(dim_), domain(domain_), nblocks(nblocks_), share_face(share_face_), wrap(wrap_), ghosts(ghosts_), divisions(divisions_) { - if (share_face.size() < dim) share_face.resize(dim); - if (wrap.size() < dim) wrap.resize(dim); - if (ghosts.size() < dim) ghosts.resize(dim); - if (divisions.size() < dim) divisions.resize(dim); + if ((int) share_face.size() < dim) share_face.resize(dim); + if ((int) wrap.size() < dim) wrap.resize(dim); + if ((int) ghosts.size() < dim) ghosts.resize(dim); + if ((int) divisions.size() < dim) divisions.resize(dim); - int nblocks = assigner.nblocks(); - fill_divisions(nblocks); + fill_divisions(divisions); } // Calls create(int gid, const Bounds& bounds, const Link& link) template<class Creator> - void decompose(int rank, const Creator& create); + void decompose(int rank, const Assigner& assigner, const Creator& create); + + template<class Updater> + void decompose(int rank, const Assigner& assigner, Master& master, const Updater& update); + + void decompose(int rank, const Assigner& assigner, Master& master); // find lowest gid that owns a particular point template<class Point> @@ -110,7 +114,7 @@ namespace detail void gid_to_coords(int gid, DivisionsVector& coords) const { gid_to_coords(gid, coords, divisions); } int coords_to_gid(const DivisionsVector& coords) const { return coords_to_gid(coords, divisions); } - void fill_divisions(int nblocks) { fill_divisions(dim, nblocks, divisions); } + void fill_divisions(std::vector<int>& divisions) const; void fill_bounds(Bounds& bounds, const DivisionsVector& coords, bool add_ghosts = false) const; void fill_bounds(Bounds& bounds, int gid, bool add_ghosts = false) const; @@ -118,7 +122,7 @@ namespace detail static bool all(const std::vector<int>& v, int x); static void gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions); static int coords_to_gid(const DivisionsVector& coords, const 
DivisionsVector& divisions); - static void fill_divisions(int dim, int nblocks, std::vector<int>& divisions); + static void factor(std::vector<unsigned>& factors, int n); // Point to GIDs functions @@ -137,8 +141,8 @@ namespace detail int dim; - const Bounds& domain; - const Assigner& assigner; + Bounds domain; + int nblocks; BoolVector share_face; BoolVector wrap; CoordinateVector ghosts; @@ -162,7 +166,7 @@ namespace detail * * `create(...)` is called with each block assigned to the local domain. See [decomposition example](#decomposition-example). */ - template<class Bounds, class Assigner, class Creator> + template<class Bounds, class Creator> void decompose(int dim, int rank, const Bounds& domain, @@ -173,7 +177,7 @@ namespace detail typename RegularDecomposer<Bounds>::CoordinateVector ghosts = typename RegularDecomposer<Bounds>::CoordinateVector(), typename RegularDecomposer<Bounds>::DivisionsVector divs = typename RegularDecomposer<Bounds>::DivisionsVector()) { - RegularDecomposer<Bounds>(dim, domain, assigner, share_face, wrap, ghosts, divs).decompose(rank, create); + RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, create); } namespace detail @@ -195,6 +199,26 @@ namespace detail diy::Master* master_; }; + + template<class Bounds, class Update> + struct Updater + { + typedef typename RegularDecomposer<Bounds>::Link Link; + + Updater(diy::Master* master, const Update& update): + master_(master), update_(update) {} + + void operator()(int gid, const Bounds& core, const Bounds& bounds, const Bounds& domain, const Link& link) const + { + int lid = master_->lid(gid); + Link* l = new Link(link); + master_->replace_link(lid, l); + update_(gid, lid, core, bounds, domain, *l); + } + + diy::Master* master_; + const Update& update_; + }; } /** @@ -213,7 +237,7 @@ namespace detail * * `master` must have been supplied a create function in order for this function to work. 
*/ - template<class Bounds, class Assigner> + template<class Bounds> void decompose(int dim, int rank, const Bounds& domain, @@ -224,7 +248,7 @@ namespace detail typename RegularDecomposer<Bounds>::CoordinateVector ghosts = typename RegularDecomposer<Bounds>::CoordinateVector(), typename RegularDecomposer<Bounds>::DivisionsVector divs = typename RegularDecomposer<Bounds>::DivisionsVector()) { - RegularDecomposer<Bounds>(dim, domain, assigner, share_face, wrap, ghosts, divs).decompose(rank, detail::AddBlock<Bounds>(&master)); + RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, master); } /** @@ -248,15 +272,53 @@ namespace detail master.add(local_gids[i], master.create(), new diy::Link); } + /** + * \ingroup Decomposition + * \brief Add a decomposition (modify links) of an existing set of blocks that were + * added to the master previously + * + * @param rank local rank + * @param assigner decides how processors are assigned to blocks (maps a gid to a rank) + * also communicates the total number of blocks + */ + template<class Bounds, class Update> + void decompose(int dim, + int rank, + const Bounds& domain, + const Assigner& assigner, + Master& master, + const Update& update, + typename RegularDecomposer<Bounds>::BoolVector share_face = + typename RegularDecomposer<Bounds>::BoolVector(), + typename RegularDecomposer<Bounds>::BoolVector wrap = + typename RegularDecomposer<Bounds>::BoolVector(), + typename RegularDecomposer<Bounds>::CoordinateVector ghosts = + typename RegularDecomposer<Bounds>::CoordinateVector(), + typename RegularDecomposer<Bounds>::DivisionsVector divs = + typename RegularDecomposer<Bounds>::DivisionsVector()) + { + RegularDecomposer<Bounds>(dim, domain, share_face, wrap, ghosts, divs). + decompose(rank, assigner, master, update); + } + //! Decomposition example: \example decomposition/test-decomposition.cpp //! 
Direct master insertion example: \example decomposition/test-direct-master.cpp } +// decomposes domain and adds blocks to the master +template<class Bounds> +void +diy::RegularDecomposer<Bounds>:: +decompose(int rank, const Assigner& assigner, Master& master) +{ + decompose(rank, assigner, detail::AddBlock<Bounds>(&master)); +} + template<class Bounds> template<class Creator> void diy::RegularDecomposer<Bounds>:: -decompose(int rank, const Creator& create) +decompose(int rank, const Assigner& assigner, const Creator& create) { std::vector<int> gids; assigner.local_gids(rank, gids); @@ -289,7 +351,7 @@ decompose(int rank, const Creator& create) if (all(offsets, 0)) continue; // skip ourselves DivisionsVector nhbr_coords(dim); - int dir = 0; + Direction dir, wrap_dir; bool inbounds = true; for (int i = 0; i < dim; ++i) { @@ -301,7 +363,7 @@ decompose(int rank, const Creator& create) if (wrap[i]) { nhbr_coords[i] = divisions[i] - 1; - link.add_wrap(Direction(1 << 2*i)); + wrap_dir[i] = -1; } else inbounds = false; @@ -312,17 +374,15 @@ decompose(int rank, const Creator& create) if (wrap[i]) { nhbr_coords[i] = 0; - link.add_wrap(Direction(1 << (2*i + 1))); + wrap_dir[i] = 1; } else inbounds = false; } // NB: this needs to match the addressing scheme in dir_t (in constants.h) - if (offsets[i] == -1) - dir |= 1 << (2*i + 0); - if (offsets[i] == 1) - dir |= 1 << (2*i + 1); + if (offsets[i] == -1 || offsets[i] == 1) + dir[i] = offsets[i]; } if (!inbounds) continue; @@ -334,13 +394,24 @@ decompose(int rank, const Creator& create) fill_bounds(nhbr_bounds, nhbr_coords); link.add_bounds(nhbr_bounds); - link.add_direction(static_cast<Direction>(dir)); + link.add_direction(dir); + link.add_wrap(wrap_dir); } create(gid, core, bounds, domain, link); } } +// decomposes domain but does not add blocks to master, assumes they were added already +template<class Bounds> +template<class Update> +void +diy::RegularDecomposer<Bounds>:: +decompose(int rank, const Assigner& assigner, Master& master, const Update& update) +{ + decompose(rank, assigner, detail::Updater<Bounds,Update>(&master, update)); +} + template<class Bounds> bool diy::RegularDecomposer<Bounds>:: @@ -436,40 +507,116 @@ fill_bounds(Bounds& bounds, //!< (output) bounds fill_bounds(bounds, coords); } +namespace diy { namespace detail { +// current state of division in one dimension used in fill_divisions below +template<class Coordinate> +struct Div +{ + int dim; // 0, 1, 2, etc. e.g. for x, y, z etc. 
+ int nb; // number of blocks so far in this dimension + Coordinate b_size; // block size so far in this dimension + + // sort on descending block size unless tied, in which case + // sort on ascending num blocks in current dim unless tied, in which case + // sort on ascending dimension + bool operator<(Div rhs) const + { + // sort on second value of the pair unless tied, in which case sort on first + if (b_size == rhs.b_size) + { + if (nb == rhs.nb) + return(dim < rhs.dim); + return(nb < rhs.nb); + } + return(b_size > rhs.b_size); + } +}; +} } + template<class Bounds> void diy::RegularDecomposer<Bounds>:: -fill_divisions(int dim, int nblocks, std::vector<int>& divisions) +fill_divisions(std::vector<int>& divisions) const { - int prod = 1; int c = 0; - for (unsigned i = 0; i < dim; ++i) - if (divisions[i] != 0) + // prod = number of blocks unconstrained by user; c = number of unconstrained dimensions + int prod = 1; int c = 0; + for (int i = 0; i < dim; ++i) + if (divisions[i] != 0) + { + prod *= divisions[i]; + ++c; + } + + if (nblocks % prod != 0) { - prod *= divisions[i]; - ++c; + fprintf(stderr, "Total number of blocks cannot be factored into provided divs\n"); + return; } - if (nblocks % prod != 0) - { - std::cerr << "Incompatible requirements" << std::endl; - return; - } + if (c == (int) divisions.size()) // nothing to do; user provided all divs + return; - if (c == divisions.size()) - return; + // factor number of blocks left in unconstrained dimensions + // factorization is sorted from smallest to largest factors + std::vector<unsigned> factors; + factor(factors, nblocks/prod); + + using detail::Div; + std::vector< Div<Coordinate> > missing_divs; // pairs consisting of (dim, #divs) - std::vector<unsigned> factors; - factor(factors, nblocks/prod); + // init missing_divs + for (int i = 0; i < dim; i++) + { + if (divisions[i] == 0) + { + Div<Coordinate> div; + div.dim = i; + div.nb = 1; + div.b_size = domain.max[i] - domain.min[i]; + missing_divs.push_back(div); + } + } - // Fill the missing divs using LPT algorithm - std::vector<unsigned> missing_divs(divisions.size() - c, 1); - for (int i = factors.size() - 1; i >= 0; --i) - *std::min_element(missing_divs.begin(), missing_divs.end()) *= factors[i]; + // iterate over factorization of number of blocks (factors are sorted smallest to largest) + // NB: using int instead of size_t because must be negative in order to break out of loop + for (int i = factors.size() - 1; i >= 0; --i) + { + // fill in missing divs by dividing dimension w/ largest block size + // except when this would be illegal (resulting in bounds.max < bounds.min; + // only a problem for discrete bounds + + // sort on decreasing block size + std::sort(missing_divs.begin(), missing_divs.end()); + + // split the dimension with the largest block size (first element in vector) + Coordinate min = + detail::BoundsHelper<Bounds>::from(0, + missing_divs[0].nb * factors[i], + domain.min[missing_divs[0].dim], + domain.max[missing_divs[0].dim], + share_face[missing_divs[0].dim]); + Coordinate max = + detail::BoundsHelper<Bounds>::to(0, + missing_divs[0].nb * factors[i], + domain.min[missing_divs[0].dim], + domain.max[missing_divs[0].dim], + share_face[missing_divs[0].dim]); + if (max >= min) + { + missing_divs[0].nb *= factors[i]; + missing_divs[0].b_size = max - min; + } + else + { + std::ostringstream oss; + oss << "Unable to decompose domain into " << nblocks << " blocks: " << min << " " << max; + throw std::runtime_error(oss.str()); + } + } - c = 0; - for (unsigned i = 0; i < 
dim; ++i) - if (divisions[i] == 0) - divisions[i] = missing_divs[c++]; + // assign the divisions + for (size_t i = 0; i < missing_divs.size(); i++) + divisions[missing_divs[i].dim] = missing_divs[i].nb; } template<class Bounds> @@ -478,7 +625,7 @@ diy::RegularDecomposer<Bounds>:: factor(std::vector<unsigned>& factors, int n) { while (n != 1) - for (unsigned i = 2; i <= n; ++i) + for (int i = 2; i <= n; ++i) { if (n % i == 0) { diff --git a/include/vtkdiy/detail/algorithms/kdtree-sampling.hpp b/include/vtkdiy/detail/algorithms/kdtree-sampling.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1b21a91de97dd1550b123c8840c4326059b26a15 --- /dev/null +++ b/include/vtkdiy/detail/algorithms/kdtree-sampling.hpp @@ -0,0 +1,453 @@ +#ifndef DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP +#define DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP + +#include <vector> +#include <cassert> +#include "../../partners/all-reduce.hpp" + +// TODO: technically, what's done now is not a perfect subsample: +// we take the same number of samples from every block, in reality this number should be selected at random, +// so that the total number of samples adds up to samples*nblocks +// +// NB: random samples are chosen using rand(), which is assumed to be seeded +// externally. Once we switch to C++11, we should use its more advanced +// random number generators (and take a generator as an external parameter) +// (TODO) + +namespace diy +{ +namespace detail +{ + +template<class Block, class Point> +struct KDTreeSamplingPartition +{ + typedef diy::RegularContinuousLink RCLink; + typedef diy::ContinuousBounds Bounds; + + typedef std::vector<float> Samples; + + KDTreeSamplingPartition(int dim, + std::vector<Point> Block::* points, + size_t samples): + dim_(dim), points_(points), samples_(samples) {} + + void operator()(void* b_, const diy::ReduceProxy& srp, const KDTreePartners& partners) const; + + int divide_gid(int gid, bool lower, int round, int rounds) const; + void update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const; + void split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const; + diy::Direction + find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const; + + void compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const; + void add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const; + void receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const; + void forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const; + + void enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const; + void dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const; + + void update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const; + bool intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const; + float find_split(const Bounds& changed, const Bounds& original) const; + + int dim_; + std::vector<Point> Block::* points_; + size_t samples_; +}; + +} +} + + +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +operator()(void* b_, const diy::ReduceProxy& srp, const KDTreePartners& partners) const +{ + Block* b = static_cast<Block*>(b_); + + int dim; + if (srp.round() < partners.rounds()) + dim = partners.dim(srp.round()); + else + dim = partners.dim(srp.round() - 1); + + if 
(srp.round() == partners.rounds()) + update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round + else if (partners.swap_round(srp.round()) && partners.sub_round(srp.round()) < 0) // link round + { + dequeue_exchange(b, srp, dim); // from the swap round + split_to_neighbors(b, srp, dim); + } + else if (partners.swap_round(srp.round())) + { + Samples samples; + receive_samples(b, srp, samples); + enqueue_exchange(b, srp, dim, samples); + } else if (partners.sub_round(srp.round()) == 0) + { + if (srp.round() > 0) + { + int prev_dim = dim - 1; + if (prev_dim < 0) + prev_dim += dim_; + update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round + } + + compute_local_samples(b, srp, dim); + } else if (partners.sub_round(srp.round()) < (int) partners.histogram.rounds()/2) // we are reusing partners class, so really we are talking about the samples rounds here + { + Samples samples; + add_samples(b, srp, samples); + srp.enqueue(srp.out_link().target(0), samples); + } else + { + Samples samples; + add_samples(b, srp, samples); + if (samples.size() != 1) + { + // pick the median + std::nth_element(samples.begin(), samples.begin() + samples.size()/2, samples.end()); + std::swap(samples[0], samples[samples.size()/2]); + //std::sort(samples.begin(), samples.end()); + //samples[0] = (samples[samples.size()/2] + samples[samples.size()/2 + 1])/2; + samples.resize(1); + } + forward_samples(b, srp, samples); + } +} + +template<class Block, class Point> +int +diy::detail::KDTreeSamplingPartition<Block,Point>:: +divide_gid(int gid, bool lower, int round, int rounds) const +{ + if (lower) + gid &= ~(1 << (rounds - 1 - round)); + else + gid |= (1 << (rounds - 1 - round)); + return gid; +} + +// round here is the outer iteration of the algorithm +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const +{ + int gid = srp.gid(); + int lid = srp.master()->lid(gid); + RCLink* link = static_cast<RCLink*>(srp.master()->link(lid)); + + // (gid, dir) -> i + std::map<std::pair<int,diy::Direction>, int> link_map; + for (int i = 0; i < link->size(); ++i) + link_map[std::make_pair(link->target(i).gid, link->direction(i))] = i; + + // NB: srp.enqueue(..., ...) 
should match the link + std::vector<float> splits(link->size()); + for (int i = 0; i < link->size(); ++i) + { + float split; diy::Direction dir; + + int in_gid = link->target(i).gid; + while(srp.incoming(in_gid)) + { + srp.dequeue(in_gid, split); + srp.dequeue(in_gid, dir); + + // reverse dir + for (int j = 0; j < dim_; ++j) + dir[j] = -dir[j]; + + int k = link_map[std::make_pair(in_gid, dir)]; + //printf("%d %d %f -> %d\n", in_gid, dir, split, k); + splits[k] = split; + } + } + + RCLink new_link(dim_, link->core(), link->core()); + + bool lower = !(gid & (1 << (rounds - 1 - round))); + + // fill out the new link + for (int i = 0; i < link->size(); ++i) + { + diy::Direction dir = link->direction(i); + //diy::Direction wrap_dir = link->wrap(i); // we don't use existing wrap, but restore it from scratch + if (dir[dim] != 0) + { + if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && !lower)) + { + int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds); + diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) }; + new_link.add_neighbor(nbr); + + new_link.add_direction(dir); + + Bounds bounds = link->bounds(i); + update_neighbor_bounds(bounds, splits[i], dim, !lower); + new_link.add_bounds(bounds); + + if (wrap) + new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain)); + else + new_link.add_wrap(diy::Direction()); + } + } else // non-aligned side + { + for (int j = 0; j < 2; ++j) + { + int nbr_gid = divide_gid(link->target(i).gid, j == 0, round, rounds); + + Bounds bounds = link->bounds(i); + update_neighbor_bounds(bounds, splits[i], dim, j == 0); + + if (intersects(bounds, new_link.bounds(), dim, wrap, domain)) + { + diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) }; + new_link.add_neighbor(nbr); + new_link.add_direction(dir); + new_link.add_bounds(bounds); + + if (wrap) + new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain)); + else + new_link.add_wrap(diy::Direction()); + } + } + } + } + + // add link to the dual block + int dual_gid = divide_gid(gid, !lower, round, rounds); + diy::BlockID dual = { dual_gid, srp.assigner().rank(dual_gid) }; + new_link.add_neighbor(dual); + + Bounds nbr_bounds = link->bounds(); // old block bounds + update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower); + new_link.add_bounds(nbr_bounds); + + new_link.add_wrap(diy::Direction()); // dual block cannot be wrapped + + if (lower) + { + diy::Direction right; + right[dim] = 1; + new_link.add_direction(right); + } else + { + diy::Direction left; + left[dim] = -1; + new_link.add_direction(left); + } + + // update the link; notice that this won't conflict with anything since + // reduce is using its own notion of the link constructed through the + // partners + link->swap(new_link); +} + +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const +{ + int lid = srp.master()->lid(srp.gid()); + RCLink* link = static_cast<RCLink*>(srp.master()->link(lid)); + + // determine split + float split = find_split(link->core(), link->bounds()); + + for (int i = 0; i < link->size(); ++i) + { + srp.enqueue(link->target(i), split); + srp.enqueue(link->target(i), link->direction(i)); + } +} + +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const +{ + // compute and enqueue local samples + Samples samples; + size_t points_size = 
(b->*points_).size(); + size_t n = std::min(points_size, samples_); + samples.reserve(n); + for (size_t i = 0; i < n; ++i) + { + float x = (b->*points_)[rand() % points_size][dim]; + samples.push_back(x); + } + + srp.enqueue(srp.out_link().target(0), samples); +} + +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const +{ + // dequeue and combine the samples + for (int i = 0; i < srp.in_link().size(); ++i) + { + int nbr_gid = srp.in_link().target(i).gid; + + Samples smpls; + srp.dequeue(nbr_gid, smpls); + for (size_t i = 0; i < smpls.size(); ++i) + samples.push_back(smpls[i]); + } +} + +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const +{ + srp.dequeue(srp.in_link().target(0).gid, samples); +} + +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const +{ + for (int i = 0; i < srp.out_link().size(); ++i) + srp.enqueue(srp.out_link().target(i), samples); +} + +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const +{ + int lid = srp.master()->lid(srp.gid()); + RCLink* link = static_cast<RCLink*>(srp.master()->link(lid)); + + int k = srp.out_link().size(); + + if (k == 0) // final round; nothing needs to be sent; this is actually redundant + return; + + // pick split points + float split = samples[0]; + + // subset and enqueue + std::vector< std::vector<Point> > out_points(srp.out_link().size()); + for (size_t i = 0; i < (b->*points_).size(); ++i) + { + float x = (b->*points_)[i][dim]; + int loc = x < split ? 
0 : 1; + out_points[loc].push_back((b->*points_)[i]); + } + int pos = -1; + for (int i = 0; i < k; ++i) + { + if (srp.out_link().target(i).gid == srp.gid()) + { + (b->*points_).swap(out_points[i]); + pos = i; + } + else + srp.enqueue(srp.out_link().target(i), out_points[i]); + } + if (pos == 0) + link->core().max[dim] = split; + else + link->core().min[dim] = split; +} + +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const +{ + int lid = srp.master()->lid(srp.gid()); + RCLink* link = static_cast<RCLink*>(srp.master()->link(lid)); + + for (int i = 0; i < srp.in_link().size(); ++i) + { + int nbr_gid = srp.in_link().target(i).gid; + if (nbr_gid == srp.gid()) + continue; + + std::vector<Point> in_points; + srp.dequeue(nbr_gid, in_points); + for (size_t j = 0; j < in_points.size(); ++j) + { + if (in_points[j][dim] < link->core().min[dim] || in_points[j][dim] > link->core().max[dim]) + { + fprintf(stderr, "Warning: dequeued %f outside [%f,%f] (%d)\n", + in_points[j][dim], link->core().min[dim], link->core().max[dim], dim); + std::abort(); + } + (b->*points_).push_back(in_points[j]); + } + } +} + +template<class Block, class Point> +void +diy::detail::KDTreeSamplingPartition<Block,Point>:: +update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const +{ + if (lower) + bounds.max[dim] = split; + else + bounds.min[dim] = split; +} + +template<class Block, class Point> +bool +diy::detail::KDTreeSamplingPartition<Block,Point>:: +intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const +{ + if (wrap) + { + if (x.min[dim] == domain.min[dim] && y.max[dim] == domain.max[dim]) + return true; + if (y.min[dim] == domain.min[dim] && x.max[dim] == domain.max[dim]) + return true; + } + return x.min[dim] <= y.max[dim] && y.min[dim] <= x.max[dim]; +} + +template<class Block, class Point> +float +diy::detail::KDTreeSamplingPartition<Block,Point>:: +find_split(const Bounds& changed, const Bounds& original) const +{ + for (int i = 0; i < dim_; ++i) + { + if (changed.min[i] != original.min[i]) + return changed.min[i]; + if (changed.max[i] != original.max[i]) + return changed.max[i]; + } + assert(0); + return -1; +} + +template<class Block, class Point> +diy::Direction +diy::detail::KDTreeSamplingPartition<Block,Point>:: +find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const +{ + diy::Direction wrap; + for (int i = 0; i < dim_; ++i) + { + if (bounds.min[i] == domain.min[i] && nbr_bounds.max[i] == domain.max[i]) + wrap[i] = -1; + if (bounds.max[i] == domain.max[i] && nbr_bounds.min[i] == domain.min[i]) + wrap[i] = 1; + } + return wrap; +} + + +#endif diff --git a/include/vtkdiy/detail/algorithms/kdtree.hpp b/include/vtkdiy/detail/algorithms/kdtree.hpp index 0bce13591778e1092a0b29e6db70a9b5d0a6f648..5e0c3ddfa1b5d5057e26f231d883055f8c6bb72d 100644 --- a/include/vtkdiy/detail/algorithms/kdtree.hpp +++ b/include/vtkdiy/detail/algorithms/kdtree.hpp @@ -30,6 +30,8 @@ struct KDTreePartition int divide_gid(int gid, bool lower, int round, int rounds) const; void update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const; void split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const; + diy::Direction + find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const; void compute_local_histogram(Block* b, const diy::ReduceProxy& 
srp, int dim) const; void add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const; @@ -59,8 +61,9 @@ struct diy::detail::KDTreePartners typedef diy::ContinuousBounds Bounds; KDTreePartners(int dim, int nblocks, bool wrap_, const Bounds& domain_): - histogram(1, nblocks, 2), - swap(1, nblocks, 2, false), + decomposer(1, interval(0,nblocks-1), nblocks), + histogram(decomposer, 2), + swap(decomposer, 2, false), wrap(wrap_), domain(domain_) { @@ -94,7 +97,7 @@ struct diy::detail::KDTreePartners inline bool active(int round, int gid, const diy::Master& m) const { - if (round == rounds()) + if (round == (int) rounds()) return true; else if (swap_round(round) && sub_round(round) < 0) // link round return true; @@ -106,7 +109,7 @@ struct diy::detail::KDTreePartners inline void incoming(int round, int gid, std::vector<int>& partners, const diy::Master& m) const { - if (round == rounds()) + if (round == (int) rounds()) link_neighbors(-1, gid, partners, m); else if (swap_round(round) && sub_round(round) < 0) // link round swap.incoming(sub_round(round - 1) + 1, gid, partners, m); @@ -125,7 +128,7 @@ struct diy::detail::KDTreePartners inline void outgoing(int round, int gid, std::vector<int>& partners, const diy::Master& m) const { - if (round == rounds()) + if (round == (int) rounds()) swap.outgoing(sub_round(round-1) + 1, gid, partners, m); else if (swap_round(round) && sub_round(round) < 0) // link round link_neighbors(-1, gid, partners, m); @@ -141,13 +144,16 @@ struct diy::detail::KDTreePartners diy::Link* link = m.link(lid); std::set<int> result; // partners must be unique - for (size_t i = 0; i < link->size(); ++i) + for (int i = 0; i < link->size(); ++i) result.insert(link->target(i).gid); for (std::set<int>::const_iterator it = result.begin(); it != result.end(); ++it) partners.push_back(*it); } + // 1-D domain to feed into histogram and swap + diy::RegularDecomposer<diy::DiscreteBounds> decomposer; + diy::RegularAllReducePartners histogram; diy::RegularSwapPartners swap; @@ -194,7 +200,7 @@ operator()(void* b_, const diy::ReduceProxy& srp, const KDTreePartners& partners } compute_local_histogram(b, srp, dim); - } else if (partners.sub_round(srp.round()) < partners.histogram.rounds()/2) + } else if (partners.sub_round(srp.round()) < (int) partners.histogram.rounds()/2) { Histogram histogram(bins_); add_histogram(b, srp, histogram); @@ -248,45 +254,41 @@ update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int roun srp.dequeue(in_gid, dir); // reverse dir - int j = 0; - while (dir >> (j + 1)) - ++j; - - if (j % 2 == 0) - dir = static_cast<diy::Direction>(dir << 1); - else - dir = static_cast<diy::Direction>(dir >> 1); + for (int j = 0; j < dim_; ++j) + dir[j] = -dir[j]; int k = link_map[std::make_pair(in_gid, dir)]; - //printf("%d %d %f -> %d\n", in_gid, dir, split, k); splits[k] = split; } } RCLink new_link(dim_, link->core(), link->core()); - diy::Direction left = static_cast<diy::Direction>(1 << 2*dim); - diy::Direction right = static_cast<diy::Direction>(1 << (2*dim + 1)); - bool lower = !(gid & (1 << (rounds - 1 - round))); // fill out the new link for (int i = 0; i < link->size(); ++i) { - diy::Direction dir = link->direction(i); - if (dir == left || dir == right) + diy::Direction dir = link->direction(i); + //diy::Direction wrap_dir = link->wrap(i); // we don't use existing wrap, but restore it from scratch + if (dir[dim] != 0) { - if ((dir == left && lower) || (dir == right && !lower)) + if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && 
!lower)) { - int nbr_gid = divide_gid(link->target(i).gid, dir != left, round, rounds); + int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds); diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) }; new_link.add_neighbor(nbr); new_link.add_direction(dir); Bounds bounds = link->bounds(i); - update_neighbor_bounds(bounds, splits[i], dim, dir != left); + update_neighbor_bounds(bounds, splits[i], dim, !lower); new_link.add_bounds(bounds); + + if (wrap) + new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain)); + else + new_link.add_wrap(diy::Direction()); } } else // non-aligned side { @@ -303,6 +305,11 @@ update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int roun new_link.add_neighbor(nbr); new_link.add_direction(dir); new_link.add_bounds(bounds); + + if (wrap) + new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain)); + else + new_link.add_wrap(diy::Direction()); } } } @@ -317,10 +324,19 @@ update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int roun update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower); new_link.add_bounds(nbr_bounds); + new_link.add_wrap(diy::Direction()); // dual block cannot be wrapped + if (lower) + { + diy::Direction right; + right[dim] = 1; new_link.add_direction(right); - else + } else + { + diy::Direction left; + left[dim] = -1; new_link.add_direction(left); + } // update the link; notice that this won't conflict with anything since // reduce is using its own notion of the link constructed through the @@ -339,7 +355,7 @@ split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const // determine split float split = find_split(link->core(), link->bounds()); - for (size_t i = 0; i < link->size(); ++i) + for (int i = 0; i < link->size(); ++i) { srp.enqueue(link->target(i), split); srp.enqueue(link->target(i), link->direction(i)); @@ -367,7 +383,7 @@ compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const std::cerr << loc << " " << x << " " << link->core().min[dim] << std::endl; std::abort(); } - if (loc >= bins_) + if (loc >= (int) bins_) loc = bins_ - 1; ++(histogram[loc]); } @@ -381,7 +397,7 @@ diy::detail::KDTreePartition<Block,Point>:: add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const { // dequeue and add up the histograms - for (unsigned i = 0; i < srp.in_link().size(); ++i) + for (int i = 0; i < srp.in_link().size(); ++i) { int nbr_gid = srp.in_link().target(i).gid; @@ -405,7 +421,7 @@ void diy::detail::KDTreePartition<Block,Point>:: forward_histogram(Block* b, const diy::ReduceProxy& srp, const Histogram& histogram) const { - for (unsigned i = 0; i < srp.out_link().size(); ++i) + for (int i = 0; i < srp.out_link().size(); ++i) srp.enqueue(srp.out_link().target(i), histogram); } @@ -429,9 +445,8 @@ enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Histogram //fprintf(stderr, "Histogram total: %lu\n", total); size_t cur = 0; - size_t last, next; float width = (link->core().max[dim] - link->core().min[dim])/bins_; - float split; + float split = 0; for (size_t i = 0; i < histogram.size(); ++i) { if (cur + histogram[i] > total/2) @@ -476,7 +491,7 @@ dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const int lid = srp.master()->lid(srp.gid()); RCLink* link = static_cast<RCLink*>(srp.master()->link(lid)); - for (unsigned i = 0; i < srp.in_link().size(); ++i) + for (int i = 0; i < srp.in_link().size(); ++i) { int nbr_gid = srp.in_link().target(i).gid; if 
(nbr_gid == srp.gid()) @@ -528,19 +543,32 @@ float diy::detail::KDTreePartition<Block,Point>:: find_split(const Bounds& changed, const Bounds& original) const { - diy::Direction dir = DIY_X0; for (int i = 0; i < dim_; ++i) { if (changed.min[i] != original.min[i]) return changed.min[i]; - dir = static_cast<diy::Direction>(dir << 1); if (changed.max[i] != original.max[i]) return changed.max[i]; - dir = static_cast<diy::Direction>(dir << 1); } assert(0); return -1; } +template<class Block, class Point> +diy::Direction +diy::detail::KDTreePartition<Block,Point>:: +find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const +{ + diy::Direction wrap; + for (int i = 0; i < dim_; ++i) + { + if (bounds.min[i] == domain.min[i] && nbr_bounds.max[i] == domain.max[i]) + wrap[i] = -1; + if (bounds.max[i] == domain.max[i] && nbr_bounds.min[i] == domain.min[i]) + wrap[i] = 1; + } + return wrap; +} + #endif diff --git a/include/vtkdiy/detail/algorithms/sort.hpp b/include/vtkdiy/detail/algorithms/sort.hpp index 49541957bbcab7c9ebdacfe6f61a2e4de946b301..90f8035057cf22d4b79dd4756aaa7449918b7cb9 100644 --- a/include/vtkdiy/detail/algorithms/sort.hpp +++ b/include/vtkdiy/detail/algorithms/sort.hpp @@ -99,7 +99,7 @@ struct SampleSort<Block,T,Cmp>::Sampler if (k_in == 0) { // draw random samples - for (int i = 0; i < num_samples; ++i) + for (size_t i = 0; i < num_samples; ++i) samples.push_back((b->*values)[std::rand() % (b->*values).size()]); } else dequeue_values(samples, srp, false); @@ -110,7 +110,7 @@ struct SampleSort<Block,T,Cmp>::Sampler std::sort(samples.begin(), samples.end(), cmp); std::vector<T> subsamples(srp.nblocks() - 1); int step = samples.size() / srp.nblocks(); // NB: subsamples.size() + 1 - for (int i = 0; i < subsamples.size(); ++i) + for (size_t i = 0; i < subsamples.size(); ++i) subsamples[i] = samples[(i+1)*step]; (b->*dividers).swap(subsamples); } diff --git a/include/vtkdiy/detail/reduce/all-to-all.hpp b/include/vtkdiy/detail/reduce/all-to-all.hpp index 88bfb877e64aa13b9363e7732b4a84c519600980..c49b7d28cdb56b2ab558c62e85b7cb16433b2c6d 100644 --- a/include/vtkdiy/detail/reduce/all-to-all.hpp +++ b/include/vtkdiy/detail/reduce/all-to-all.hpp @@ -154,7 +154,7 @@ namespace detail SkipIntermediate(size_t rounds_): rounds(rounds_) {} - bool operator()(int round, int, const Master&) const { if (round == 0 || round == rounds) return false; return true; } + bool operator()(int round, int, const Master&) const { if (round == 0 || round == (int) rounds) return false; return true; } size_t rounds; }; diff --git a/include/vtkdiy/io/block.hpp b/include/vtkdiy/io/block.hpp index 4161d59209218d4690fba27fef50b9b35811759e..4667a32101a988a839ad39e1591d99312ef262af 100644 --- a/include/vtkdiy/io/block.hpp +++ b/include/vtkdiy/io/block.hpp @@ -174,65 +174,45 @@ namespace io * \ingroup IO * \brief Read blocks from storage collectively from one shared file */ - inline - void - read_blocks(const std::string& infilename, //!< input file name - const mpi::communicator& comm, //!< communicator - Assigner& assigner, //!< assigner object - Master& master, //!< master object - MemoryBuffer& extra, //!< user-defined metadata in file header - Master::LoadBlock load = 0) //!< load block function in case different than or unefined in the master - { - if (!load) load = master.loader(); // load is likely to be different from master.load() + inline + void + read_blocks(const std::string& infilename, //!< input file name + const mpi::communicator& comm, //!< communicator + Assigner& assigner, //!< 
assigner object + Master& master, //!< master object + MemoryBuffer& extra, //!< user-defined metadata in file header + Master::LoadBlock load = 0) //!< load block function in case different than or unefined in the master + { + if (!load) load = master.loader(); // load is likely to be different from master.load() - typedef detail::offset_t offset_t; - typedef detail::GidOffsetCount GidOffsetCount; + typedef detail::offset_t offset_t; + typedef detail::GidOffsetCount GidOffsetCount; - mpi::io::file f(comm, infilename, mpi::io::file::rdonly); + mpi::io::file f(comm, infilename, mpi::io::file::rdonly); - offset_t footer_offset = f.size() - sizeof(size_t); - size_t footer_size; + offset_t footer_offset = f.size() - sizeof(size_t); + size_t footer_size; - // Read the size - try - { + // Read the size f.read_at_all(footer_offset, (char*) &footer_size, sizeof(footer_size)); - } - catch(...) - { - fprintf(stderr, "Error reading footer size\n"); - throw(1); - } - // Read all_offset_counts - footer_offset -= footer_size; - MemoryBuffer footer; - try - { + // Read all_offset_counts + footer_offset -= footer_size; + MemoryBuffer footer; footer.buffer.resize(footer_size); f.read_at_all(footer_offset, footer.buffer); - } - catch(...) - { - fprintf(stderr, "Error reading footer\n"); - throw(1); - } - std::vector<GidOffsetCount> all_offset_counts; - diy::load(footer, all_offset_counts); - diy::load(footer, extra); - extra.reset(); + std::vector<GidOffsetCount> all_offset_counts; + diy::load(footer, all_offset_counts); + diy::load(footer, extra); + extra.reset(); - // Get local gids from assigner - size_t size = all_offset_counts.size(); - assigner.set_nblocks(size); - std::vector<int> gids; - assigner.local_gids(comm.rank(), gids); + // Get local gids from assigner + size_t size = all_offset_counts.size(); + assigner.set_nblocks(size); + std::vector<int> gids; + assigner.local_gids(comm.rank(), gids); - // Read our blocks; - // TODO: use collective IO, when possible - try - { for (unsigned i = 0; i < gids.size(); ++i) { if (gids[i] != all_offset_counts[gids[i]].gid) @@ -251,12 +231,6 @@ namespace io master.add(gids[i], b, l); } } - catch(...) 
- { - fprintf(stderr, "Error reading blocks\n"); - throw(1); - } - } // Functions without the extra buffer, for compatibility with the old code diff --git a/include/vtkdiy/io/bov.hpp b/include/vtkdiy/io/bov.hpp index f4ead18bff72a87cfb70283564ec727f44f63976..cb059d0317a43fa51fa776442e7e481752600df2 100644 --- a/include/vtkdiy/io/bov.hpp +++ b/include/vtkdiy/io/bov.hpp @@ -74,7 +74,7 @@ read(const DiscreteBounds& bounds, T* buffer, bool collective, int chunk) const int dim = shape_.size(); int total = 1; std::vector<int> subsizes; - for (unsigned i = 0; i < dim; ++i) + for (int i = 0; i < dim; ++i) { subsizes.push_back(bounds.max[i] - bounds.min[i] + 1); total *= subsizes.back(); @@ -88,15 +88,15 @@ read(const DiscreteBounds& bounds, T* buffer, bool collective, int chunk) const // create an MPI struct of size chunk to read the data in those chunks // (this allows to work around MPI-IO weirdness where crucial quantities // are ints, which are too narrow of a type) - const int array_of_blocklengths[] = { chunk }; - const MPI_Aint array_of_displacements[] = { 0 }; - const MPI_Datatype array_of_types[] = { mpi::detail::get_mpi_datatype<T>() }; + int array_of_blocklengths[] = { chunk }; + MPI_Aint array_of_displacements[] = { 0 }; + MPI_Datatype array_of_types[] = { mpi::detail::get_mpi_datatype<T>() }; MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type); MPI_Type_commit(&T_type); } MPI_Datatype fileblk; - MPI_Type_create_subarray(dim, &shape_[0], &subsizes[0], &bounds.min[0], MPI_ORDER_C, T_type, &fileblk); + MPI_Type_create_subarray(dim, (int*) &shape_[0], &subsizes[0], (int*) &bounds.min[0], MPI_ORDER_C, T_type, &fileblk); MPI_Type_commit(&fileblk); MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, "native", MPI_INFO_NULL); @@ -128,7 +128,7 @@ write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core, int dim = shape_.size(); std::vector<int> subsizes; std::vector<int> buffer_shape, buffer_start; - for (unsigned i = 0; i < dim; ++i) + for (int i = 0; i < dim; ++i) { buffer_shape.push_back(bounds.max[i] - bounds.min[i] + 1); buffer_start.push_back(core.min[i] - bounds.min[i]); @@ -141,16 +141,16 @@ write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core, else { // assume T is a binary block and create an MPI struct of appropriate size - const int array_of_blocklengths[] = { chunk }; - const MPI_Aint array_of_displacements[] = { 0 }; - const MPI_Datatype array_of_types[] = { mpi::detail::get_mpi_datatype<T>() }; + int array_of_blocklengths[] = { chunk }; + MPI_Aint array_of_displacements[] = { 0 }; + MPI_Datatype array_of_types[] = { mpi::detail::get_mpi_datatype<T>() }; MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type); MPI_Type_commit(&T_type); } MPI_Datatype fileblk, subbuffer; - MPI_Type_create_subarray(dim, &shape_[0], &subsizes[0], &bounds.min[0], MPI_ORDER_C, T_type, &fileblk); - MPI_Type_create_subarray(dim, &buffer_shape[0], &subsizes[0], &buffer_start[0], MPI_ORDER_C, T_type, &subbuffer); + MPI_Type_create_subarray(dim, (int*) &shape_[0], &subsizes[0], (int*) &bounds.min[0], MPI_ORDER_C, T_type, &fileblk); + MPI_Type_create_subarray(dim, (int*) &buffer_shape[0], &subsizes[0], (int*) &buffer_start[0], MPI_ORDER_C, T_type, &subbuffer); MPI_Type_commit(&fileblk); MPI_Type_commit(&subbuffer); diff --git a/include/vtkdiy/io/numpy.hpp b/include/vtkdiy/io/numpy.hpp index 
ae295dc26da263165531bd9e8ec5a0eef9ec8da8..0199a0c38f4f1ea23c79661578036074dfab1fa2 100644 --- a/include/vtkdiy/io/numpy.hpp +++ b/include/vtkdiy/io/numpy.hpp @@ -143,7 +143,7 @@ write_header(const S& shape) convert_and_save(dict, sizeof(T)); save(dict, "', 'fortran_order': False, 'shape': ("); convert_and_save(dict, shape[0]); - for (int i = 1; i < shape.size(); i++) + for (int i = 1; i < (int) shape.size(); i++) { save(dict, ", "); convert_and_save(dict, shape[i]); @@ -174,7 +174,8 @@ char diy::io::detail::big_endian() { unsigned char x[] = {1,0}; - short y = *(short*) x; + void* x_void = x; + short y = *static_cast<short*>(x_void); return y == 1 ? '<' : '>'; } diff --git a/include/vtkdiy/link.hpp b/include/vtkdiy/link.hpp index 16f64ec4c0c369e82e977362df6b0ab4e1e96347..3262eef610a8b3f5e66649c5379adf6eb075ffb2 100644 --- a/include/vtkdiy/link.hpp +++ b/include/vtkdiy/link.hpp @@ -77,19 +77,21 @@ namespace diy typedef std::vector<Direction> DirVec; public: - RegularLink(int dim, const Bounds& core, const Bounds& bounds, Direction wrap = Direction(0)): - dim_(dim), wrap_(wrap), core_(core), bounds_(bounds) {} + RegularLink(int dim, const Bounds& core, const Bounds& bounds): + dim_(dim), core_(core), bounds_(bounds) {} - int dimension() const { return dim_; } + // dimension + int dimension() const { return dim_; } // direction - int direction(Direction dir) const; // convert direction to a neighbor (-1 if no neighbor) - Direction direction(int i) const { return dir_vec_[i]; } - void add_direction(Direction dir) { int c = dir_map_.size(); dir_map_[dir] = c; dir_vec_.push_back(dir); } + int direction(Direction dir) const; // convert direction to a neighbor (-1 if no neighbor) + Direction direction(int i) const { return dir_vec_[i]; } + void add_direction(Direction dir) { int c = dir_map_.size(); dir_map_[dir] = c; dir_vec_.push_back(dir); } // wrap - void add_wrap(Direction dir) { wrap_ = static_cast<Direction>(wrap_ | dir); } - Direction wrap() const { return wrap_; } + void add_wrap(Direction dir) { wrap_.push_back(dir); } + Direction wrap(int i) const { return wrap_[i]; } + Direction& wrap(int i) { return wrap_[i]; } // bounds const Bounds& core() const { return core_; } @@ -99,7 +101,7 @@ namespace diy const Bounds& bounds(int i) const { return nbr_bounds_[i]; } void add_bounds(const Bounds& bounds) { nbr_bounds_.push_back(bounds); } - void swap(RegularLink& other) { Link::swap(other); dir_map_.swap(other.dir_map_); dir_vec_.swap(other.dir_vec_); nbr_bounds_.swap(other.nbr_bounds_); std::swap(dim_, other.dim_); std::swap(wrap_, other.wrap_); std::swap(core_, other.core_); std::swap(bounds_, other.bounds_); } + void swap(RegularLink& other) { Link::swap(other); dir_map_.swap(other.dir_map_); dir_vec_.swap(other.dir_vec_); nbr_bounds_.swap(other.nbr_bounds_); std::swap(dim_, other.dim_); wrap_.swap(other.wrap_); std::swap(core_, other.core_); std::swap(bounds_, other.bounds_); } void save(BinaryBuffer& bb) const { @@ -107,10 +109,10 @@ namespace diy diy::save(bb, dim_); diy::save(bb, dir_map_); diy::save(bb, dir_vec_); - diy::save(bb, wrap_); diy::save(bb, core_); diy::save(bb, bounds_); diy::save(bb, nbr_bounds_); + diy::save(bb, wrap_); } void load(BinaryBuffer& bb) @@ -119,10 +121,10 @@ namespace diy diy::load(bb, dim_); diy::load(bb, dir_map_); diy::load(bb, dir_vec_); - diy::load(bb, wrap_); diy::load(bb, core_); diy::load(bb, bounds_); diy::load(bb, nbr_bounds_); + diy::load(bb, wrap_); } virtual size_t id() const { return RegularLinkSelector<Bounds>::id; } @@ -132,11 +134,11 @@ 
namespace diy DirMap dir_map_; DirVec dir_vec_; - Direction wrap_; - Bounds core_; - Bounds bounds_; - std::vector<Bounds> nbr_bounds_; + Bounds core_; + Bounds bounds_; + std::vector<Bounds> nbr_bounds_; + std::vector<Direction> wrap_; }; // Other cover candidates: KDTreeLink, AMRGridLink diff --git a/include/vtkdiy/master.hpp b/include/vtkdiy/master.hpp index 65c2baddef0deba2028b02ca7fd26acfe7b85c2f..066763503772bc0d4e9ac56219dfa2de60511d6f 100644 --- a/include/vtkdiy/master.hpp +++ b/include/vtkdiy/master.hpp @@ -206,6 +206,8 @@ namespace diy int threads() const { return threads_; } int in_memory() const { return *blocks_.in_memory().const_access(); } + void set_threads(int threads) { threads_ = threads; } + CreateBlock creator() const { return blocks_.creator(); } DestroyBlock destroyer() const { return blocks_.destroyer(); } LoadBlock loader() const { return blocks_.loader(); } @@ -256,6 +258,7 @@ namespace diy void set_expected(int expected) { expected_ = expected; } void add_expected(int i) { expected_ += i; } int expected() const { return expected_; } + void replace_link(int i, Link* link) { expected_ -= links_[i]->size_unique(); delete links_[i]; links_[i] = link; expected_ += links_[i]->size_unique(); } public: // Communicator functionality diff --git a/include/vtkdiy/mpi/collectives.hpp b/include/vtkdiy/mpi/collectives.hpp index fd7e558208509a915df69698c609e7b8c13d1215..d1cc0bdd80896e5c2b13e0f67e571f1e219e33b4 100644 --- a/include/vtkdiy/mpi/collectives.hpp +++ b/include/vtkdiy/mpi/collectives.hpp @@ -135,10 +135,10 @@ namespace mpi out.resize(comm.size()); size_t cur = 0; - for (unsigned i = 0; i < comm.size(); ++i) + for (int i = 0; i < comm.size(); ++i) { out[i].reserve(counts[i]); - for (unsigned j = 0; j < counts[i]; ++j) + for (int j = 0; j < counts[i]; ++j) out[i].push_back(buffer[cur++]); } } @@ -173,6 +173,17 @@ namespace mpi comm); } + static void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op&) + { + out.resize(in.size()); + MPI_Allreduce(Datatype::address(const_cast<T&>(in[0])), + Datatype::address(out[0]), + in.size(), + Datatype::datatype(), + detail::mpi_op<Op>::get(), + comm); + } + static void scan(const communicator& comm, const T& in, T& out, const Op&) { MPI_Scan(Datatype::address(const_cast<T&>(in)), @@ -274,6 +285,13 @@ namespace mpi Collectives<T, Op>::all_reduce(comm, in, out, op); } + //! Same as above, but for vectors. + template<class T, class Op> + void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op& op) + { + Collectives<T, Op>::all_reduce(comm, in, out, op); + } + //! 
scan template<class T, class Op> void scan(const communicator& comm, const T& in, T& out, const Op& op) diff --git a/include/vtkdiy/mpi/optional.hpp b/include/vtkdiy/mpi/optional.hpp index 973a65f7d22d4de12ce606bc7bed9d5a9e17e0de..ab58aaf810816b67624f92b65cc29134704ed6e3 100644 --- a/include/vtkdiy/mpi/optional.hpp +++ b/include/vtkdiy/mpi/optional.hpp @@ -21,14 +21,17 @@ namespace mpi operator bool() const { return init_; } - T& operator*() { return *reinterpret_cast<T*>(buf_); } - const T& operator*() const { return *reinterpret_cast<const T*>(buf_); } + T& operator*() { return *static_cast<T*>(address()); } + const T& operator*() const { return *static_cast<const T*>(address()); } T* operator->() { return &(operator*()); } const T* operator->() const { return &(operator*()); } private: - void clear() { reinterpret_cast<T*>(buf_)->~T(); } + void clear() { static_cast<T*>(address())->~T(); } + + void* address() { return buf_; } + const void* address() const { return buf_; } private: bool init_; diff --git a/include/vtkdiy/partners/all-reduce.hpp b/include/vtkdiy/partners/all-reduce.hpp index d51f5507bf7916997213efd4973bb90d5c8af273..e340665951f970e50445c3a06855836a3d68cc9a 100644 --- a/include/vtkdiy/partners/all-reduce.hpp +++ b/include/vtkdiy/partners/all-reduce.hpp @@ -24,12 +24,12 @@ struct RegularAllReducePartners: public RegularMergePartners //! contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion; //! contiguous is useful when data needs to be united; //! round-robin is useful for vector-"halving" - RegularAllReducePartners(int dim, //!< dimensionality of regular block grid - int nblocks, //!< total number of blocks - int k, //!< target k value - bool contiguous = true //!< distance doubling (true) or halving (false) + template<class Decomposer> + RegularAllReducePartners(const Decomposer& decomposer, //!< domain decomposition + int k, //!< target k value + bool contiguous = true //!< distance doubling (true) or halving (false) ): - Parent(dim, nblocks, k, contiguous) {} + Parent(decomposer, k, contiguous) {} RegularAllReducePartners(const DivisionVector& divs,//!< explicit division vector const KVSVector& kvs, //!< explicit k vector bool contiguous = true //!< distance doubling (true) or halving (false) @@ -45,12 +45,12 @@ struct RegularAllReducePartners: public RegularMergePartners //! returns whether a given block in a given round has dropped out of the merge yet or not inline bool active(int round, int gid, const Master& m) const { return Parent::active(parent_round(round), gid, m); } //! returns what the current round would be in the first or second parent merge reduction - int parent_round(int round) const { return round < Parent::rounds() ? round : rounds() - round; } + int parent_round(int round) const { return round < (int) Parent::rounds() ? 
round : rounds() - round; } // incoming is only valid for an active gid; it will only be called with an active gid inline void incoming(int round, int gid, std::vector<int>& partners, const Master& m) const { - if (round <= Parent::rounds()) + if (round <= (int) Parent::rounds()) Parent::incoming(round, gid, partners, m); else Parent::outgoing(parent_round(round), gid, partners, m); @@ -58,7 +58,7 @@ struct RegularAllReducePartners: public RegularMergePartners inline void outgoing(int round, int gid, std::vector<int>& partners, const Master& m) const { - if (round < Parent::rounds()) + if (round < (int) Parent::rounds()) Parent::outgoing(round, gid, partners, m); else Parent::incoming(parent_round(round), gid, partners, m); diff --git a/include/vtkdiy/partners/common.hpp b/include/vtkdiy/partners/common.hpp index 496a4483fb313d101549f7bad59e91d4504492fd..43f8297a0fe9c280661df4026f0e4673b8743515 100644 --- a/include/vtkdiy/partners/common.hpp +++ b/include/vtkdiy/partners/common.hpp @@ -19,17 +19,18 @@ struct RegularPartners int size; // group size }; - // The part of RegularDecomposer that we need works the same with either Bounds (so we fix them arbitrarily) - typedef DiscreteBounds Bounds; - typedef RegularDecomposer<Bounds> Decomposer; - typedef std::vector<int> CoordVector; typedef std::vector<int> DivisionVector; typedef std::vector<DimK> KVSVector; - RegularPartners(int dim, int nblocks, int k, bool contiguous = true): - divisions_(dim, 0), - contiguous_(contiguous) { Decomposer::fill_divisions(dim, nblocks, divisions_); factor(k, divisions_, kvs_); fill_steps(); } + // The part of RegularDecomposer that we need works the same with either Bounds (so we fix them arbitrarily) + typedef DiscreteBounds Bounds; + typedef RegularDecomposer<Bounds> Decomposer; + + template<class Decomposer_> + RegularPartners(const Decomposer_& decomposer, int k, bool contiguous = true): + divisions_(decomposer.divisions), + contiguous_(contiguous) { factor(k, divisions_, kvs_); fill_steps(); } RegularPartners(const DivisionVector& divs, const KVSVector& kvs, bool contiguous = true): @@ -73,7 +74,7 @@ fill_steps() { std::vector<int> cur_steps(divisions().size(), 1); - for (int r = 0; r < rounds(); ++r) + for (size_t r = 0; r < rounds(); ++r) { steps_.push_back(cur_steps[kvs_[r].dim]); cur_steps[kvs_[r].dim] *= kvs_[r].size; @@ -81,7 +82,7 @@ fill_steps() } else { std::vector<int> cur_steps(divisions().begin(), divisions().end()); - for (int r = 0; r < rounds(); ++r) + for (size_t r = 0; r < rounds(); ++r) { cur_steps[kvs_[r].dim] /= kvs_[r].size; steps_.push_back(cur_steps[kvs_[r].dim]); @@ -152,7 +153,7 @@ factor(int k, const DivisionVector& divisions, KVSVector& kvs) bool changed = false; for (unsigned i = 0; i < divisions.size(); ++i) { - if (round_per_dim[i] == tmp_kvs[i].size()) + if (round_per_dim[i] == (int) tmp_kvs[i].size()) continue; kvs.push_back(DimK(i, tmp_kvs[i][round_per_dim[i]++])); changed = true; diff --git a/include/vtkdiy/partners/merge.hpp b/include/vtkdiy/partners/merge.hpp index 3c4cfa134dee3be8e42fca5898c800024d6d06e7..c6be4253318c2e788a5dd3dd62aec1c78d5e9988 100644 --- a/include/vtkdiy/partners/merge.hpp +++ b/include/vtkdiy/partners/merge.hpp @@ -20,12 +20,12 @@ struct RegularMergePartners: public RegularPartners // contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion; // contiguous is useful when data needs to be united; // round-robin is useful for vector-"halving" - RegularMergePartners(int dim, //!< dimensionality of regular block 
grid - int nblocks, //!< total number of blocks - int k, //!< target k value - bool contiguous = true //!< distance doubling (true) or halving (false) + template<class Decomposer> + RegularMergePartners(const Decomposer& decomposer, //!< domain decomposition + int k, //!< target k value + bool contiguous = true //!< distance doubling (true) or halving (false) ): - Parent(dim, nblocks, k, contiguous) {} + Parent(decomposer, k, contiguous) {} RegularMergePartners(const DivisionVector& divs, //!< explicit division vector const KVSVector& kvs, //!< explicit k vector bool contiguous = true //!< distance doubling (true) or halving (false) diff --git a/include/vtkdiy/partners/swap.hpp b/include/vtkdiy/partners/swap.hpp index f06b09e3746e7b016da977789c9ca3c3a90c81ec..cc3b3e4941abee651f67f95a7ff36d35d61f5466 100644 --- a/include/vtkdiy/partners/swap.hpp +++ b/include/vtkdiy/partners/swap.hpp @@ -20,12 +20,12 @@ struct RegularSwapPartners: public RegularPartners // contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion; // contiguous is useful when data needs to be united; // round-robin is useful for vector-"halving" - RegularSwapPartners(int dim, //!< dimensionality of regular block grid - int nblocks, //!< total number of blocks - int k, //!< target k value - bool contiguous = true //!< distance halving (true) or doubling (false) + template<class Decomposer> + RegularSwapPartners(const Decomposer& decomposer, //!< domain decomposition + int k, //!< target k value + bool contiguous = true //!< distance halving (true) or doubling (false) ): - Parent(dim, nblocks, k, contiguous) {} + Parent(decomposer, k, contiguous) {} RegularSwapPartners(const DivisionVector& divs, //!< explicit division vector const KVSVector& kvs, //!< explicit k vector bool contiguous = true //!< distance halving (true) or doubling (false) diff --git a/include/vtkdiy/pick.hpp b/include/vtkdiy/pick.hpp index f6aea2c2277facb3488a5dba6554514c2699663b..f884157b2039f7ff28835c0b52d8c5d5836a1bf8 100644 --- a/include/vtkdiy/pick.hpp +++ b/include/vtkdiy/pick.hpp @@ -15,19 +15,14 @@ namespace diy template<class Point> float distance(int dim, const ContinuousBounds& bounds, const Point& p); - template<class Bounds> - void wrap_bounds(Bounds& bounds, int wrap_dir, const Bounds& domain, int dim); + inline + float distance(int dim, const ContinuousBounds& bounds1, const ContinuousBounds& bounds2); - namespace detail - { - template<class Point> - void shift(float new_pt[DIY_MAX_DIM], const Point& p, float r, Direction dir, int dim); - } + template<class Bounds> + void wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim); } -//! Finds the neighbors within radius r of a target point. Assumptions: -//! 1. Point p needs to be in the current block -//! 2. Only for a regular decomposition +//! Finds the neighbors within radius r of a target point. 
template<class Bounds, class Point, class OutIter> void diy:: @@ -37,8 +32,6 @@ near(const RegularLink<Bounds>& link, //!< neighbors OutIter out, //!< insert iterator for output set of neighbors const Bounds& domain) //!< global domain bounds { - int d; // current dimension - float new_pt[DIY_MAX_DIM]; // offset point Bounds neigh_bounds; // neighbor block bounds // for all neighbors of this block @@ -46,26 +39,10 @@ near(const RegularLink<Bounds>& link, //!< neighbors { // wrap neighbor bounds, if necessary, otherwise bounds will be unchanged neigh_bounds = link.bounds(n); - wrap_bounds(neigh_bounds, link.wrap() & link.direction(n), domain, link.dimension()); + wrap_bounds(neigh_bounds, link.wrap(n), domain, link.dimension()); - detail::shift(new_pt, p, r, link.direction(n), link.dimension()); - - // check if neighbor is near enough - for (d = 0; d < link.dimension(); d++) - { - // if shifted point did not move into or past the neighbor, - // break and proceed to next neighbor - // note dist can be large enough to shift the point beyond the neighbor - // that means the point was definitely near enough to neighbor - if (((link.direction(n) & (1 << (2*d + 1))) && new_pt[d] < neigh_bounds.min[d]) || - ((link.direction(n) & (1 << (2*d))) && new_pt[d] > neigh_bounds.max[d])) - break; - } - - if (d < link.dimension()) - continue; // next neighbor - - *out++ = n; + if (distance(link.dimension(), neigh_bounds, p) <= r) + *out++ = n; } // for all neighbors } @@ -92,8 +69,31 @@ distance(int dim, const ContinuousBounds& bounds, const Point& p) return sqrt(res); } -//! Finds the neighbor(s) containing the target point. Assumptions: -//! 1. Only for a regular decomposition +float +diy:: +distance(int dim, const ContinuousBounds& bounds1, const ContinuousBounds& bounds2) +{ + float res = 0; + for (int i = 0; i < dim; ++i) + { + float diff = 0, d; + + float d1 = bounds1.max[i] - bounds2.min[i]; + float d2 = bounds2.max[i] - bounds1.min[i]; + + if (d1 > 0 && d2 > 0) + diff = 0; + else if (d1 <= 0) + diff = -d1; + else if (d2 <= 0) + diff = -d2; + + res += diff*diff; + } + return sqrt(res); +} + +//! Finds the neighbor(s) containing the target point. 
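// The rewritten near() above and in() below both collapse the old per-dimension
// shift/containment tests into a single distance check against the (possibly wrapped)
// neighbor bounds. A minimal usage sketch, assuming a block that stores 3-d points and
// a RegularContinuousLink set up by the decomposer; the Block type and the enqueue_near
// callback are illustrative names only, and the diy master, link, and pick headers are
// assumed to be included.

#include <array>
#include <iterator>
#include <vector>

struct Block { std::vector< std::array<float,3> > points; };

void enqueue_near(Block* b, const diy::Master::ProxyWithLink& cp,
                  const diy::ContinuousBounds& domain, float r)
{
  diy::RegularContinuousLink* l = static_cast<diy::RegularContinuousLink*>(cp.link());
  for (size_t i = 0; i < b->points.size(); ++i)
  {
    std::vector<int> nbrs;                                               // link-local neighbor indices
    diy::near(*l, b->points[i], r, std::back_inserter(nbrs), domain);    // neighbors whose bounds lie within r of the point
    for (size_t j = 0; j < nbrs.size(); ++j)
      cp.enqueue(l->target(nbrs[j]), b->points[i]);                      // send the point to each such block
  }
}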
template<class Bounds, class Point, class OutIter> void diy:: @@ -102,7 +102,6 @@ in(const RegularLink<Bounds>& link, //!< neighbors OutIter out, //!< insert iterator for output set of neighbors const Bounds& domain) //!< global domain bounds { - int d; // current dimension Bounds neigh_bounds; // neighbor block bounds // for all neighbors of this block @@ -110,19 +109,10 @@ in(const RegularLink<Bounds>& link, //!< neighbors { // wrap neighbor bounds, if necessary, otherwise bounds will be unchanged neigh_bounds = link.bounds(n); - wrap_bounds(neigh_bounds, link.wrap() & link.direction(n), domain, link.dimension()); - - // check if p is in neighbor - for (d = 0; d < link.dimension(); d++) - { - if (p[d] < neigh_bounds.min[d] || p[d] > neigh_bounds.max[d]) - break; - } + wrap_bounds(neigh_bounds, link.wrap(n), domain, link.dimension()); - if (d < link.dimension()) - continue; // next neighbor - - *out++ = n; + if (distance(link.dimension(), neigh_bounds, p) == 0) + *out++ = n; } // for all neighbors } @@ -132,75 +122,13 @@ in(const RegularLink<Bounds>& link, //!< neighbors template<class Bounds> void diy:: -wrap_bounds(Bounds& bounds, int wrap_dir, const Bounds& domain, int dim) +wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim) { - // wrapping toward the left transforms block bounds to the left, and vice versa - if (dim > 0 && (wrap_dir & DIY_X0) == DIY_X0) - { - bounds.min[0] -= (domain.max[0] - domain.min[0]); - bounds.max[0] -= (domain.max[0] - domain.min[0]); - } - if (dim > 0 && (wrap_dir & DIY_X1) == DIY_X1) - { - bounds.min[0] += (domain.max[0] - domain.min[0]); - bounds.max[0] += (domain.max[0] - domain.min[0]); - } - - if (dim > 1 && (wrap_dir & DIY_Y0) == DIY_Y0) - { - bounds.min[1] -= (domain.max[1] - domain.min[1]); - bounds.max[1] -= (domain.max[1] - domain.min[1]); - } - if (dim > 1 && (wrap_dir & DIY_Y1) == DIY_Y1) - { - bounds.min[1] += (domain.max[1] - domain.min[1]); - bounds.max[1] += (domain.max[1] - domain.min[1]); - } - - if (dim > 2 && (wrap_dir & DIY_Z0) == DIY_Z0) + for (int i = 0; i < dim; ++i) { - bounds.min[2] -= (domain.max[2] - domain.min[2]); - bounds.max[2] -= (domain.max[2] - domain.min[2]); + bounds.min[i] += wrap_dir[i] * (domain.max[i] - domain.min[i]); + bounds.max[i] += wrap_dir[i] * (domain.max[i] - domain.min[i]); } - if (dim > 2 && (wrap_dir & DIY_Z1) == DIY_Z1) - { - bounds.min[2] += (domain.max[2] - domain.min[2]); - bounds.max[2] += (domain.max[2] - domain.min[2]); - } - - if (dim > 3 && (wrap_dir & DIY_T0) == DIY_T0) - { - bounds.min[3] -= (domain.max[3] - domain.min[3]); - bounds.max[3] -= (domain.max[3] - domain.min[3]); - } - if (dim > 3 && (wrap_dir & DIY_T1) == DIY_T1) - { - bounds.min[3] += (domain.max[3] - domain.min[3]); - bounds.max[3] += (domain.max[3] - domain.min[3]); - } -} - -template<class Point> -void -diy::detail:: -shift(float new_pt[DIY_MAX_DIM], const Point& p, float r, Direction dir, int dim) -{ - if (dim > 0 && (dir & DIY_X0)) - new_pt[0] = p[0] - r; - if (dim > 0 && (dir & DIY_X1)) - new_pt[0] = p[0] + r; - if (dim > 1 && (dir & DIY_Y0)) - new_pt[1] = p[1] - r; - if (dim > 1 && (dir & DIY_Y1)) - new_pt[1] = p[1] + r; - if (dim > 2 && (dir & DIY_Z0)) - new_pt[2] = p[2] - r; - if (dim > 2 && (dir & DIY_Z1)) - new_pt[2] = p[2] + r; - if (dim > 3 && (dir & DIY_T0)) - new_pt[3] = p[3] - r; - if (dim > 3 && (dir & DIY_T1)) - new_pt[3] = p[3] + r; } diff --git a/include/vtkdiy/proxy.hpp b/include/vtkdiy/proxy.hpp index 5ea2d4631845f9094872e8652e633fe0a20cb1d2..0160e060567f613906cff294a1e4b4ef52803f01 
100644 --- a/include/vtkdiy/proxy.hpp +++ b/include/vtkdiy/proxy.hpp @@ -19,7 +19,7 @@ namespace diy int gid() const { return gid_; } - //! Enqueue data whose size can be determined automatically + //! Enqueue data whose size can be determined automatically, e.g., an STL vector. template<class T> void enqueue(const BlockID& to, //!< target block (gid,proc) const T& x, //!< data (eg. STL vector) @@ -27,7 +27,7 @@ namespace diy ) const { OutgoingQueues& out = *outgoing_; save(out[to], x); } - //! Enqueue an array of data whose size is given explicitly + //! Enqueue data whose size is given explicitly by the user, e.g., an array. template<class T> void enqueue(const BlockID& to, //!< target block (gid,proc) const T* x, //!< pointer to the data (eg. address of start of vector) @@ -35,8 +35,9 @@ namespace diy void (*save)(BinaryBuffer&, const T&) = &::diy::save<T> //!< optional serialization function ) const; - //! Dequeue data whose size can be determined automatically. - //! Diy will allocate the receive buffer. + //! Dequeue data whose size can be determined automatically (e.g., STL vector) and that was + //! previously enqueued so that diy knows its size when it is received. + //! In this case, diy will allocate the receive buffer; the user does not need to do so. template<class T> void dequeue(int from, //!< target block gid T& x, //!< data (eg. STL vector) @@ -44,8 +45,8 @@ namespace diy ) const { IncomingQueues& in = *incoming_; load(in[from], x); } - //! Dequeue an array of data whose size is given explicitly. - //! The user needs to allocate the receive buffer. + //! Dequeue an array of data whose size is given explicitly by the user. + //! In this case, the user needs to allocate the receive buffer prior to calling dequeue. template<class T> void dequeue(int from, //!< target block gid T* x, //!< pointer to the data (eg. address of start of vector) @@ -65,17 +66,37 @@ namespace diy OutgoingQueues* outgoing() const { return outgoing_; } MemoryBuffer& outgoing(const BlockID& to) const { return (*outgoing_)[to]; } - //! Post an all-reduce collective +/** + * \ingroup Communication + * \brief Post an all-reduce collective using an existing communication proxy. + * Available operators are: + * maximum<T>, minimum<T>, std::plus<T>, std::multiplies<T>, std::logical_and<T>, and + * std::logical_or<T>. + */ template<class T, class Op> - inline void all_reduce(const T& in, Op op) const; + inline void all_reduce(const T& in, //!< local value being reduced + Op op //!< operator + ) const; +/** + * \ingroup Communication + * \brief Return the result of a proxy collective without popping it off the collectives list (same result would be returned multiple times). The list can be cleared with collectives()->clear(). + */ template<class T> inline T read() const; +/** + * \ingroup Communication + * \brief Return the result of a proxy collective; result is popped off the collectives list. 
+ */ template<class T> inline T get() const; template<class T> inline void scratch(const T& in) const; +/** + * \ingroup Communication + * \brief Return the list of proxy collectives (values and operations) + */ CollectivesList* collectives() const { return collectives_; } Master* master() const { return master_; } diff --git a/include/vtkdiy/reduce-operations.hpp b/include/vtkdiy/reduce-operations.hpp index 30c4570a6ae2749dd1501e1ccaf10664a3d626e3..b1e45a58061823fe4b597c1dcab1178836ec7318 100644 --- a/include/vtkdiy/reduce-operations.hpp +++ b/include/vtkdiy/reduce-operations.hpp @@ -21,7 +21,8 @@ all_to_all(Master& master, //!< block owner int k = 2 //!< reduction fanout ) { - RegularSwapPartners partners(1, assigner.nblocks(), k, false); + RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()-1), assigner.nblocks()); + RegularSwapPartners partners(decomposer, k, false); reduce(master, assigner, partners, detail::AllToAllReduce<Op>(op, assigner), detail::SkipIntermediate(partners.rounds())); } diff --git a/include/vtkdiy/reduce.hpp b/include/vtkdiy/reduce.hpp index e7be78129b59946f44fe2e9c4fc5ced6f8395a73..4810942ff1c759bc2cb1f1a48d2e3fdebc76b3f8 100644 --- a/include/vtkdiy/reduce.hpp +++ b/include/vtkdiy/reduce.hpp @@ -104,11 +104,11 @@ namespace detail * */ template<class Reduce, class Partners, class Skip> -void reduce(Master& master, - const Assigner& assigner, - const Partners& partners, - const Reduce& reduce, - const Skip& skip) +void reduce(Master& master, //!< master object + const Assigner& assigner, //!< assigner object + const Partners& partners, //!< partners object + const Reduce& reduce, //!< reduction callback function + const Skip& skip) //!< object determining whether a block should be skipped { int original_expected = master.expected(); @@ -121,7 +121,7 @@ void reduce(Master& master, master.execute(); int expected = 0; - for (int i = 0; i < master.size(); ++i) + for (unsigned i = 0; i < master.size(); ++i) { if (partners.active(round + 1, master.gid(i), master)) { @@ -142,11 +142,17 @@ void reduce(Master& master, master.set_expected(original_expected); } +/** + * \ingroup Communication + * \brief Implementation of the reduce communication pattern (includes + * swap-reduce, merge-reduce, and any other global communication). 
+ * + */ template<class Reduce, class Partners> -void reduce(Master& master, - const Assigner& assigner, - const Partners& partners, - const Reduce& reducer) +void reduce(Master& master, //!< master object + const Assigner& assigner, //!< assigner object + const Partners& partners, //!< partners object + const Reduce& reducer) //!< reduction callback function { reduce(master, assigner, partners, reducer, detail::ReduceNeverSkip()); } @@ -174,8 +180,8 @@ namespace detail // touch the outgoing queues to make sure they exist Master::OutgoingQueues& outgoing = *cp.outgoing(); - if (outgoing.size() < rp.out_link().size()) - for (unsigned j = 0; j < rp.out_link().size(); ++j) + if (outgoing.size() < (size_t) rp.out_link().size()) + for (int j = 0; j < rp.out_link().size(); ++j) outgoing[rp.out_link().target(j)]; // touch the outgoing queue, creating it if necessary } diff --git a/include/vtkdiy/serialization.hpp b/include/vtkdiy/serialization.hpp index 4bb89fe15ec474ec5b710dc9714c7e18a5ede27f..6a1a3defae3a94fb8a605a02d03c640b389e5c3e 100644 --- a/include/vtkdiy/serialization.hpp +++ b/include/vtkdiy/serialization.hpp @@ -11,6 +11,7 @@ #if __cplusplus > 199711L // C++11 #include <tuple> #include <unordered_map> +#include <unordered_set> #include <type_traits> // this is used for a safety check for default serialization #endif #endif @@ -333,6 +334,33 @@ namespace diy } }; + // save/load for std::unordered_set<T> + template<class T> + struct Serialization< std::unordered_set<T> > + { + typedef std::unordered_set<T> Set; + + static void save(BinaryBuffer& bb, const Set& m) + { + size_t s = m.size(); + diy::save(bb, s); + for (auto& x : m) + diy::save(bb, x); + } + + static void load(BinaryBuffer& bb, Set& m) + { + size_t s; + diy::load(bb, s); + for (size_t i = 0; i < s; ++i) + { + T p; + diy::load(bb, p); + m.emplace(std::move(p)); + } + } + }; + // save/load for std::tuple<...> // TODO: this ought to be default (copying) serialization // if all arguments are default diff --git a/include/vtkdiy/thread.hpp b/include/vtkdiy/thread.hpp index 95377d29550f38ed8292c8e20c45ef737c2447f5..4208ccff74cc315fd540be3497cfe57f1039aeae 100644 --- a/include/vtkdiy/thread.hpp +++ b/include/vtkdiy/thread.hpp @@ -4,9 +4,30 @@ #ifdef DIY_NO_THREADS #include "no-thread.hpp" #else -#include "thread/tinythread.h" + #include "thread/fast_mutex.h" +#if __cplusplus > 199711L // C++11 +#include <thread> +#include <mutex> + +namespace diy +{ + using std::thread; + using std::mutex; + using std::recursive_mutex; + namespace this_thread = std::this_thread; + + // TODO: replace with our own implementation using std::atomic_flag + using fast_mutex = tthread::fast_mutex; + + template<class Mutex> + using lock_guard = std::unique_lock<Mutex>; +} + +#else +#include "thread/tinythread.h" + namespace diy { using tthread::thread; @@ -17,6 +38,7 @@ namespace diy namespace this_thread = tthread::this_thread; } #endif +#endif #include "critical-resource.hpp" diff --git a/include/vtkdiy/types.hpp b/include/vtkdiy/types.hpp index 83db2d8fd676d3a0dbbc9ba4f82e8428f1edbe9e..7ce19d5a7f618696ecf4a03962e45fbc2ba8b379 100644 --- a/include/vtkdiy/types.hpp +++ b/include/vtkdiy/types.hpp @@ -12,7 +12,29 @@ namespace diy typedef bb_d_t DiscreteBounds; typedef bb_c_t ContinuousBounds; - typedef dir_t Direction; + //! 
Helper to create a 1-dimensional discrete domain with the specified extents + inline diy::DiscreteBounds + interval(int from, int to) { DiscreteBounds domain; domain.min[0] = from; domain.max[0] = to; return domain; } + + struct Direction: public dir_t + { + Direction() { for (int i = 0; i < DIY_MAX_DIM; ++i) x[i] = 0; } + + int operator[](int i) const { return x[i]; } + int& operator[](int i) { return x[i]; } + + // lexicographic comparison + bool + operator<(const diy::Direction& y) const + { + for (int i = 0; i < DIY_MAX_DIM; ++i) + { + if ((*this)[i] < y[i]) return true; + if ((*this)[i] > y[i]) return false; + } + return false; + } + }; // Selector of bounds value type template<class Bounds_>
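// Taken together with the partners changes above: interval() builds a 1-d discrete
// domain, a RegularDecomposer now feeds the partners constructors in place of the old
// (dim, nblocks, k) form, and a -x neighbor is written as Direction d; d[0] = -1;
// rather than DIY_X0. Below is a minimal merge-reduction sketch; it assumes the diy
// master, assigner, decomposition, partners/merge, and reduce headers are included,
// and merge_cb is an illustrative name for the user's reduction callback.

// user reduction callback invoked by diy::reduce in every round
void merge_cb(void* b, const diy::ReduceProxy& rp, const diy::RegularMergePartners& partners);

void merge_example(diy::Master& master, const diy::Assigner& assigner)
{
  int k = 2;                                                             // reduction fanout

  diy::RegularDecomposer<diy::DiscreteBounds>
      decomposer(1, diy::interval(0, assigner.nblocks() - 1), assigner.nblocks());
  diy::RegularMergePartners partners(decomposer, k, true);               // contiguous partner matching
  diy::reduce(master, assigner, partners, &merge_cb);
}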