diff --git a/README.md b/README.md
index bcddcc352e5b39e484905960359a8d9cf812e561..dba302433b5907f4e16ac9a052ad39679c7467c7 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-## DIY2 is a data-parallel out-of-core library
+## DIY2 is a block-parallel library
 
-DIY2 is a data-parallel library for implementing scalable algorithms that can execute both
+DIY2 is a block-parallel library for implementing scalable algorithms that can execute both
 in-core and out-of-core. The same program can be executed with one or more threads per MPI
 process, seamlessly combining distributed-memory message passing with shared-memory thread
-parallelism.  The abstraction enabling these capabilities is block-based parallelism; blocks
+parallelism.  The abstraction enabling these capabilities is block parallelism; blocks
 and their message queues are mapped onto processing elements (MPI processes or threads) and are
 migrated between memory and storage by the DIY2 runtime. Complex communication patterns,
 including neighbor exchange, merge reduction, swap reduction, and all-to-all exchange, are
diff --git a/include/vtkdiy/algorithms.hpp b/include/vtkdiy/algorithms.hpp
index b620a8f27dc6c8791e2caec81452481451e947a2..6920a5dd4c292b11fa60bd9da54509c696ce69d2 100644
--- a/include/vtkdiy/algorithms.hpp
+++ b/include/vtkdiy/algorithms.hpp
@@ -11,6 +11,7 @@
 
 #include "detail/algorithms/sort.hpp"
 #include "detail/algorithms/kdtree.hpp"
+#include "detail/algorithms/kdtree-sampling.hpp"
 
 namespace diy
 {
@@ -36,7 +37,8 @@ namespace diy
         detail::SampleSort<Block,T,Cmp> sorter(values, samples, cmp, num_samples);
 
         // swap-reduce to all-gather samples
-        RegularSwapPartners   partners(1, assigner.nblocks(), k);
+        RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()), assigner.nblocks());
+        RegularSwapPartners   partners(decomposer, k);
         reduce(master, assigner, partners, sorter.sample(), detail::SkipIntermediate(partners.rounds()));
 
         // all_to_all to exchange the values
@@ -66,7 +68,7 @@ namespace diy
 
     /**
      * \ingroup Algorithms
-     * \brief build a kd-tree and sort a set of points into it
+     * \brief build a kd-tree and sort a set of points into it (use histograms to determine split values)
      */
     template<class Block, class Point>
     void kdtree(Master&                         master,      //!< master object
@@ -85,11 +87,33 @@ namespace diy
 
         typedef     diy::RegularContinuousLink      RCLink;
 
-        for (int i = 0; i < master.size(); ++i)
+        for (size_t i = 0; i < master.size(); ++i)
         {
             RCLink* link   = static_cast<RCLink*>(master.link(i));
-            link->core()   = domain;
-            link->bounds() = domain;
+            *link = RCLink(dim, domain, domain);
+
+            if (wrap)       // set up the links to self
+            {
+                diy::BlockID self = { master.gid(i), master.communicator().rank() };
+                for (int j = 0; j < dim; ++j)
+                {
+                    diy::Direction dir, wrap_dir;
+
+                    // left
+                    dir.x[j] = -1; wrap_dir.x[j] = -1;
+                    link->add_neighbor(self);
+                    link->add_bounds(domain);
+                    link->add_direction(dir);
+                    link->add_wrap(wrap_dir);
+
+                    // right
+                    dir.x[j] = 1; wrap_dir.x[j] = 1;
+                    link->add_neighbor(self);
+                    link->add_bounds(domain);
+                    link->add_direction(dir);
+                    link->add_wrap(wrap_dir);
+                }
+            }
         }
 
         detail::KDTreePartition<Block,Point>    kdtree_partition(dim, points, bins);
@@ -99,11 +123,73 @@ namespace diy
 
         // update master.expected to match the links
         int expected = 0;
-        for (int i = 0; i < master.size(); ++i)
+        for (size_t i = 0; i < master.size(); ++i)
             expected += master.link(i)->size_unique();
         master.set_expected(expected);
     }
 
+    /**
+     * \ingroup Algorithms
+     * \brief build a kd-tree and sort a set of points into it (use sampling to determine split values)
+     */
+    template<class Block, class Point>
+    void kdtree_sampling
+               (Master&                         master,      //!< master object
+                const Assigner&                 assigner,    //!< assigner object
+                int                             dim,         //!< dimensionality
+                const ContinuousBounds&         domain,      //!< global data extents
+                std::vector<Point>  Block::*    points,      //!< input points to sort into kd-tree
+                size_t                          samples,     //!< number of samples to take in each block
+                bool                            wrap = false)//!< periodic boundaries in all dimensions
+    {
+        if (assigner.nblocks() & (assigner.nblocks() - 1))
+        {
+            fprintf(stderr, "KD-tree requires a number of blocks that's a power of 2, got %d\n", assigner.nblocks());
+            std::abort();
+        }
+
+        typedef     diy::RegularContinuousLink      RCLink;
+
+        for (size_t i = 0; i < master.size(); ++i)
+        {
+            RCLink* link   = static_cast<RCLink*>(master.link(i));
+            *link = RCLink(dim, domain, domain);
+
+            if (wrap)       // set up the links to self
+            {
+                diy::BlockID self = { master.gid(i), master.communicator().rank() };
+                for (int j = 0; j < dim; ++j)
+                {
+                    diy::Direction dir, wrap_dir;
+
+                    // left
+                    dir.x[j] = -1; wrap_dir.x[j] = -1;
+                    link->add_neighbor(self);
+                    link->add_bounds(domain);
+                    link->add_direction(dir);
+                    link->add_wrap(wrap_dir);
+
+                    // right
+                    dir.x[j] = 1; wrap_dir.x[j] = 1;
+                    link->add_neighbor(self);
+                    link->add_bounds(domain);
+                    link->add_direction(dir);
+                    link->add_wrap(wrap_dir);
+                }
+            }
+        }
+
+        detail::KDTreeSamplingPartition<Block,Point>    kdtree_partition(dim, points, samples);
+
+        detail::KDTreePartners                          partners(dim, assigner.nblocks(), wrap, domain);
+        reduce(master, assigner, partners, kdtree_partition);
+
+        // update master.expected to match the links
+        int expected = 0;
+        for (size_t i = 0; i < master.size(); ++i)
+            expected += master.link(i)->size_unique();
+        master.set_expected(expected);
+    }
 }
 
 #endif
diff --git a/include/vtkdiy/constants.h b/include/vtkdiy/constants.h
index 5ea7fd7029fbd86fc432303634f2aedd1f6aaca7..cf087a5c94be49788ae4ce9a83e14f569e271cb3 100644
--- a/include/vtkdiy/constants.h
+++ b/include/vtkdiy/constants.h
@@ -7,23 +7,9 @@
 #define DIY_MAX_DIM 4
 #endif
 
-/* DIY directions: neighbor direction enumeration
-   used to identify direction of one neighbor block for both regular and 
-   wrapround neighbors
-   can be bitwise ORed, eg., maximum-side neighboring block in 3 dimensions 
-   would be DIY_X1 | DIY_Y1 | DIY_Z1 
-   each use identifies exactly one block
-   eg. DIY_X0 is the (one) left neighbor block, not the entire left plane */
-typedef enum
+struct dir_t
 {
-  DIY_X0 = 0x01, /* minimum-side x (left) neighbor */
-  DIY_X1 = 0x02, /* maximum-side x (right) neighbor */
-  DIY_Y0 = 0x04, /* minimum-side y (bottom) neighbor */
-  DIY_Y1 = 0x08, /* maximum-side y (top) neighbor */
-  DIY_Z0 = 0x10, /* minimum-side z (back) neighbor */
-  DIY_Z1 = 0x20, /* maximum-side z (front)neighbor */
-  DIY_T0 = 0x40, /* minimum-side t (earlier) neighbor */
-  DIY_T1 = 0x80  /* maximum-side t (later) neighbor */
-} dir_t;
+    int x[DIY_MAX_DIM];
+};
 
 #endif
diff --git a/include/vtkdiy/critical-resource.hpp b/include/vtkdiy/critical-resource.hpp
index 0b81e3169d8b5251b55fe459bc29452d89027fee..61a5a4b8a714a988cadc3b095a14ef9cd89284c7 100644
--- a/include/vtkdiy/critical-resource.hpp
+++ b/include/vtkdiy/critical-resource.hpp
@@ -3,6 +3,14 @@
 
 namespace diy
 {
+  // TODO: when not running under C++11, i.e., when lock_guard is TinyThread's
+  //       lock_guard, and not C++11's unique_lock, this implementation might
+  //       be buggy since the copy constructor is invoked when
+  //       critical_resource::access() returns an instance of this class. Once
+  //       the temporary is destroyed the mutex is unlocked. I'm not 100%
+  //       certain of this because I'd expect a deadlock on copy constructor,
+  //       but it's clearly not happening -- so I may be missing something.
+  //       (This issue will take care of itself in DIY3 once we switch to C++11 completely.)
   template<class T, class Mutex>
   class resource_accessor
   {
diff --git a/include/vtkdiy/decomposition.hpp b/include/vtkdiy/decomposition.hpp
index f285f8409d1b2b811b36a121a4a6d90c9f55c68c..a51123c211b5ff3cc862de9c714de92ee2b8c922 100644
--- a/include/vtkdiy/decomposition.hpp
+++ b/include/vtkdiy/decomposition.hpp
@@ -5,6 +5,8 @@
 #include <algorithm>
 #include <iostream>
 #include <cmath>
+#include <sstream>
+#include <stdexcept>
 
 #include "link.hpp"
 #include "assigner.hpp"
@@ -74,35 +76,37 @@ namespace detail
     typedef         std::vector<Coordinate>                         CoordinateVector;
     typedef         std::vector<int>                                DivisionsVector;
 
-    /// @param assigner:  decides how processors are assigned to blocks (maps a gid to a rank)
-    ///                   also communicates the total number of blocks
     /// @param wrap:      indicates dimensions on which to wrap the boundary
     /// @param ghosts:    indicates how many ghosts to use in each dimension
     /// @param divisions: indicates how many cuts to make along each dimension
     ///                   (0 means "no constraint," i.e., leave it up to the algorithm)
                     RegularDecomposer(int               dim_,
                                       const Bounds&     domain_,
-                                      const Assigner&   assigner_,
+                                      int               nblocks_,
                                       BoolVector        share_face_ = BoolVector(),
                                       BoolVector        wrap_       = BoolVector(),
                                       CoordinateVector  ghosts_     = CoordinateVector(),
                                       DivisionsVector   divisions_  = DivisionsVector()):
-                      dim(dim_), domain(domain_), assigner(assigner_),
+                      dim(dim_), domain(domain_), nblocks(nblocks_),
                       share_face(share_face_),
                       wrap(wrap_), ghosts(ghosts_), divisions(divisions_)
     {
-      if (share_face.size() < dim)  share_face.resize(dim);
-      if (wrap.size() < dim)        wrap.resize(dim);
-      if (ghosts.size() < dim)      ghosts.resize(dim);
-      if (divisions.size() < dim)   divisions.resize(dim);
+      if ((int) share_face.size() < dim)  share_face.resize(dim);
+      if ((int) wrap.size() < dim)        wrap.resize(dim);
+      if ((int) ghosts.size() < dim)      ghosts.resize(dim);
+      if ((int) divisions.size() < dim)   divisions.resize(dim);
 
-      int nblocks = assigner.nblocks();
-      fill_divisions(nblocks);
+      fill_divisions(divisions);
     }
 
     // Calls create(int gid, const Bounds& bounds, const Link& link)
     template<class Creator>
-    void            decompose(int rank, const Creator& create);
+    void            decompose(int rank, const Assigner& assigner, const Creator& create);
+
+    template<class Updater>
+    void            decompose(int rank, const Assigner& assigner, Master& master, const Updater& update);
+
+    void            decompose(int rank, const Assigner& assigner, Master& master);
 
     // find lowest gid that owns a particular point
     template<class Point>
@@ -110,7 +114,7 @@ namespace detail
 
     void            gid_to_coords(int gid, DivisionsVector& coords) const       { gid_to_coords(gid, coords, divisions); }
     int             coords_to_gid(const DivisionsVector& coords) const          { return coords_to_gid(coords, divisions); }
-    void            fill_divisions(int nblocks)                                 { fill_divisions(dim, nblocks, divisions); }
+    void            fill_divisions(std::vector<int>& divisions) const;
 
     void            fill_bounds(Bounds& bounds, const DivisionsVector& coords, bool add_ghosts = false) const;
     void            fill_bounds(Bounds& bounds, int gid, bool add_ghosts = false) const;
@@ -118,7 +122,7 @@ namespace detail
     static bool     all(const std::vector<int>& v, int x);
     static void     gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions);
     static int      coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divisions);
-    static void     fill_divisions(int dim, int nblocks, std::vector<int>& divisions);
+
     static void     factor(std::vector<unsigned>& factors, int n);
 
     // Point to GIDs functions
@@ -137,8 +141,8 @@ namespace detail
 
 
     int               dim;
-    const Bounds&     domain;
-    const Assigner&   assigner;
+    Bounds            domain;
+    int               nblocks;
     BoolVector        share_face;
     BoolVector        wrap;
     CoordinateVector  ghosts;
@@ -162,7 +166,7 @@ namespace detail
    *
    * `create(...)` is called with each block assigned to the local domain. See [decomposition example](#decomposition-example).
    */
-  template<class Bounds, class Assigner, class Creator>
+  template<class Bounds, class Creator>
   void decompose(int                dim,
                  int                rank,
                  const Bounds&      domain,
@@ -173,7 +177,7 @@ namespace detail
                  typename RegularDecomposer<Bounds>::CoordinateVector ghosts     = typename RegularDecomposer<Bounds>::CoordinateVector(),
                  typename RegularDecomposer<Bounds>::DivisionsVector  divs       = typename RegularDecomposer<Bounds>::DivisionsVector())
   {
-    RegularDecomposer<Bounds>(dim, domain, assigner, share_face, wrap, ghosts, divs).decompose(rank, create);
+    RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, create);
   }
 
 namespace detail
@@ -195,6 +199,26 @@ namespace detail
 
     diy::Master* master_;
   };
+
+  template<class Bounds, class Update>
+  struct Updater
+  {
+    typedef typename RegularDecomposer<Bounds>::Link        Link;
+
+            Updater(diy::Master* master, const Update& update):
+                master_(master), update_(update)                    {}
+
+    void    operator()(int gid, const Bounds& core, const Bounds& bounds, const Bounds& domain, const Link& link) const
+    {
+        int lid = master_->lid(gid);
+        Link* l = new Link(link);
+        master_->replace_link(lid, l);
+        update_(gid, lid, core, bounds, domain, *l);
+    }
+
+    diy::Master*    master_;
+    const Update&   update_;
+  };
 }
 
   /**
@@ -213,7 +237,7 @@ namespace detail
    *
    * `master` must have been supplied a create function in order for this function to work.
    */
-  template<class Bounds, class Assigner>
+  template<class Bounds>
   void decompose(int                dim,
                  int                rank,
                  const Bounds&      domain,
@@ -224,7 +248,7 @@ namespace detail
                  typename RegularDecomposer<Bounds>::CoordinateVector ghosts     = typename RegularDecomposer<Bounds>::CoordinateVector(),
                  typename RegularDecomposer<Bounds>::DivisionsVector  divs       = typename RegularDecomposer<Bounds>::DivisionsVector())
   {
-    RegularDecomposer<Bounds>(dim, domain, assigner, share_face, wrap, ghosts, divs).decompose(rank, detail::AddBlock<Bounds>(&master));
+    RegularDecomposer<Bounds>(dim, domain, assigner.nblocks(), share_face, wrap, ghosts, divs).decompose(rank, assigner, master);
   }
 
   /**
@@ -248,15 +272,53 @@ namespace detail
       master.add(local_gids[i], master.create(), new diy::Link);
   }
 
+    /**
+     * \ingroup Decomposition
+     * \brief Add a decomposition (modify links) of an existing set of blocks that were
+     * added to the master previously
+     *
+     * @param rank       local rank
+     * @param assigner   decides how processors are assigned to blocks (maps a gid to a rank)
+     *                   also communicates the total number of blocks
+     */
+  template<class Bounds, class Update>
+  void decompose(int                dim,
+                 int                rank,
+                 const Bounds&      domain,
+                 const Assigner&    assigner,
+                 Master&            master,
+                 const Update&      update,
+                 typename RegularDecomposer<Bounds>::BoolVector       share_face =
+                 typename RegularDecomposer<Bounds>::BoolVector(),
+                 typename RegularDecomposer<Bounds>::BoolVector       wrap       =
+                 typename RegularDecomposer<Bounds>::BoolVector(),
+                 typename RegularDecomposer<Bounds>::CoordinateVector ghosts     =
+                 typename RegularDecomposer<Bounds>::CoordinateVector(),
+                 typename RegularDecomposer<Bounds>::DivisionsVector  divs       =
+                 typename RegularDecomposer<Bounds>::DivisionsVector())
+  {
+      RegularDecomposer<Bounds>(dim, domain, share_face, wrap, ghosts, divs).
+          decompose(rank, assigner, master, update);
+  }
+
   //! Decomposition example: \example decomposition/test-decomposition.cpp
   //! Direct master insertion example: \example decomposition/test-direct-master.cpp
 }
 
+// decomposes domain and adds blocks to the master
+template<class Bounds>
+void
+diy::RegularDecomposer<Bounds>::
+decompose(int rank, const Assigner& assigner, Master& master)
+{
+  decompose(rank, assigner, detail::AddBlock<Bounds>(&master));
+}
+
 template<class Bounds>
 template<class Creator>
 void
 diy::RegularDecomposer<Bounds>::
-decompose(int rank, const Creator& create)
+decompose(int rank, const Assigner& assigner, const Creator& create)
 {
   std::vector<int> gids;
   assigner.local_gids(rank, gids);
@@ -289,7 +351,7 @@ decompose(int rank, const Creator& create)
       if (all(offsets, 0)) continue;      // skip ourselves
 
       DivisionsVector     nhbr_coords(dim);
-      int                 dir      = 0;
+      Direction           dir, wrap_dir;
       bool                inbounds = true;
       for (int i = 0; i < dim; ++i)
       {
@@ -301,7 +363,7 @@ decompose(int rank, const Creator& create)
           if (wrap[i])
           {
             nhbr_coords[i] = divisions[i] - 1;
-            link.add_wrap(Direction(1 << 2*i));
+            wrap_dir[i] = -1;
           }
           else
             inbounds = false;
@@ -312,17 +374,15 @@ decompose(int rank, const Creator& create)
           if (wrap[i])
           {
             nhbr_coords[i] = 0;
-            link.add_wrap(Direction(1 << (2*i + 1)));
+            wrap_dir[i] = 1;
           }
           else
             inbounds = false;
         }
 
         // NB: this needs to match the addressing scheme in dir_t (in constants.h)
-        if (offsets[i] == -1)
-          dir |= 1 << (2*i + 0);
-        if (offsets[i] == 1)
-          dir |= 1 << (2*i + 1);
+        if (offsets[i] == -1 || offsets[i] == 1)
+          dir[i] = offsets[i];
       }
       if (!inbounds) continue;
 
@@ -334,13 +394,24 @@ decompose(int rank, const Creator& create)
       fill_bounds(nhbr_bounds, nhbr_coords);
       link.add_bounds(nhbr_bounds);
 
-      link.add_direction(static_cast<Direction>(dir));
+      link.add_direction(dir);
+      link.add_wrap(wrap_dir);
     }
 
     create(gid, core, bounds, domain, link);
   }
 }
 
+// decomposes domain but does not add blocks to master, assumes they were added already
+template<class Bounds>
+template<class Update>
+void
+diy::RegularDecomposer<Bounds>::
+decompose(int rank, const Assigner& assigner, Master& master, const Update& update)
+{
+    decompose(rank, assigner, detail::Updater<Bounds,Update>(&master, update));
+}
+
 template<class Bounds>
 bool
 diy::RegularDecomposer<Bounds>::
@@ -436,40 +507,116 @@ fill_bounds(Bounds& bounds,                  //!< (output) bounds
         fill_bounds(bounds, coords);
 }
 
+namespace diy { namespace detail {
+// current state of division in one dimension used in fill_divisions below
+template<class Coordinate>
+struct Div
+{
+    int dim;                                 // 0, 1, 2, etc. e.g. for x, y, z etc.
+    int nb;                                  // number of blocks so far in this dimension
+    Coordinate b_size;                       // block size so far in this dimension
+
+    // sort on descending block size unless tied, in which case
+    // sort on ascending num blocks in current dim unless tied, in which case
+    // sort on ascending dimension
+    bool operator<(Div rhs) const
+    {
+        // sort on second value of the pair unless tied, in which case sort on first
+        if (b_size == rhs.b_size)
+        {
+            if (nb == rhs.nb)
+                return(dim < rhs.dim);
+            return(nb < rhs.nb);
+        }
+        return(b_size > rhs.b_size);
+    }
+};
+} }
+
 template<class Bounds>
 void
 diy::RegularDecomposer<Bounds>::
-fill_divisions(int dim, int nblocks, std::vector<int>& divisions)
+fill_divisions(std::vector<int>& divisions) const
 {
-  int prod = 1; int c = 0;
-  for (unsigned i = 0; i < dim; ++i)
-    if (divisions[i] != 0)
+    // prod = number of blocks unconstrained by user; c = number of unconstrained dimensions
+    int prod = 1; int c = 0;
+    for (int i = 0; i < dim; ++i)
+        if (divisions[i] != 0)
+        {
+            prod *= divisions[i];
+            ++c;
+        }
+
+    if (nblocks % prod != 0)
     {
-      prod *= divisions[i];
-      ++c;
+        fprintf(stderr, "Total number of blocks cannot be factored into provided divs\n");
+        return;
     }
 
-  if (nblocks % prod != 0)
-  {
-    std::cerr << "Incompatible requirements" << std::endl;
-    return;
-  }
+    if (c == (int) divisions.size())               // nothing to do; user provided all divs
+        return;
 
-  if (c == divisions.size())
-    return;
+    // factor number of blocks left in unconstrained dimensions
+    // factorization is sorted from smallest to largest factors
+    std::vector<unsigned> factors;
+    factor(factors, nblocks/prod);
+
+    using detail::Div;
+    std::vector< Div<Coordinate> > missing_divs;              // pairs consisting of (dim, #divs)
 
-  std::vector<unsigned> factors;
-  factor(factors, nblocks/prod);
+    // init missing_divs
+    for (int i = 0; i < dim; i++)
+    {
+        if (divisions[i] == 0)
+        {
+            Div<Coordinate> div;
+            div.dim = i;
+            div.nb = 1;
+            div.b_size = domain.max[i] - domain.min[i];
+            missing_divs.push_back(div);
+        }
+    }
 
-  // Fill the missing divs using LPT algorithm
-  std::vector<unsigned> missing_divs(divisions.size() - c, 1);
-  for (int i = factors.size() - 1; i >= 0; --i)
-    *std::min_element(missing_divs.begin(), missing_divs.end()) *= factors[i];
+    // iterate over factorization of number of blocks (factors are sorted smallest to largest)
+    // NB: using int instead of size_t because must be negative in order to break out of loop
+    for (int i = factors.size() - 1; i >= 0; --i)
+    {
+        // fill in missing divs by dividing dimension w/ largest block size
+        // except when this would be illegal (resulting in bounds.max < bounds.min;
+        // only a problem for discrete bounds
+
+        // sort on decreasing block size
+        std::sort(missing_divs.begin(), missing_divs.end());
+
+        // split the dimension with the largest block size (first element in vector)
+        Coordinate min =
+            detail::BoundsHelper<Bounds>::from(0,
+                                               missing_divs[0].nb * factors[i],
+                                               domain.min[missing_divs[0].dim],
+                                               domain.max[missing_divs[0].dim],
+                                               share_face[missing_divs[0].dim]);
+        Coordinate max =
+            detail::BoundsHelper<Bounds>::to(0,
+                                             missing_divs[0].nb * factors[i],
+                                             domain.min[missing_divs[0].dim],
+                                             domain.max[missing_divs[0].dim],
+                                             share_face[missing_divs[0].dim]);
+        if (max >= min)
+        {
+            missing_divs[0].nb    *= factors[i];
+            missing_divs[0].b_size = max - min;
+        }
+        else
+        {
+            std::ostringstream oss;
+            oss << "Unable to decompose domain into " << nblocks << " blocks: " << min << " " << max;
+            throw std::runtime_error(oss.str());
+        }
+    }
 
-  c = 0;
-  for (unsigned i = 0; i < dim; ++i)
-    if (divisions[i] == 0)
-      divisions[i] = missing_divs[c++];
+    // assign the divisions
+    for (size_t i = 0; i < missing_divs.size(); i++)
+        divisions[missing_divs[i].dim] = missing_divs[i].nb;
 }
 
 template<class Bounds>
@@ -478,7 +625,7 @@ diy::RegularDecomposer<Bounds>::
 factor(std::vector<unsigned>& factors, int n)
 {
   while (n != 1)
-    for (unsigned i = 2; i <= n; ++i)
+    for (int i = 2; i <= n; ++i)
     {
       if (n % i == 0)
       {
diff --git a/include/vtkdiy/detail/algorithms/kdtree-sampling.hpp b/include/vtkdiy/detail/algorithms/kdtree-sampling.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1b21a91de97dd1550b123c8840c4326059b26a15
--- /dev/null
+++ b/include/vtkdiy/detail/algorithms/kdtree-sampling.hpp
@@ -0,0 +1,453 @@
+#ifndef DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP
+#define DIY_DETAIL_ALGORITHMS_KDTREE_SAMPLING_HPP
+
+#include <vector>
+#include <cassert>
+#include "../../partners/all-reduce.hpp"
+
+// TODO: technically, what's done now is not a perfect subsample:
+//       we take the same number of samples from every block, in reality this number should be selected at random,
+//       so that the total number of samples adds up to samples*nblocks
+//
+// NB: random samples are chosen using rand(), which is assumed to be seeded
+//     externally. Once we switch to C++11, we should use its more advanced
+//     random number generators (and take a generator as an external parameter)
+//     (TODO)
+
+namespace diy
+{
+namespace detail
+{
+
+template<class Block, class Point>
+struct KDTreeSamplingPartition
+{
+    typedef     diy::RegularContinuousLink      RCLink;
+    typedef     diy::ContinuousBounds           Bounds;
+
+    typedef     std::vector<float>              Samples;
+
+                KDTreeSamplingPartition(int                             dim,
+                                        std::vector<Point>  Block::*    points,
+                                        size_t                          samples):
+                    dim_(dim), points_(points), samples_(samples)           {}
+
+    void        operator()(void* b_, const diy::ReduceProxy& srp, const KDTreePartners& partners) const;
+
+    int         divide_gid(int gid, bool lower, int round, int rounds) const;
+    void        update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const;
+    void        split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const;
+    diy::Direction
+                find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const;
+
+    void        compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const;
+    void        add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const;
+    void        receive_samples(Block* b, const diy::ReduceProxy& srp,       Samples& samples) const;
+    void        forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const;
+
+    void        enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const;
+    void        dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const;
+
+    void        update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const;
+    bool        intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const;
+    float       find_split(const Bounds& changed, const Bounds& original) const;
+
+    int                             dim_;
+    std::vector<Point>  Block::*    points_;
+    size_t                          samples_;
+};
+
+}
+}
+
+
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+operator()(void* b_, const diy::ReduceProxy& srp, const KDTreePartners& partners) const
+{
+    Block* b = static_cast<Block*>(b_);
+
+    int dim;
+    if (srp.round() < partners.rounds())
+        dim = partners.dim(srp.round());
+    else
+        dim = partners.dim(srp.round() - 1);
+
+    if (srp.round() == partners.rounds())
+        update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
+    else if (partners.swap_round(srp.round()) && partners.sub_round(srp.round()) < 0)       // link round
+    {
+        dequeue_exchange(b, srp, dim);         // from the swap round
+        split_to_neighbors(b, srp, dim);
+    }
+    else if (partners.swap_round(srp.round()))
+    {
+        Samples samples;
+        receive_samples(b, srp, samples);
+        enqueue_exchange(b, srp, dim, samples);
+    } else if (partners.sub_round(srp.round()) == 0)
+    {
+        if (srp.round() > 0)
+        {
+            int prev_dim = dim - 1;
+            if (prev_dim < 0)
+                prev_dim += dim_;
+            update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain);    // -1 would be the "uninformative" link round
+        }
+
+        compute_local_samples(b, srp, dim);
+    } else if (partners.sub_round(srp.round()) < (int) partners.histogram.rounds()/2)     // we are reusing partners class, so really we are talking about the samples rounds here
+    {
+        Samples samples;
+        add_samples(b, srp, samples);
+        srp.enqueue(srp.out_link().target(0), samples);
+    } else
+    {
+        Samples samples;
+        add_samples(b, srp, samples);
+        if (samples.size() != 1)
+        {
+            // pick the median
+            std::nth_element(samples.begin(), samples.begin() + samples.size()/2, samples.end());
+            std::swap(samples[0], samples[samples.size()/2]);
+            //std::sort(samples.begin(), samples.end());
+            //samples[0] = (samples[samples.size()/2] + samples[samples.size()/2 + 1])/2;
+            samples.resize(1);
+        }
+        forward_samples(b, srp, samples);
+    }
+}
+
+template<class Block, class Point>
+int
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+divide_gid(int gid, bool lower, int round, int rounds) const
+{
+    if (lower)
+        gid &= ~(1 << (rounds - 1 - round));
+    else
+        gid |=  (1 << (rounds - 1 - round));
+    return gid;
+}
+
+// round here is the outer iteration of the algorithm
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
+{
+    int         gid  = srp.gid();
+    int         lid  = srp.master()->lid(gid);
+    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
+
+    // (gid, dir) -> i
+    std::map<std::pair<int,diy::Direction>, int> link_map;
+    for (int i = 0; i < link->size(); ++i)
+        link_map[std::make_pair(link->target(i).gid, link->direction(i))] = i;
+
+    // NB: srp.enqueue(..., ...) should match the link
+    std::vector<float>  splits(link->size());
+    for (int i = 0; i < link->size(); ++i)
+    {
+        float split; diy::Direction dir;
+
+        int in_gid = link->target(i).gid;
+        while(srp.incoming(in_gid))
+        {
+            srp.dequeue(in_gid, split);
+            srp.dequeue(in_gid, dir);
+
+            // reverse dir
+            for (int j = 0; j < dim_; ++j)
+                dir[j] = -dir[j];
+
+            int k = link_map[std::make_pair(in_gid, dir)];
+            //printf("%d %d %f -> %d\n", in_gid, dir, split, k);
+            splits[k] = split;
+        }
+    }
+
+    RCLink      new_link(dim_, link->core(), link->core());
+
+    bool lower = !(gid & (1 << (rounds - 1 - round)));
+
+    // fill out the new link
+    for (int i = 0; i < link->size(); ++i)
+    {
+        diy::Direction  dir = link->direction(i);
+        //diy::Direction  wrap_dir = link->wrap(i);     // we don't use existing wrap, but restore it from scratch
+        if (dir[dim] != 0)
+        {
+            if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && !lower))
+            {
+                int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds);
+                diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
+                new_link.add_neighbor(nbr);
+
+                new_link.add_direction(dir);
+
+                Bounds bounds = link->bounds(i);
+                update_neighbor_bounds(bounds, splits[i], dim, !lower);
+                new_link.add_bounds(bounds);
+
+                if (wrap)
+                    new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
+                else
+                    new_link.add_wrap(diy::Direction());
+            }
+        } else // non-aligned side
+        {
+            for (int j = 0; j < 2; ++j)
+            {
+                int nbr_gid = divide_gid(link->target(i).gid, j == 0, round, rounds);
+
+                Bounds  bounds  = link->bounds(i);
+                update_neighbor_bounds(bounds, splits[i], dim, j == 0);
+
+                if (intersects(bounds, new_link.bounds(), dim, wrap, domain))
+                {
+                    diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
+                    new_link.add_neighbor(nbr);
+                    new_link.add_direction(dir);
+                    new_link.add_bounds(bounds);
+
+                    if (wrap)
+                        new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
+                    else
+                        new_link.add_wrap(diy::Direction());
+                }
+            }
+        }
+    }
+
+    // add link to the dual block
+    int dual_gid = divide_gid(gid, !lower, round, rounds);
+    diy::BlockID dual = { dual_gid, srp.assigner().rank(dual_gid) };
+    new_link.add_neighbor(dual);
+
+    Bounds nbr_bounds = link->bounds();     // old block bounds
+    update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower);
+    new_link.add_bounds(nbr_bounds);
+
+    new_link.add_wrap(diy::Direction());    // dual block cannot be wrapped
+
+    if (lower)
+    {
+        diy::Direction right;
+        right[dim] = 1;
+        new_link.add_direction(right);
+    } else
+    {
+        diy::Direction left;
+        left[dim] = -1;
+        new_link.add_direction(left);
+    }
+
+    // update the link; notice that this won't conflict with anything since
+    // reduce is using its own notion of the link constructed through the
+    // partners
+    link->swap(new_link);
+}
+
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
+{
+    int         lid  = srp.master()->lid(srp.gid());
+    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
+
+    // determine split
+    float split = find_split(link->core(), link->bounds());
+
+    for (int i = 0; i < link->size(); ++i)
+    {
+        srp.enqueue(link->target(i), split);
+        srp.enqueue(link->target(i), link->direction(i));
+    }
+}
+
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const
+{
+    // compute and enqueue local samples
+    Samples samples;
+    size_t points_size = (b->*points_).size();
+    size_t n = std::min(points_size, samples_);
+    samples.reserve(n);
+    for (size_t i = 0; i < n; ++i)
+    {
+        float x = (b->*points_)[rand() % points_size][dim];
+        samples.push_back(x);
+    }
+
+    srp.enqueue(srp.out_link().target(0), samples);
+}
+
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
+{
+    // dequeue and combine the samples
+    for (int i = 0; i < srp.in_link().size(); ++i)
+    {
+        int nbr_gid = srp.in_link().target(i).gid;
+
+        Samples smpls;
+        srp.dequeue(nbr_gid, smpls);
+        for (size_t i = 0; i < smpls.size(); ++i)
+            samples.push_back(smpls[i]);
+    }
+}
+
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
+{
+    srp.dequeue(srp.in_link().target(0).gid, samples);
+}
+
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const
+{
+    for (int i = 0; i < srp.out_link().size(); ++i)
+        srp.enqueue(srp.out_link().target(i), samples);
+}
+
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Samples& samples) const
+{
+    int         lid  = srp.master()->lid(srp.gid());
+    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
+
+    int k = srp.out_link().size();
+
+    if (k == 0)        // final round; nothing needs to be sent; this is actually redundant
+        return;
+
+    // pick split points
+    float split = samples[0];
+
+    // subset and enqueue
+    std::vector< std::vector<Point> > out_points(srp.out_link().size());
+    for (size_t i = 0; i < (b->*points_).size(); ++i)
+    {
+      float x = (b->*points_)[i][dim];
+      int loc = x < split ? 0 : 1;
+      out_points[loc].push_back((b->*points_)[i]);
+    }
+    int pos = -1;
+    for (int i = 0; i < k; ++i)
+    {
+      if (srp.out_link().target(i).gid == srp.gid())
+      {
+        (b->*points_).swap(out_points[i]);
+        pos = i;
+      }
+      else
+        srp.enqueue(srp.out_link().target(i), out_points[i]);
+    }
+    if (pos == 0)
+        link->core().max[dim] = split;
+    else
+        link->core().min[dim] = split;
+}
+
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const
+{
+    int         lid  = srp.master()->lid(srp.gid());
+    RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
+
+    for (int i = 0; i < srp.in_link().size(); ++i)
+    {
+      int nbr_gid = srp.in_link().target(i).gid;
+      if (nbr_gid == srp.gid())
+          continue;
+
+      std::vector<Point>   in_points;
+      srp.dequeue(nbr_gid, in_points);
+      for (size_t j = 0; j < in_points.size(); ++j)
+      {
+        if (in_points[j][dim] < link->core().min[dim] || in_points[j][dim] > link->core().max[dim])
+        {
+            fprintf(stderr, "Warning: dequeued %f outside [%f,%f] (%d)\n",
+                            in_points[j][dim], link->core().min[dim], link->core().max[dim], dim);
+            std::abort();
+        }
+        (b->*points_).push_back(in_points[j]);
+      }
+    }
+}
+
+template<class Block, class Point>
+void
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+update_neighbor_bounds(Bounds& bounds, float split, int dim, bool lower) const
+{
+    if (lower)
+        bounds.max[dim] = split;
+    else
+        bounds.min[dim] = split;
+}
+
+template<class Block, class Point>
+bool
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+intersects(const Bounds& x, const Bounds& y, int dim, bool wrap, const Bounds& domain) const
+{
+    if (wrap)
+    {
+        if (x.min[dim] == domain.min[dim] && y.max[dim] == domain.max[dim])
+            return true;
+        if (y.min[dim] == domain.min[dim] && x.max[dim] == domain.max[dim])
+            return true;
+    }
+    return x.min[dim] <= y.max[dim] && y.min[dim] <= x.max[dim];
+}
+
+template<class Block, class Point>
+float
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+find_split(const Bounds& changed, const Bounds& original) const
+{
+    for (int i = 0; i < dim_; ++i)
+    {
+        if (changed.min[i] != original.min[i])
+            return changed.min[i];
+        if (changed.max[i] != original.max[i])
+            return changed.max[i];
+    }
+    assert(0);
+    return -1;
+}
+
+template<class Block, class Point>
+diy::Direction
+diy::detail::KDTreeSamplingPartition<Block,Point>::
+find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const
+{
+    diy::Direction wrap;
+    for (int i = 0; i < dim_; ++i)
+    {
+        if (bounds.min[i] == domain.min[i] && nbr_bounds.max[i] == domain.max[i])
+            wrap[i] = -1;
+        if (bounds.max[i] == domain.max[i] && nbr_bounds.min[i] == domain.min[i])
+            wrap[i] =  1;
+    }
+    return wrap;
+}
+
+
+#endif
diff --git a/include/vtkdiy/detail/algorithms/kdtree.hpp b/include/vtkdiy/detail/algorithms/kdtree.hpp
index 0bce13591778e1092a0b29e6db70a9b5d0a6f648..5e0c3ddfa1b5d5057e26f231d883055f8c6bb72d 100644
--- a/include/vtkdiy/detail/algorithms/kdtree.hpp
+++ b/include/vtkdiy/detail/algorithms/kdtree.hpp
@@ -30,6 +30,8 @@ struct KDTreePartition
     int         divide_gid(int gid, bool lower, int round, int rounds) const;
     void        update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const;
     void        split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const;
+    diy::Direction
+                find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const;
 
     void        compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const;
     void        add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const;
@@ -59,8 +61,9 @@ struct diy::detail::KDTreePartners
   typedef           diy::ContinuousBounds                   Bounds;
 
                     KDTreePartners(int dim, int nblocks, bool wrap_, const Bounds& domain_):
-                        histogram(1, nblocks, 2),
-                        swap(1, nblocks, 2, false),
+                        decomposer(1, interval(0,nblocks-1), nblocks),
+                        histogram(decomposer, 2),
+                        swap(decomposer, 2, false),
                         wrap(wrap_),
                         domain(domain_)
   {
@@ -94,7 +97,7 @@ struct diy::detail::KDTreePartners
 
   inline bool   active(int round, int gid, const diy::Master& m) const
   {
-    if (round == rounds())
+    if (round == (int) rounds())
         return true;
     else if (swap_round(round) && sub_round(round) < 0)     // link round
         return true;
@@ -106,7 +109,7 @@ struct diy::detail::KDTreePartners
 
   inline void   incoming(int round, int gid, std::vector<int>& partners, const diy::Master& m) const
   {
-    if (round == rounds())
+    if (round == (int) rounds())
         link_neighbors(-1, gid, partners, m);
     else if (swap_round(round) && sub_round(round) < 0)       // link round
         swap.incoming(sub_round(round - 1) + 1, gid, partners, m);
@@ -125,7 +128,7 @@ struct diy::detail::KDTreePartners
 
   inline void   outgoing(int round, int gid, std::vector<int>& partners, const diy::Master& m) const
   {
-    if (round == rounds())
+    if (round == (int) rounds())
         swap.outgoing(sub_round(round-1) + 1, gid, partners, m);
     else if (swap_round(round) && sub_round(round) < 0)       // link round
         link_neighbors(-1, gid, partners, m);
@@ -141,13 +144,16 @@ struct diy::detail::KDTreePartners
     diy::Link*  link = m.link(lid);
 
     std::set<int> result;       // partners must be unique
-    for (size_t i = 0; i < link->size(); ++i)
+    for (int i = 0; i < link->size(); ++i)
         result.insert(link->target(i).gid);
 
     for (std::set<int>::const_iterator it = result.begin(); it != result.end(); ++it)
         partners.push_back(*it);
   }
 
+  // 1-D domain to feed into histogram and swap
+  diy::RegularDecomposer<diy::DiscreteBounds>   decomposer;
+
   diy::RegularAllReducePartners     histogram;
   diy::RegularSwapPartners          swap;
 
@@ -194,7 +200,7 @@ operator()(void* b_, const diy::ReduceProxy& srp, const KDTreePartners& partners
         }
 
         compute_local_histogram(b, srp, dim);
-    } else if (partners.sub_round(srp.round()) < partners.histogram.rounds()/2)
+    } else if (partners.sub_round(srp.round()) < (int) partners.histogram.rounds()/2)
     {
         Histogram   histogram(bins_);
         add_histogram(b, srp, histogram);
@@ -248,45 +254,41 @@ update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int roun
             srp.dequeue(in_gid, dir);
 
             // reverse dir
-            int j = 0;
-            while (dir >> (j + 1))
-                ++j;
-
-            if (j % 2 == 0)
-                dir = static_cast<diy::Direction>(dir << 1);
-            else
-                dir = static_cast<diy::Direction>(dir >> 1);
+            for (int j = 0; j < dim_; ++j)
+                dir[j] = -dir[j];
 
             int k = link_map[std::make_pair(in_gid, dir)];
-            //printf("%d %d %f -> %d\n", in_gid, dir, split, k);
             splits[k] = split;
         }
     }
 
     RCLink      new_link(dim_, link->core(), link->core());
 
-    diy::Direction left  = static_cast<diy::Direction>(1 <<   2*dim);
-    diy::Direction right = static_cast<diy::Direction>(1 <<  (2*dim + 1));
-
     bool lower = !(gid & (1 << (rounds - 1 - round)));
 
     // fill out the new link
     for (int i = 0; i < link->size(); ++i)
     {
-        diy::Direction  dir = link->direction(i);
-        if (dir == left || dir == right)
+        diy::Direction  dir      = link->direction(i);
+        //diy::Direction  wrap_dir = link->wrap(i);     // we don't use existing wrap, but restore it from scratch
+        if (dir[dim] != 0)
         {
-            if ((dir == left && lower) || (dir == right && !lower))
+            if ((dir[dim] < 0 && lower) || (dir[dim] > 0 && !lower))
             {
-                int nbr_gid = divide_gid(link->target(i).gid, dir != left, round, rounds);
+                int nbr_gid = divide_gid(link->target(i).gid, !lower, round, rounds);
                 diy::BlockID nbr = { nbr_gid, srp.assigner().rank(nbr_gid) };
                 new_link.add_neighbor(nbr);
 
                 new_link.add_direction(dir);
 
                 Bounds bounds = link->bounds(i);
-                update_neighbor_bounds(bounds, splits[i], dim, dir != left);
+                update_neighbor_bounds(bounds, splits[i], dim, !lower);
                 new_link.add_bounds(bounds);
+
+                if (wrap)
+                    new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
+                else
+                    new_link.add_wrap(diy::Direction());
             }
         } else // non-aligned side
         {
@@ -303,6 +305,11 @@ update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int roun
                     new_link.add_neighbor(nbr);
                     new_link.add_direction(dir);
                     new_link.add_bounds(bounds);
+
+                    if (wrap)
+                        new_link.add_wrap(find_wrap(new_link.bounds(), bounds, domain));
+                    else
+                        new_link.add_wrap(diy::Direction());
                 }
             }
         }
@@ -317,10 +324,19 @@ update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int roun
     update_neighbor_bounds(nbr_bounds, find_split(new_link.bounds(), nbr_bounds), dim, !lower);
     new_link.add_bounds(nbr_bounds);
 
+    new_link.add_wrap(diy::Direction());    // dual block cannot be wrapped
+
     if (lower)
+    {
+        diy::Direction right;
+        right[dim] = 1;
         new_link.add_direction(right);
-    else
+    } else
+    {
+        diy::Direction left;
+        left[dim] = -1;
         new_link.add_direction(left);
+    }
 
     // update the link; notice that this won't conflict with anything since
     // reduce is using its own notion of the link constructed through the
@@ -339,7 +355,7 @@ split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
     // determine split
     float split = find_split(link->core(), link->bounds());
 
-    for (size_t i = 0; i < link->size(); ++i)
+    for (int i = 0; i < link->size(); ++i)
     {
         srp.enqueue(link->target(i), split);
         srp.enqueue(link->target(i), link->direction(i));
@@ -367,7 +383,7 @@ compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const
             std::cerr << loc << " " << x << " " << link->core().min[dim] << std::endl;
             std::abort();
         }
-        if (loc >= bins_)
+        if (loc >= (int) bins_)
             loc = bins_ - 1;
         ++(histogram[loc]);
     }
@@ -381,7 +397,7 @@ diy::detail::KDTreePartition<Block,Point>::
 add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
 {
     // dequeue and add up the histograms
-    for (unsigned i = 0; i < srp.in_link().size(); ++i)
+    for (int i = 0; i < srp.in_link().size(); ++i)
     {
         int nbr_gid = srp.in_link().target(i).gid;
 
@@ -405,7 +421,7 @@ void
 diy::detail::KDTreePartition<Block,Point>::
 forward_histogram(Block* b, const diy::ReduceProxy& srp, const Histogram& histogram) const
 {
-    for (unsigned i = 0; i < srp.out_link().size(); ++i)
+    for (int i = 0; i < srp.out_link().size(); ++i)
         srp.enqueue(srp.out_link().target(i), histogram);
 }
 
@@ -429,9 +445,8 @@ enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Histogram
     //fprintf(stderr, "Histogram total: %lu\n", total);
 
     size_t cur   = 0;
-    size_t last, next;
     float  width = (link->core().max[dim] - link->core().min[dim])/bins_;
-    float  split;
+    float  split = 0;
     for (size_t i = 0; i < histogram.size(); ++i)
     {
         if (cur + histogram[i] > total/2)
@@ -476,7 +491,7 @@ dequeue_exchange(Block* b, const diy::ReduceProxy& srp, int dim) const
     int         lid  = srp.master()->lid(srp.gid());
     RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
 
-    for (unsigned i = 0; i < srp.in_link().size(); ++i)
+    for (int i = 0; i < srp.in_link().size(); ++i)
     {
       int nbr_gid = srp.in_link().target(i).gid;
       if (nbr_gid == srp.gid())
@@ -528,19 +543,32 @@ float
 diy::detail::KDTreePartition<Block,Point>::
 find_split(const Bounds& changed, const Bounds& original) const
 {
-    diy::Direction dir = DIY_X0;
     for (int i = 0; i < dim_; ++i)
     {
         if (changed.min[i] != original.min[i])
             return changed.min[i];
-        dir = static_cast<diy::Direction>(dir << 1);
         if (changed.max[i] != original.max[i])
             return changed.max[i];
-        dir = static_cast<diy::Direction>(dir << 1);
     }
     assert(0);
     return -1;
 }
 
+template<class Block, class Point>
+diy::Direction
+diy::detail::KDTreePartition<Block,Point>::
+find_wrap(const Bounds& bounds, const Bounds& nbr_bounds, const Bounds& domain) const
+{
+    diy::Direction wrap;
+    for (int i = 0; i < dim_; ++i)
+    {
+        if (bounds.min[i] == domain.min[i] && nbr_bounds.max[i] == domain.max[i])
+            wrap[i] = -1;
+        if (bounds.max[i] == domain.max[i] && nbr_bounds.min[i] == domain.min[i])
+            wrap[i] =  1;
+    }
+    return wrap;
+}
+
 
 #endif
diff --git a/include/vtkdiy/detail/algorithms/sort.hpp b/include/vtkdiy/detail/algorithms/sort.hpp
index 49541957bbcab7c9ebdacfe6f61a2e4de946b301..90f8035057cf22d4b79dd4756aaa7449918b7cb9 100644
--- a/include/vtkdiy/detail/algorithms/sort.hpp
+++ b/include/vtkdiy/detail/algorithms/sort.hpp
@@ -99,7 +99,7 @@ struct SampleSort<Block,T,Cmp>::Sampler
         if (k_in == 0)
         {
             // draw random samples
-            for (int i = 0; i < num_samples; ++i)
+            for (size_t i = 0; i < num_samples; ++i)
                 samples.push_back((b->*values)[std::rand() % (b->*values).size()]);
         } else
             dequeue_values(samples, srp, false);
@@ -110,7 +110,7 @@ struct SampleSort<Block,T,Cmp>::Sampler
             std::sort(samples.begin(), samples.end(), cmp);
             std::vector<T>  subsamples(srp.nblocks() - 1);
             int step = samples.size() / srp.nblocks();       // NB: subsamples.size() + 1
-            for (int i = 0; i < subsamples.size(); ++i)
+            for (size_t i = 0; i < subsamples.size(); ++i)
                 subsamples[i] = samples[(i+1)*step];
             (b->*dividers).swap(subsamples);
         }
diff --git a/include/vtkdiy/detail/reduce/all-to-all.hpp b/include/vtkdiy/detail/reduce/all-to-all.hpp
index 88bfb877e64aa13b9363e7732b4a84c519600980..c49b7d28cdb56b2ab558c62e85b7cb16433b2c6d 100644
--- a/include/vtkdiy/detail/reduce/all-to-all.hpp
+++ b/include/vtkdiy/detail/reduce/all-to-all.hpp
@@ -154,7 +154,7 @@ namespace detail
          SkipIntermediate(size_t rounds_):
             rounds(rounds_)                                     {}
 
-    bool operator()(int round, int, const Master&) const        { if (round == 0 || round == rounds) return false; return true; }
+    bool operator()(int round, int, const Master&) const        { if (round == 0 || round == (int) rounds) return false; return true; }
 
     size_t  rounds;
   };
diff --git a/include/vtkdiy/io/block.hpp b/include/vtkdiy/io/block.hpp
index 4161d59209218d4690fba27fef50b9b35811759e..4667a32101a988a839ad39e1591d99312ef262af 100644
--- a/include/vtkdiy/io/block.hpp
+++ b/include/vtkdiy/io/block.hpp
@@ -174,65 +174,45 @@ namespace io
  * \ingroup IO
  * \brief Read blocks from storage collectively from one shared file
  */
-  inline
-  void
-  read_blocks(const std::string&           infilename,     //!< input file name
-              const mpi::communicator&     comm,           //!< communicator
-              Assigner&                    assigner,       //!< assigner object
-              Master&                      master,         //!< master object
-              MemoryBuffer&                extra,          //!< user-defined metadata in file header
-              Master::LoadBlock            load = 0)       //!< load block function in case different than or unefined in the master
-  {
-    if (!load) load = master.loader();      // load is likely to be different from master.load()
+    inline
+    void
+    read_blocks(const std::string&           infilename,     //!< input file name
+                const mpi::communicator&     comm,           //!< communicator
+                Assigner&                    assigner,       //!< assigner object
+                Master&                      master,         //!< master object
+                MemoryBuffer&                extra,          //!< user-defined metadata in file header
+                Master::LoadBlock            load = 0)       //!< load block function in case different than or unefined in the master
+    {
+        if (!load) load = master.loader();      // load is likely to be different from master.load()
 
-    typedef detail::offset_t                offset_t;
-    typedef detail::GidOffsetCount          GidOffsetCount;
+        typedef detail::offset_t                offset_t;
+        typedef detail::GidOffsetCount          GidOffsetCount;
 
-    mpi::io::file f(comm, infilename, mpi::io::file::rdonly);
+        mpi::io::file f(comm, infilename, mpi::io::file::rdonly);
 
-    offset_t    footer_offset = f.size() - sizeof(size_t);
-    size_t footer_size;
+        offset_t    footer_offset = f.size() - sizeof(size_t);
+        size_t footer_size;
 
-    // Read the size
-    try
-    {
+        // Read the size
         f.read_at_all(footer_offset, (char*) &footer_size, sizeof(footer_size));
-    }
-    catch(...)
-    {
-        fprintf(stderr, "Error reading footer size\n");
-        throw(1);
-    }
 
-    // Read all_offset_counts
-    footer_offset -= footer_size;
-    MemoryBuffer footer;
-    try
-    {
+        // Read all_offset_counts
+        footer_offset -= footer_size;
+        MemoryBuffer footer;
         footer.buffer.resize(footer_size);
         f.read_at_all(footer_offset, footer.buffer);
-    }
-    catch(...)
-    {
-        fprintf(stderr, "Error reading footer\n");
-        throw(1);
-    }
 
-    std::vector<GidOffsetCount>  all_offset_counts;
-    diy::load(footer, all_offset_counts);
-    diy::load(footer, extra);
-    extra.reset();
+        std::vector<GidOffsetCount>  all_offset_counts;
+        diy::load(footer, all_offset_counts);
+        diy::load(footer, extra);
+        extra.reset();
 
-    // Get local gids from assigner
-    size_t size = all_offset_counts.size();
-    assigner.set_nblocks(size);
-    std::vector<int> gids;
-    assigner.local_gids(comm.rank(), gids);
+        // Get local gids from assigner
+        size_t size = all_offset_counts.size();
+        assigner.set_nblocks(size);
+        std::vector<int> gids;
+        assigner.local_gids(comm.rank(), gids);
 
-    // Read our blocks;
-    // TODO: use collective IO, when possible
-    try
-    {
         for (unsigned i = 0; i < gids.size(); ++i)
         {
             if (gids[i] != all_offset_counts[gids[i]].gid)
@@ -251,12 +231,6 @@ namespace io
             master.add(gids[i], b, l);
         }
     }
-    catch(...)
-    {
-        fprintf(stderr, "Error reading blocks\n");
-        throw(1);
-    }
-  }
 
 
   // Functions without the extra buffer, for compatibility with the old code
diff --git a/include/vtkdiy/io/bov.hpp b/include/vtkdiy/io/bov.hpp
index f4ead18bff72a87cfb70283564ec727f44f63976..cb059d0317a43fa51fa776442e7e481752600df2 100644
--- a/include/vtkdiy/io/bov.hpp
+++ b/include/vtkdiy/io/bov.hpp
@@ -74,7 +74,7 @@ read(const DiscreteBounds& bounds, T* buffer, bool collective, int chunk) const
   int dim   = shape_.size();
   int total = 1;
   std::vector<int> subsizes;
-  for (unsigned i = 0; i < dim; ++i)
+  for (int i = 0; i < dim; ++i)
   {
     subsizes.push_back(bounds.max[i] - bounds.min[i] + 1);
     total *= subsizes.back();
@@ -88,15 +88,15 @@ read(const DiscreteBounds& bounds, T* buffer, bool collective, int chunk) const
     // create an MPI struct of size chunk to read the data in those chunks
     // (this allows to work around MPI-IO weirdness where crucial quantities
     // are ints, which are too narrow of a type)
-    const int             array_of_blocklengths[]  = { chunk };
-    const MPI_Aint        array_of_displacements[] = { 0 };
-    const MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
+    int             array_of_blocklengths[]  = { chunk };
+    MPI_Aint        array_of_displacements[] = { 0 };
+    MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
     MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
     MPI_Type_commit(&T_type);
   }
 
   MPI_Datatype fileblk;
-  MPI_Type_create_subarray(dim, &shape_[0], &subsizes[0], &bounds.min[0], MPI_ORDER_C, T_type, &fileblk);
+  MPI_Type_create_subarray(dim, (int*) &shape_[0], &subsizes[0], (int*) &bounds.min[0], MPI_ORDER_C, T_type, &fileblk);
   MPI_Type_commit(&fileblk);
 
   MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, "native", MPI_INFO_NULL);
@@ -128,7 +128,7 @@ write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core,
   int dim   = shape_.size();
   std::vector<int> subsizes;
   std::vector<int> buffer_shape, buffer_start;
-  for (unsigned i = 0; i < dim; ++i)
+  for (int i = 0; i < dim; ++i)
   {
     buffer_shape.push_back(bounds.max[i] - bounds.min[i] + 1);
     buffer_start.push_back(core.min[i] - bounds.min[i]);
@@ -141,16 +141,16 @@ write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core,
   else
   {
     // assume T is a binary block and create an MPI struct of appropriate size
-    const int             array_of_blocklengths[]  = { chunk };
-    const MPI_Aint        array_of_displacements[] = { 0 };
-    const MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
+    int             array_of_blocklengths[]  = { chunk };
+    MPI_Aint        array_of_displacements[] = { 0 };
+    MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
     MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
     MPI_Type_commit(&T_type);
   }
 
   MPI_Datatype fileblk, subbuffer;
-  MPI_Type_create_subarray(dim, &shape_[0],       &subsizes[0], &bounds.min[0],   MPI_ORDER_C, T_type, &fileblk);
-  MPI_Type_create_subarray(dim, &buffer_shape[0], &subsizes[0], &buffer_start[0], MPI_ORDER_C, T_type, &subbuffer);
+  MPI_Type_create_subarray(dim, (int*) &shape_[0],       &subsizes[0], (int*) &bounds.min[0],   MPI_ORDER_C, T_type, &fileblk);
+  MPI_Type_create_subarray(dim, (int*) &buffer_shape[0], &subsizes[0], (int*) &buffer_start[0], MPI_ORDER_C, T_type, &subbuffer);
   MPI_Type_commit(&fileblk);
   MPI_Type_commit(&subbuffer);
 
diff --git a/include/vtkdiy/io/numpy.hpp b/include/vtkdiy/io/numpy.hpp
index ae295dc26da263165531bd9e8ec5a0eef9ec8da8..0199a0c38f4f1ea23c79661578036074dfab1fa2 100644
--- a/include/vtkdiy/io/numpy.hpp
+++ b/include/vtkdiy/io/numpy.hpp
@@ -143,7 +143,7 @@ write_header(const S& shape)
     convert_and_save(dict, sizeof(T));
     save(dict, "', 'fortran_order': False, 'shape': (");
     convert_and_save(dict, shape[0]);
-    for (int i = 1; i < shape.size(); i++)
+    for (int i = 1; i < (int) shape.size(); i++)
     {
         save(dict, ", ");
         convert_and_save(dict, shape[i]);
@@ -174,7 +174,8 @@ char
 diy::io::detail::big_endian()
 {
   unsigned char x[] = {1,0};
-  short y = *(short*) x;
+  void* x_void = x;
+  short y = *static_cast<short*>(x_void);
   return y == 1 ? '<' : '>';
 }
 
diff --git a/include/vtkdiy/link.hpp b/include/vtkdiy/link.hpp
index 16f64ec4c0c369e82e977362df6b0ab4e1e96347..3262eef610a8b3f5e66649c5379adf6eb075ffb2 100644
--- a/include/vtkdiy/link.hpp
+++ b/include/vtkdiy/link.hpp
@@ -77,19 +77,21 @@ namespace diy
       typedef   std::vector<Direction>              DirVec;
 
     public:
-                RegularLink(int dim, const Bounds& core, const Bounds& bounds, Direction wrap = Direction(0)):
-                  dim_(dim), wrap_(wrap), core_(core), bounds_(bounds)            {}
+                RegularLink(int dim, const Bounds& core, const Bounds& bounds):
+                  dim_(dim), core_(core), bounds_(bounds)            {}
 
-      int       dimension() const                   { return dim_; }
+      // dimension
+      int       dimension() const                       { return dim_; }
 
       // direction
-      int       direction(Direction dir) const;     // convert direction to a neighbor (-1 if no neighbor)
-      Direction direction(int i) const              { return dir_vec_[i]; }
-      void      add_direction(Direction dir)        { int c = dir_map_.size(); dir_map_[dir] = c; dir_vec_.push_back(dir); }
+      int       direction(Direction dir) const;         // convert direction to a neighbor (-1 if no neighbor)
+      Direction direction(int i) const                  { return dir_vec_[i]; }
+      void      add_direction(Direction dir)            { int c = dir_map_.size(); dir_map_[dir] = c; dir_vec_.push_back(dir); }
 
       // wrap
-      void      add_wrap(Direction dir)             { wrap_ = static_cast<Direction>(wrap_ | dir); }
-      Direction wrap() const                        { return wrap_; }
+      void       add_wrap(Direction dir)                { wrap_.push_back(dir); }
+      Direction  wrap(int i) const                      { return wrap_[i]; }
+      Direction& wrap(int i)                            { return wrap_[i]; }
 
       // bounds
       const Bounds& core() const                        { return core_; }
@@ -99,7 +101,7 @@ namespace diy
       const Bounds& bounds(int i) const                 { return nbr_bounds_[i]; }
       void          add_bounds(const Bounds& bounds)    { nbr_bounds_.push_back(bounds); }
 
-      void      swap(RegularLink& other)                { Link::swap(other); dir_map_.swap(other.dir_map_); dir_vec_.swap(other.dir_vec_); nbr_bounds_.swap(other.nbr_bounds_); std::swap(dim_, other.dim_); std::swap(wrap_, other.wrap_); std::swap(core_, other.core_); std::swap(bounds_, other.bounds_); }
+      void      swap(RegularLink& other)                { Link::swap(other); dir_map_.swap(other.dir_map_); dir_vec_.swap(other.dir_vec_); nbr_bounds_.swap(other.nbr_bounds_); std::swap(dim_, other.dim_); wrap_.swap(other.wrap_); std::swap(core_, other.core_); std::swap(bounds_, other.bounds_); }
 
       void      save(BinaryBuffer& bb) const
       {
@@ -107,10 +109,10 @@ namespace diy
           diy::save(bb, dim_);
           diy::save(bb, dir_map_);
           diy::save(bb, dir_vec_);
-          diy::save(bb, wrap_);
           diy::save(bb, core_);
           diy::save(bb, bounds_);
           diy::save(bb, nbr_bounds_);
+          diy::save(bb, wrap_);
       }
 
       void      load(BinaryBuffer& bb)
@@ -119,10 +121,10 @@ namespace diy
           diy::load(bb, dim_);
           diy::load(bb, dir_map_);
           diy::load(bb, dir_vec_);
-          diy::load(bb, wrap_);
           diy::load(bb, core_);
           diy::load(bb, bounds_);
           diy::load(bb, nbr_bounds_);
+          diy::load(bb, wrap_);
       }
 
       virtual size_t id() const                         { return RegularLinkSelector<Bounds>::id; }
@@ -132,11 +134,11 @@ namespace diy
 
       DirMap    dir_map_;
       DirVec    dir_vec_;
-      Direction wrap_;
 
-      Bounds                core_;
-      Bounds                bounds_;
-      std::vector<Bounds>   nbr_bounds_;
+      Bounds                    core_;
+      Bounds                    bounds_;
+      std::vector<Bounds>       nbr_bounds_;
+      std::vector<Direction>    wrap_;
   };
 
   // Other cover candidates: KDTreeLink, AMRGridLink
diff --git a/include/vtkdiy/master.hpp b/include/vtkdiy/master.hpp
index 65c2baddef0deba2028b02ca7fd26acfe7b85c2f..066763503772bc0d4e9ac56219dfa2de60511d6f 100644
--- a/include/vtkdiy/master.hpp
+++ b/include/vtkdiy/master.hpp
@@ -206,6 +206,8 @@ namespace diy
       int           threads() const                     { return threads_; }
       int           in_memory() const                   { return *blocks_.in_memory().const_access(); }
 
+      void          set_threads(int threads)            { threads_ = threads; }
+
       CreateBlock   creator() const                     { return blocks_.creator(); }
       DestroyBlock  destroyer() const                   { return blocks_.destroyer(); }
       LoadBlock     loader() const                      { return blocks_.loader(); }
@@ -256,6 +258,7 @@ namespace diy
       void              set_expected(int expected)      { expected_ = expected; }
       void              add_expected(int i)             { expected_ += i; }
       int               expected() const                { return expected_; }
+      void              replace_link(int i, Link* link) { expected_ -= links_[i]->size_unique(); delete links_[i]; links_[i] = link; expected_ += links_[i]->size_unique(); }
 
     public:
       // Communicator functionality
diff --git a/include/vtkdiy/mpi/collectives.hpp b/include/vtkdiy/mpi/collectives.hpp
index fd7e558208509a915df69698c609e7b8c13d1215..d1cc0bdd80896e5c2b13e0f67e571f1e219e33b4 100644
--- a/include/vtkdiy/mpi/collectives.hpp
+++ b/include/vtkdiy/mpi/collectives.hpp
@@ -135,10 +135,10 @@ namespace mpi
 
       out.resize(comm.size());
       size_t cur = 0;
-      for (unsigned i = 0; i < comm.size(); ++i)
+      for (int i = 0; i < comm.size(); ++i)
       {
           out[i].reserve(counts[i]);
-          for (unsigned j = 0; j < counts[i]; ++j)
+          for (int j = 0; j < counts[i]; ++j)
               out[i].push_back(buffer[cur++]);
       }
     }
@@ -173,6 +173,17 @@ namespace mpi
                     comm);
     }
 
+    static void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op&)
+    {
+      out.resize(in.size());
+      MPI_Allreduce(Datatype::address(const_cast<T&>(in[0])),
+                    Datatype::address(out[0]),
+                    in.size(),
+                    Datatype::datatype(),
+                    detail::mpi_op<Op>::get(),
+                    comm);
+    }
+
     static void scan(const communicator& comm, const T& in, T& out, const Op&)
     {
       MPI_Scan(Datatype::address(const_cast<T&>(in)),
@@ -274,6 +285,13 @@ namespace mpi
     Collectives<T, Op>::all_reduce(comm, in, out, op);
   }
 
+  //! Same as above, but for vectors.
+  template<class T, class Op>
+  void      all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op& op)
+  {
+    Collectives<T, Op>::all_reduce(comm, in, out, op);
+  }
+
   //! scan
   template<class T, class Op>
   void      scan(const communicator& comm, const T& in, T& out, const Op& op)
diff --git a/include/vtkdiy/mpi/optional.hpp b/include/vtkdiy/mpi/optional.hpp
index 973a65f7d22d4de12ce606bc7bed9d5a9e17e0de..ab58aaf810816b67624f92b65cc29134704ed6e3 100644
--- a/include/vtkdiy/mpi/optional.hpp
+++ b/include/vtkdiy/mpi/optional.hpp
@@ -21,14 +21,17 @@ namespace mpi
 
                 operator bool() const           { return init_; }
 
-    T&          operator*()                     { return *reinterpret_cast<T*>(buf_); }
-    const T&    operator*() const               { return *reinterpret_cast<const T*>(buf_); }
+    T&          operator*()                     { return *static_cast<T*>(address()); }
+    const T&    operator*() const               { return *static_cast<const T*>(address()); }
 
     T*          operator->()                    { return &(operator*()); }
     const T*    operator->() const              { return &(operator*()); }
 
     private:
-      void      clear()                         { reinterpret_cast<T*>(buf_)->~T(); }
+      void      clear()                         { static_cast<T*>(address())->~T(); }
+
+      void*         address()                   { return buf_; }
+      const void*   address() const             { return buf_; }
 
     private:
       bool init_;
diff --git a/include/vtkdiy/partners/all-reduce.hpp b/include/vtkdiy/partners/all-reduce.hpp
index d51f5507bf7916997213efd4973bb90d5c8af273..e340665951f970e50445c3a06855836a3d68cc9a 100644
--- a/include/vtkdiy/partners/all-reduce.hpp
+++ b/include/vtkdiy/partners/all-reduce.hpp
@@ -24,12 +24,12 @@ struct RegularAllReducePartners: public RegularMergePartners
                 //! contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
                 //! contiguous is useful when data needs to be united;
                 //! round-robin is useful for vector-"halving"
-                RegularAllReducePartners(int dim,                 //!< dimensionality of regular block grid
-                                         int nblocks,             //!< total number of blocks
-                                         int k,                   //!< target k value
-                                         bool contiguous = true   //!< distance doubling (true) or halving (false)
+  template<class Decomposer>
+                RegularAllReducePartners(const Decomposer& decomposer,  //!< domain decomposition
+                                         int k,                         //!< target k value
+                                         bool contiguous = true         //!< distance doubling (true) or halving (false)
                     ):
-                  Parent(dim, nblocks, k, contiguous)         {}
+                  Parent(decomposer, k, contiguous)         {}
                 RegularAllReducePartners(const DivisionVector&   divs,//!< explicit division vector
                                          const KVSVector&        kvs, //!< explicit k vector
                                          bool  contiguous = true      //!< distance doubling (true) or halving (false)
@@ -45,12 +45,12 @@ struct RegularAllReducePartners: public RegularMergePartners
   //! returns whether a given block in a given round has dropped out of the merge yet or not
   inline bool   active(int round, int gid, const Master& m) const { return Parent::active(parent_round(round), gid, m); }
   //! returns what the current round would be in the first or second parent merge reduction
-  int           parent_round(int round) const                   { return round < Parent::rounds() ? round : rounds() - round; }
+  int           parent_round(int round) const                   { return round < (int) Parent::rounds() ? round : rounds() - round; }
 
   // incoming is only valid for an active gid; it will only be called with an active gid
   inline void   incoming(int round, int gid, std::vector<int>& partners, const Master& m) const
   {
-      if (round <= Parent::rounds())
+      if (round <= (int) Parent::rounds())
           Parent::incoming(round, gid, partners, m);
       else
           Parent::outgoing(parent_round(round), gid, partners, m);
@@ -58,7 +58,7 @@ struct RegularAllReducePartners: public RegularMergePartners
 
   inline void   outgoing(int round, int gid, std::vector<int>& partners, const Master& m) const
   {
-      if (round < Parent::rounds())
+      if (round < (int) Parent::rounds())
           Parent::outgoing(round, gid, partners, m);
       else
           Parent::incoming(parent_round(round), gid, partners, m);
diff --git a/include/vtkdiy/partners/common.hpp b/include/vtkdiy/partners/common.hpp
index 496a4483fb313d101549f7bad59e91d4504492fd..43f8297a0fe9c280661df4026f0e4673b8743515 100644
--- a/include/vtkdiy/partners/common.hpp
+++ b/include/vtkdiy/partners/common.hpp
@@ -19,17 +19,18 @@ struct RegularPartners
     int size;           // group size
   };
 
-  // The part of RegularDecomposer that we need works the same with either Bounds (so we fix them arbitrarily)
-  typedef       DiscreteBounds                      Bounds;
-  typedef       RegularDecomposer<Bounds>           Decomposer;
-
   typedef       std::vector<int>                    CoordVector;
   typedef       std::vector<int>                    DivisionVector;
   typedef       std::vector<DimK>                   KVSVector;
 
-                RegularPartners(int dim, int nblocks, int k, bool contiguous = true):
-                  divisions_(dim, 0),
-                  contiguous_(contiguous)                       { Decomposer::fill_divisions(dim, nblocks, divisions_); factor(k, divisions_, kvs_); fill_steps(); }
+  // The part of RegularDecomposer that we need works the same with either Bounds (so we fix them arbitrarily)
+  typedef       DiscreteBounds                      Bounds;
+  typedef       RegularDecomposer<Bounds>           Decomposer;
+
+  template<class Decomposer_>
+                RegularPartners(const Decomposer_& decomposer, int k, bool contiguous = true):
+                  divisions_(decomposer.divisions),
+                  contiguous_(contiguous)                       { factor(k, divisions_, kvs_); fill_steps(); }
                 RegularPartners(const DivisionVector&   divs,
                                 const KVSVector&        kvs,
                                 bool  contiguous = true):
@@ -73,7 +74,7 @@ fill_steps()
   {
     std::vector<int>    cur_steps(divisions().size(), 1);
 
-    for (int r = 0; r < rounds(); ++r)
+    for (size_t r = 0; r < rounds(); ++r)
     {
       steps_.push_back(cur_steps[kvs_[r].dim]);
       cur_steps[kvs_[r].dim] *= kvs_[r].size;
@@ -81,7 +82,7 @@ fill_steps()
   } else
   {
     std::vector<int>    cur_steps(divisions().begin(), divisions().end());
-    for (int r = 0; r < rounds(); ++r)
+    for (size_t r = 0; r < rounds(); ++r)
     {
       cur_steps[kvs_[r].dim] /= kvs_[r].size;
       steps_.push_back(cur_steps[kvs_[r].dim]);
@@ -152,7 +153,7 @@ factor(int k, const DivisionVector& divisions, KVSVector& kvs)
     bool changed = false;
     for (unsigned i = 0; i < divisions.size(); ++i)
     {
-      if (round_per_dim[i] == tmp_kvs[i].size())
+      if (round_per_dim[i] == (int) tmp_kvs[i].size())
         continue;
       kvs.push_back(DimK(i, tmp_kvs[i][round_per_dim[i]++]));
       changed = true;
diff --git a/include/vtkdiy/partners/merge.hpp b/include/vtkdiy/partners/merge.hpp
index 3c4cfa134dee3be8e42fca5898c800024d6d06e7..c6be4253318c2e788a5dd3dd62aec1c78d5e9988 100644
--- a/include/vtkdiy/partners/merge.hpp
+++ b/include/vtkdiy/partners/merge.hpp
@@ -20,12 +20,12 @@ struct RegularMergePartners: public RegularPartners
                 // contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
                 // contiguous is useful when data needs to be united;
                 // round-robin is useful for vector-"halving"
-                RegularMergePartners(int dim,                 //!< dimensionality of regular block grid
-                                     int nblocks,             //!< total number of blocks
-                                     int k,                   //!< target k value
-                                     bool contiguous = true   //!< distance doubling (true) or halving (false)
+  template<class Decomposer>
+                RegularMergePartners(const Decomposer& decomposer,  //!< domain decomposition
+                                     int k,                         //!< target k value
+                                     bool contiguous = true         //!< distance doubling (true) or halving (false)
                     ):
-                    Parent(dim, nblocks, k, contiguous)         {}
+                    Parent(decomposer, k, contiguous)           {}
                 RegularMergePartners(const DivisionVector&   divs, //!< explicit division vector
                                      const KVSVector&        kvs,  //!< explicit k vector
                                      bool  contiguous = true       //!< distance doubling (true) or halving (false)
diff --git a/include/vtkdiy/partners/swap.hpp b/include/vtkdiy/partners/swap.hpp
index f06b09e3746e7b016da977789c9ca3c3a90c81ec..cc3b3e4941abee651f67f95a7ff36d35d61f5466 100644
--- a/include/vtkdiy/partners/swap.hpp
+++ b/include/vtkdiy/partners/swap.hpp
@@ -20,12 +20,12 @@ struct RegularSwapPartners: public RegularPartners
                 // contiguous parameter indicates whether to match partners contiguously or in a round-robin fashion;
                 // contiguous is useful when data needs to be united;
                 // round-robin is useful for vector-"halving"
-                RegularSwapPartners(int dim,               //!< dimensionality of regular block grid
-                                    int nblocks,           //!< total number of blocks
-                                    int k,                 //!< target k value
-                                    bool contiguous = true //!< distance halving (true) or doubling (false)
+  template<class Decomposer>
+                RegularSwapPartners(const Decomposer& decomposer,   //!< domain decomposition
+                                    int k,                          //!< target k value
+                                    bool contiguous = true          //!< distance halving (true) or doubling (false)
                     ):
-                    Parent(dim, nblocks, k, contiguous)         {}
+                    Parent(decomposer, k, contiguous)         {}
                 RegularSwapPartners(const DivisionVector&   divs, //!< explicit division vector
                                     const KVSVector&        kvs,  //!< explicit k vector
                                     bool  contiguous = true       //!< distance halving (true) or doubling (false)
diff --git a/include/vtkdiy/pick.hpp b/include/vtkdiy/pick.hpp
index f6aea2c2277facb3488a5dba6554514c2699663b..f884157b2039f7ff28835c0b52d8c5d5836a1bf8 100644
--- a/include/vtkdiy/pick.hpp
+++ b/include/vtkdiy/pick.hpp
@@ -15,19 +15,14 @@ namespace diy
   template<class Point>
   float distance(int dim, const ContinuousBounds& bounds, const Point& p);
 
-  template<class Bounds>
-  void wrap_bounds(Bounds& bounds, int wrap_dir, const Bounds& domain, int dim);
+  inline
+  float distance(int dim, const ContinuousBounds& bounds1, const ContinuousBounds& bounds2);
 
-  namespace detail
-  {
-    template<class Point>
-    void shift(float new_pt[DIY_MAX_DIM], const Point& p, float r, Direction dir, int dim);
-  }
+  template<class Bounds>
+  void wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim);
 }
 
-//! Finds the neighbors within radius r of a target point. Assumptions:
-//! 1. Point p needs to be in the current block
-//! 2. Only for a regular decomposition
+//! Finds the neighbors within radius r of a target point.
 template<class Bounds, class Point, class OutIter>
 void
 diy::
@@ -37,8 +32,6 @@ near(const RegularLink<Bounds>& link,  //!< neighbors
      OutIter out,                      //!< insert iterator for output set of neighbors
      const Bounds& domain)             //!< global domain bounds
 {
-  int d; // current dimension
-  float new_pt[DIY_MAX_DIM]; // offset point
   Bounds neigh_bounds; // neighbor block bounds
 
   // for all neighbors of this block
@@ -46,26 +39,10 @@ near(const RegularLink<Bounds>& link,  //!< neighbors
   {
     // wrap neighbor bounds, if necessary, otherwise bounds will be unchanged
     neigh_bounds = link.bounds(n);
-    wrap_bounds(neigh_bounds, link.wrap() & link.direction(n), domain, link.dimension());
+    wrap_bounds(neigh_bounds, link.wrap(n), domain, link.dimension());
 
-    detail::shift(new_pt, p, r, link.direction(n), link.dimension());
-
-    // check if neighbor is near enough
-    for (d = 0; d < link.dimension(); d++)
-    {
-      // if shifted point did not move into or past the neighbor,
-      // break and proceed to next neighbor
-      // note dist can be large enough to shift the point beyond the neighbor
-      // that means the point was definitely near enough to neighbor
-      if (((link.direction(n) & (1 << (2*d + 1)))   && new_pt[d] < neigh_bounds.min[d]) ||
-          ((link.direction(n) & (1 << (2*d)))       && new_pt[d] > neigh_bounds.max[d]))
-        break;
-    }
-
-    if (d < link.dimension())
-      continue; // next neighbor
-
-    *out++ = n;
+    if (distance(link.dimension(), neigh_bounds, p) <= r)
+        *out++ = n;
   } // for all neighbors
 }
 
@@ -92,8 +69,31 @@ distance(int dim, const ContinuousBounds& bounds, const Point& p)
     return sqrt(res);
 }
 
-//! Finds the neighbor(s) containing the target point. Assumptions:
-//! 1. Only for a regular decomposition
+float
+diy::
+distance(int dim, const ContinuousBounds& bounds1, const ContinuousBounds& bounds2)
+{
+    float res = 0;
+    for (int i = 0; i < dim; ++i)
+    {
+        float diff = 0, d;
+
+        float d1 = bounds1.max[i] - bounds2.min[i];
+        float d2 = bounds2.max[i] - bounds1.min[i];
+
+        if (d1 > 0 && d2 > 0)
+            diff = 0;
+        else if (d1 <= 0)
+            diff = -d1;
+        else if (d2 <= 0)
+            diff = -d2;
+
+        res += diff*diff;
+    }
+    return sqrt(res);
+}
+
+//! Finds the neighbor(s) containing the target point.
 template<class Bounds, class Point, class OutIter>
 void
 diy::
@@ -102,7 +102,6 @@ in(const RegularLink<Bounds>& link,  //!< neighbors
    OutIter out,                      //!< insert iterator for output set of neighbors
    const Bounds& domain)             //!< global domain bounds
 {
-  int d; // current dimension
   Bounds neigh_bounds; // neighbor block bounds
 
   // for all neighbors of this block
@@ -110,19 +109,10 @@ in(const RegularLink<Bounds>& link,  //!< neighbors
   {
     // wrap neighbor bounds, if necessary, otherwise bounds will be unchanged
     neigh_bounds = link.bounds(n);
-    wrap_bounds(neigh_bounds, link.wrap() & link.direction(n), domain, link.dimension());
-
-    // check if p is in neighbor
-    for (d = 0; d < link.dimension(); d++)
-    {
-      if (p[d] < neigh_bounds.min[d] || p[d] > neigh_bounds.max[d])
-        break;
-    }
+    wrap_bounds(neigh_bounds, link.wrap(n), domain, link.dimension());
 
-    if (d < link.dimension())
-      continue; // next neighbor
-
-    *out++ = n;
+    if (distance(link.dimension(), neigh_bounds, p) == 0)
+        *out++ = n;
   } // for all neighbors
 }
 
@@ -132,75 +122,13 @@ in(const RegularLink<Bounds>& link,  //!< neighbors
 template<class Bounds>
 void
 diy::
-wrap_bounds(Bounds& bounds, int wrap_dir, const Bounds& domain, int dim)
+wrap_bounds(Bounds& bounds, Direction wrap_dir, const Bounds& domain, int dim)
 {
-  // wrapping toward the left transforms block bounds to the left, and vice versa
-  if (dim > 0 && (wrap_dir & DIY_X0) == DIY_X0)
-  {
-    bounds.min[0] -= (domain.max[0] - domain.min[0]);
-    bounds.max[0] -= (domain.max[0] - domain.min[0]);
-  }
-  if (dim > 0 && (wrap_dir & DIY_X1) == DIY_X1)
-  {
-    bounds.min[0] += (domain.max[0] - domain.min[0]);
-    bounds.max[0] += (domain.max[0] - domain.min[0]);
-  }
-
-  if (dim > 1 && (wrap_dir & DIY_Y0) == DIY_Y0)
-  {
-    bounds.min[1] -= (domain.max[1] - domain.min[1]);
-    bounds.max[1] -= (domain.max[1] - domain.min[1]);
-  }
-  if (dim > 1 && (wrap_dir & DIY_Y1) == DIY_Y1)
-  {
-    bounds.min[1] += (domain.max[1] - domain.min[1]);
-    bounds.max[1] += (domain.max[1] - domain.min[1]);
-  }
-
-  if (dim > 2 && (wrap_dir & DIY_Z0) == DIY_Z0)
+  for (int i = 0; i < dim; ++i)
   {
-    bounds.min[2] -= (domain.max[2] - domain.min[2]);
-    bounds.max[2] -= (domain.max[2] - domain.min[2]);
+    bounds.min[i] += wrap_dir[i] * (domain.max[i] - domain.min[i]);
+    bounds.max[i] += wrap_dir[i] * (domain.max[i] - domain.min[i]);
   }
-  if (dim > 2 && (wrap_dir & DIY_Z1) == DIY_Z1)
-  {
-    bounds.min[2] += (domain.max[2] - domain.min[2]);
-    bounds.max[2] += (domain.max[2] - domain.min[2]);
-  }
-
-  if (dim > 3 && (wrap_dir & DIY_T0) == DIY_T0)
-  {
-    bounds.min[3] -= (domain.max[3] - domain.min[3]);
-    bounds.max[3] -= (domain.max[3] - domain.min[3]);
-  }
-  if (dim > 3 && (wrap_dir & DIY_T1) == DIY_T1)
-  {
-    bounds.min[3] += (domain.max[3] - domain.min[3]);
-    bounds.max[3] += (domain.max[3] - domain.min[3]);
-  }
-}
-
-template<class Point>
-void
-diy::detail::
-shift(float new_pt[DIY_MAX_DIM], const Point& p, float r, Direction dir, int dim)
-{
-  if (dim > 0 && (dir & DIY_X0))
-      new_pt[0] = p[0] - r;
-  if (dim > 0 && (dir & DIY_X1))
-      new_pt[0] = p[0] + r;
-  if (dim > 1 && (dir & DIY_Y0))
-      new_pt[1] = p[1] - r;
-  if (dim > 1 && (dir & DIY_Y1))
-      new_pt[1] = p[1] + r;
-  if (dim > 2 && (dir & DIY_Z0))
-      new_pt[2] = p[2] - r;
-  if (dim > 2 && (dir & DIY_Z1))
-      new_pt[2] = p[2] + r;
-  if (dim > 3 && (dir & DIY_T0))
-      new_pt[3] = p[3] - r;
-  if (dim > 3 && (dir & DIY_T1))
-      new_pt[3] = p[3] + r;
 }
 
 
diff --git a/include/vtkdiy/proxy.hpp b/include/vtkdiy/proxy.hpp
index 5ea2d4631845f9094872e8652e633fe0a20cb1d2..0160e060567f613906cff294a1e4b4ef52803f01 100644
--- a/include/vtkdiy/proxy.hpp
+++ b/include/vtkdiy/proxy.hpp
@@ -19,7 +19,7 @@ namespace diy
 
     int                 gid() const                                     { return gid_; }
 
-    //! Enqueue data whose size can be determined automatically
+    //! Enqueue data whose size can be determined automatically, e.g., an STL vector.
     template<class T>
     void                enqueue(const BlockID&  to,                                     //!< target block (gid,proc)
                                 const T&        x,                                      //!< data (eg. STL vector)
@@ -27,7 +27,7 @@ namespace diy
                                ) const
     { OutgoingQueues& out = *outgoing_; save(out[to], x); }
 
-    //! Enqueue an array of data whose size is given explicitly
+    //! Enqueue data whose size is given explicitly by the user, e.g., an array.
     template<class T>
     void                enqueue(const BlockID&  to,                                     //!< target block (gid,proc)
                                 const T*        x,                                      //!< pointer to the data (eg. address of start of vector)
@@ -35,8 +35,9 @@ namespace diy
                                 void (*save)(BinaryBuffer&, const T&) = &::diy::save<T> //!< optional serialization function
                                ) const;
 
-    //! Dequeue data whose size can be determined automatically.
-    //! Diy will allocate the receive buffer.
+    //! Dequeue data whose size can be determined automatically (e.g., STL vector) and that was
+    //! previously enqueued so that diy knows its size when it is received.
+    //! In this case, diy will allocate the receive buffer; the user does not need to do so.
     template<class T>
     void                dequeue(int             from,                                   //!< target block gid
                                 T&              x,                                      //!< data (eg. STL vector)
@@ -44,8 +45,8 @@ namespace diy
                                ) const
     { IncomingQueues& in  = *incoming_; load(in[from], x); }
 
-    //! Dequeue an array of data whose size is given explicitly.
-    //! The user needs to allocate the receive buffer.
+    //! Dequeue an array of data whose size is given explicitly by the user.
+    //! In this case, the user needs to allocate the receive buffer prior to calling dequeue.
     template<class T>
     void                dequeue(int             from,                                   //!< target block gid
                                 T*              x,                                      //!< pointer to the data (eg. address of start of vector)
@@ -65,17 +66,37 @@ namespace diy
     OutgoingQueues*     outgoing() const                                { return outgoing_; }
     MemoryBuffer&       outgoing(const BlockID& to) const               { return (*outgoing_)[to]; }
 
-    //! Post an all-reduce collective
+/**
+ * \ingroup Communication
+ * \brief Post an all-reduce collective using an existing communication proxy.
+ * Available operators are:
+ * maximum<T>, minimum<T>, std::plus<T>, std::multiplies<T>, std::logical_and<T>, and
+ * std::logical_or<T>.
+ */
     template<class T, class Op>
-    inline void         all_reduce(const T& in, Op op) const;
+    inline void         all_reduce(const T& in,                  //!< local value being reduced
+                                   Op op                         //!< operator
+                                   ) const;
+/**
+ * \ingroup Communication
+ * \brief Return the result of a proxy collective without popping it off the collectives list (same result would be returned multiple times). The list can be cleared with collectives()->clear().
+ */
     template<class T>
     inline T            read() const;
+/**
+ * \ingroup Communication
+ * \brief Return the result of a proxy collective; result is popped off the collectives list.
+ */
     template<class T>
     inline T            get() const;
 
     template<class T>
     inline void         scratch(const T& in) const;
 
+/**
+ * \ingroup Communication
+ * \brief Return the list of proxy collectives (values and operations)
+ */
     CollectivesList*    collectives() const                             { return collectives_; }
 
     Master*             master() const                                  { return master_; }
diff --git a/include/vtkdiy/reduce-operations.hpp b/include/vtkdiy/reduce-operations.hpp
index 30c4570a6ae2749dd1501e1ccaf10664a3d626e3..b1e45a58061823fe4b597c1dcab1178836ec7318 100644
--- a/include/vtkdiy/reduce-operations.hpp
+++ b/include/vtkdiy/reduce-operations.hpp
@@ -21,7 +21,8 @@ all_to_all(Master&              master,     //!< block owner
            int                  k = 2       //!< reduction fanout
           )
 {
-  RegularSwapPartners  partners(1, assigner.nblocks(), k, false);
+  RegularDecomposer<DiscreteBounds> decomposer(1, interval(0,assigner.nblocks()-1), assigner.nblocks());
+  RegularSwapPartners  partners(decomposer, k, false);
   reduce(master, assigner, partners, detail::AllToAllReduce<Op>(op, assigner), detail::SkipIntermediate(partners.rounds()));
 }
 
diff --git a/include/vtkdiy/reduce.hpp b/include/vtkdiy/reduce.hpp
index e7be78129b59946f44fe2e9c4fc5ced6f8395a73..4810942ff1c759bc2cb1f1a48d2e3fdebc76b3f8 100644
--- a/include/vtkdiy/reduce.hpp
+++ b/include/vtkdiy/reduce.hpp
@@ -104,11 +104,11 @@ namespace detail
  *
  */
 template<class Reduce, class Partners, class Skip>
-void reduce(Master&                    master,
-            const Assigner&            assigner,
-            const Partners&            partners,
-            const Reduce&              reduce,
-            const Skip&                skip)
+void reduce(Master&                    master,        //!< master object
+            const Assigner&            assigner,      //!< assigner object
+            const Partners&            partners,      //!< partners object
+            const Reduce&              reduce,        //!< reduction callback function
+            const Skip&                skip)          //!< object determining whether a block should be skipped
 {
   int original_expected = master.expected();
 
@@ -121,7 +121,7 @@ void reduce(Master&                    master,
     master.execute();
 
     int expected = 0;
-    for (int i = 0; i < master.size(); ++i)
+    for (unsigned i = 0; i < master.size(); ++i)
     {
       if (partners.active(round + 1, master.gid(i), master))
       {
@@ -142,11 +142,17 @@ void reduce(Master&                    master,
   master.set_expected(original_expected);
 }
 
+/**
+ * \ingroup Communication
+ * \brief Implementation of the reduce communication pattern (includes
+ *        swap-reduce, merge-reduce, and any other global communication).
+ *
+ */
 template<class Reduce, class Partners>
-void reduce(Master&                    master,
-            const Assigner&            assigner,
-            const Partners&            partners,
-            const Reduce&              reducer)
+void reduce(Master&                    master,        //!< master object
+            const Assigner&            assigner,      //!< assigner object
+            const Partners&            partners,      //!< partners object
+            const Reduce&              reducer)       //!< reduction callback function
 {
   reduce(master, assigner, partners, reducer, detail::ReduceNeverSkip());
 }
@@ -174,8 +180,8 @@ namespace detail
 
       // touch the outgoing queues to make sure they exist
       Master::OutgoingQueues& outgoing = *cp.outgoing();
-      if (outgoing.size() < rp.out_link().size())
-        for (unsigned j = 0; j < rp.out_link().size(); ++j)
+      if (outgoing.size() < (size_t) rp.out_link().size())
+        for (int j = 0; j < rp.out_link().size(); ++j)
           outgoing[rp.out_link().target(j)];       // touch the outgoing queue, creating it if necessary
     }
 
diff --git a/include/vtkdiy/serialization.hpp b/include/vtkdiy/serialization.hpp
index 4bb89fe15ec474ec5b710dc9714c7e18a5ede27f..6a1a3defae3a94fb8a605a02d03c640b389e5c3e 100644
--- a/include/vtkdiy/serialization.hpp
+++ b/include/vtkdiy/serialization.hpp
@@ -11,6 +11,7 @@
 #if __cplusplus > 199711L           // C++11
 #include <tuple>
 #include <unordered_map>
+#include <unordered_set>
 #include <type_traits>              // this is used for a safety check for default serialization
 #endif
 #endif
@@ -333,6 +334,33 @@ namespace diy
     }
   };
 
+  // save/load for std::unordered_set<T>
+  template<class T>
+  struct Serialization< std::unordered_set<T> >
+  {
+    typedef             std::unordered_set<T>           Set;
+
+    static void         save(BinaryBuffer& bb, const Set& m)
+    {
+      size_t s = m.size();
+      diy::save(bb, s);
+      for (auto& x : m)
+        diy::save(bb, x);
+    }
+
+    static void         load(BinaryBuffer& bb, Set& m)
+    {
+      size_t s;
+      diy::load(bb, s);
+      for (size_t i = 0; i < s; ++i)
+      {
+        T p;
+        diy::load(bb, p);
+        m.emplace(std::move(p));
+      }
+    }
+  };
+
   // save/load for std::tuple<...>
   // TODO: this ought to be default (copying) serialization
   //       if all arguments are default
diff --git a/include/vtkdiy/thread.hpp b/include/vtkdiy/thread.hpp
index 95377d29550f38ed8292c8e20c45ef737c2447f5..4208ccff74cc315fd540be3497cfe57f1039aeae 100644
--- a/include/vtkdiy/thread.hpp
+++ b/include/vtkdiy/thread.hpp
@@ -4,9 +4,30 @@
 #ifdef DIY_NO_THREADS
 #include "no-thread.hpp"
 #else
-#include "thread/tinythread.h"
+
 #include "thread/fast_mutex.h"
 
+#if __cplusplus > 199711L           // C++11
+#include <thread>
+#include <mutex>
+
+namespace diy
+{
+    using std::thread;
+    using std::mutex;
+    using std::recursive_mutex;
+    namespace this_thread = std::this_thread;
+
+    // TODO: replace with our own implementation using std::atomic_flag
+    using fast_mutex = tthread::fast_mutex;
+
+    template<class Mutex>
+    using lock_guard = std::unique_lock<Mutex>;
+}
+
+#else
+#include "thread/tinythread.h"
+
 namespace diy
 {
   using tthread::thread;
@@ -17,6 +38,7 @@ namespace diy
   namespace this_thread = tthread::this_thread;
 }
 #endif
+#endif
 
 #include "critical-resource.hpp"
 
diff --git a/include/vtkdiy/types.hpp b/include/vtkdiy/types.hpp
index 83db2d8fd676d3a0dbbc9ba4f82e8428f1edbe9e..7ce19d5a7f618696ecf4a03962e45fbc2ba8b379 100644
--- a/include/vtkdiy/types.hpp
+++ b/include/vtkdiy/types.hpp
@@ -12,7 +12,29 @@ namespace diy
   typedef   bb_d_t      DiscreteBounds;
   typedef   bb_c_t      ContinuousBounds;
 
-  typedef   dir_t       Direction;
+  //! Helper to create a 1-dimensional discrete domain with the specified extents
+  diy::DiscreteBounds
+  interval(int from, int to)            { DiscreteBounds domain; domain.min[0] = from; domain.max[0] = to; return domain; }
+
+  struct Direction: public dir_t
+  {
+            Direction()                 { for (int i = 0; i < DIY_MAX_DIM; ++i) x[i] = 0; }
+
+    int     operator[](int i) const     { return x[i]; }
+    int&    operator[](int i)           { return x[i]; }
+
+    // lexicographic comparison
+    bool
+    operator<(const diy::Direction& y) const
+    {
+      for (int i = 0; i < DIY_MAX_DIM; ++i)
+      {
+          if ((*this)[i] < y[i]) return true;
+          if ((*this)[i] > y[i]) return false;
+      }
+      return false;
+    }
+  };
 
   // Selector of bounds value type
   template<class Bounds_>