From 6891accba362e2e6f5dcc482429e23b98cbe68a0 Mon Sep 17 00:00:00 2001
From: Diy2 Upstream <kwrobot@kitware.com>
Date: Mon, 3 Feb 2025 13:16:17 -0500
Subject: [PATCH] diy2 2025-02-03 (e0d255f5)

Code extracted from:

    https://gitlab.kitware.com/third-party/diy2.git

at commit e0d255f55ac3505ea42f38424b42048f60389ac7 (for/vtk-20250203-0cde57f9b).
---
 README.md                                     |    8 +-
 include/vtkdiy2/algorithms.hpp                |  115 +-
 include/vtkdiy2/assigner.hpp                  |    4 +-
 include/vtkdiy2/collection.hpp                |   28 +-
 include/vtkdiy2/coroutine.hpp                 |   41 +
 include/vtkdiy2/coroutine/fiber.hpp           |   50 +
 include/vtkdiy2/coroutine/sjlj.hpp            |  129 +
 include/vtkdiy2/critical-resource.hpp         |    5 +
 include/vtkdiy2/decomposition.hpp             |   32 +-
 .../detail/algorithms/kdtree-sampling.hpp     |   14 +-
 include/vtkdiy2/detail/algorithms/kdtree.hpp  |   63 +-
 .../algorithms/load-balance-collective.hpp    |  125 +
 .../algorithms/load-balance-sampling.hpp      |  217 +
 .../detail/algorithms/load-balance.hpp        |   38 +
 include/vtkdiy2/detail/algorithms/sort.hpp    |   18 +-
 include/vtkdiy2/detail/block_traits.hpp       |    2 +-
 include/vtkdiy2/detail/master/collectives.hpp |    2 +-
 .../vtkdiy2/detail/master/communication.hpp   |   61 +-
 include/vtkdiy2/detail/master/execution.hpp   |    5 +-
 .../detail/master/foreach_exchange.hpp        |  105 +
 .../detail/master/iexchange-collective.hpp    |   16 +-
 .../vtkdiy2/detail/master/iexchange-dud.hpp   |  338 --
 include/vtkdiy2/detail/master/iexchange.hpp   |   43 +-
 include/vtkdiy2/detail/reduce/all-to-all.hpp  |   31 +-
 include/vtkdiy2/dynamic-point.hpp             |   16 +-
 include/vtkdiy2/factory.hpp                   |   20 +-
 include/vtkdiy2/fmt/compile.h                 |  466 --
 include/vtkdiy2/fmt/core.h                    | 1420 ------
 include/vtkdiy2/fmt/format-inl.h              | 1000 -----
 include/vtkdiy2/fmt/format.h                  | 3602 ---------------
 include/vtkdiy2/fmt/locale.h                  |   77 -
 include/vtkdiy2/grid.hpp                      |   23 +-
 include/vtkdiy2/io/block.hpp                  |    7 +-
 include/vtkdiy2/io/bov.hpp                    |  100 +-
 include/vtkdiy2/io/numpy.hpp                  |    6 +-
 include/vtkdiy2/io/shared.hpp                 |   36 +-
 include/vtkdiy2/io/utils.hpp                  |    2 +
 include/vtkdiy2/link.hpp                      |    2 +-
 include/vtkdiy2/log.hpp                       |    8 +-
 include/vtkdiy2/master.hpp                    |  806 ++--
 include/vtkdiy2/mpi.hpp                       |   61 +-
 include/vtkdiy2/mpi/collectives.cpp           |  161 +
 include/vtkdiy2/mpi/collectives.hpp           |  337 +-
 include/vtkdiy2/mpi/communicator.cpp          |  130 +
 include/vtkdiy2/mpi/communicator.hpp          |  178 +-
 include/vtkdiy2/mpi/config.hpp                |   85 +
 include/vtkdiy2/mpi/constants.hpp             |   13 -
 include/vtkdiy2/mpi/datatypes.cpp             |   34 +
 include/vtkdiy2/mpi/datatypes.hpp             |   85 +-
 include/vtkdiy2/mpi/diy-mpi-export.h          |   49 +
 include/vtkdiy2/mpi/environment.cpp           |   62 +
 include/vtkdiy2/mpi/environment.hpp           |   35 +
 include/vtkdiy2/mpi/io.cpp                    |  222 +
 include/vtkdiy2/mpi/io.hpp                    |  195 +-
 include/vtkdiy2/mpi/mpi_cast.hpp              |   39 +
 include/vtkdiy2/mpi/mpitypes.hpp.in           |   80 +
 include/vtkdiy2/mpi/no-mpi.hpp                |   44 +-
 include/vtkdiy2/mpi/operations.cpp            |   33 +
 include/vtkdiy2/mpi/operations.hpp            |   55 +-
 include/vtkdiy2/mpi/optional.hpp              |    7 +-
 include/vtkdiy2/mpi/point-to-point.cpp        |  106 +
 include/vtkdiy2/mpi/point-to-point.hpp        |  191 +-
 include/vtkdiy2/mpi/request.cpp               |   45 +
 include/vtkdiy2/mpi/request.hpp               |   53 +-
 include/vtkdiy2/mpi/status.cpp                |   30 +
 include/vtkdiy2/mpi/status.hpp                |   61 +-
 include/vtkdiy2/mpi/window.cpp                |  226 +
 include/vtkdiy2/mpi/window.hpp                |  248 +-
 include/vtkdiy2/no-thread.hpp                 |    7 +
 include/vtkdiy2/proxy.hpp                     |  159 +-
 include/vtkdiy2/reduce.hpp                    |   26 +-
 include/vtkdiy2/resolve.hpp                   |    6 +-
 include/vtkdiy2/serialization.hpp             |  132 +-
 include/vtkdiy2/stats.hpp                     |    8 +-
 include/vtkdiy2/storage.hpp                   |   16 +-
 include/vtkdiy2/{ => thirdparty}/fmt/chrono.h |  465 +-
 include/vtkdiy2/{ => thirdparty}/fmt/color.h  |  176 +-
 include/vtkdiy2/thirdparty/fmt/compile.h      |  701 +++
 include/vtkdiy2/thirdparty/fmt/core.h         | 2122 +++++++++
 include/vtkdiy2/thirdparty/fmt/format-inl.h   | 2801 ++++++++++++
 .../vtkdiy2/{ => thirdparty}/fmt/format.cc    |    2 +-
 include/vtkdiy2/thirdparty/fmt/format.h       | 3962 +++++++++++++++++
 include/vtkdiy2/thirdparty/fmt/locale.h       |   64 +
 .../{fmt/posix.h => thirdparty/fmt/os.h}      |  247 +-
 .../vtkdiy2/{ => thirdparty}/fmt/ostream.h    |   87 +-
 include/vtkdiy2/{ => thirdparty}/fmt/posix.cc |    2 +-
 include/vtkdiy2/thirdparty/fmt/posix.h        |    2 +
 include/vtkdiy2/{ => thirdparty}/fmt/printf.h |  240 +-
 include/vtkdiy2/{ => thirdparty}/fmt/ranges.h |  186 +-
 .../{ => thirdparty}/fmt/safe-duration-cast.h |    0
 .../{ => thirdparty}/itlib/small_vector.hpp   |    0
 .../{ => thirdparty}/thread/fast_mutex.h      |    0
 include/vtkdiy2/thread.hpp                    |   67 +-
 include/vtkdiy2/time.hpp                      |    6 +-
 include/vtkdiy2/types.hpp                     |   48 +-
 include/vtkdiy2/utils.hpp                     |   15 +
 96 files changed, 14566 insertions(+), 8927 deletions(-)
 create mode 100644 include/vtkdiy2/coroutine.hpp
 create mode 100644 include/vtkdiy2/coroutine/fiber.hpp
 create mode 100644 include/vtkdiy2/coroutine/sjlj.hpp
 create mode 100644 include/vtkdiy2/detail/algorithms/load-balance-collective.hpp
 create mode 100644 include/vtkdiy2/detail/algorithms/load-balance-sampling.hpp
 create mode 100644 include/vtkdiy2/detail/algorithms/load-balance.hpp
 create mode 100644 include/vtkdiy2/detail/master/foreach_exchange.hpp
 delete mode 100644 include/vtkdiy2/detail/master/iexchange-dud.hpp
 delete mode 100644 include/vtkdiy2/fmt/compile.h
 delete mode 100644 include/vtkdiy2/fmt/core.h
 delete mode 100644 include/vtkdiy2/fmt/format-inl.h
 delete mode 100644 include/vtkdiy2/fmt/format.h
 delete mode 100644 include/vtkdiy2/fmt/locale.h
 create mode 100644 include/vtkdiy2/mpi/collectives.cpp
 create mode 100644 include/vtkdiy2/mpi/communicator.cpp
 create mode 100644 include/vtkdiy2/mpi/config.hpp
 delete mode 100644 include/vtkdiy2/mpi/constants.hpp
 create mode 100644 include/vtkdiy2/mpi/datatypes.cpp
 create mode 100644 include/vtkdiy2/mpi/diy-mpi-export.h
 create mode 100644 include/vtkdiy2/mpi/environment.cpp
 create mode 100644 include/vtkdiy2/mpi/environment.hpp
 create mode 100644 include/vtkdiy2/mpi/io.cpp
 create mode 100644 include/vtkdiy2/mpi/mpi_cast.hpp
 create mode 100644 include/vtkdiy2/mpi/mpitypes.hpp.in
 create mode 100644 include/vtkdiy2/mpi/operations.cpp
 create mode 100644 include/vtkdiy2/mpi/point-to-point.cpp
 create mode 100644 include/vtkdiy2/mpi/request.cpp
 create mode 100644 include/vtkdiy2/mpi/status.cpp
 create mode 100644 include/vtkdiy2/mpi/window.cpp
 rename include/vtkdiy2/{ => thirdparty}/fmt/chrono.h (60%)
 rename include/vtkdiy2/{ => thirdparty}/fmt/color.h (81%)
 create mode 100644 include/vtkdiy2/thirdparty/fmt/compile.h
 create mode 100644 include/vtkdiy2/thirdparty/fmt/core.h
 create mode 100644 include/vtkdiy2/thirdparty/fmt/format-inl.h
 rename include/vtkdiy2/{ => thirdparty}/fmt/format.cc (98%)
 create mode 100644 include/vtkdiy2/thirdparty/fmt/format.h
 create mode 100644 include/vtkdiy2/thirdparty/fmt/locale.h
 rename include/vtkdiy2/{fmt/posix.h => thirdparty/fmt/os.h} (55%)
 rename include/vtkdiy2/{ => thirdparty}/fmt/ostream.h (55%)
 rename include/vtkdiy2/{ => thirdparty}/fmt/posix.cc (99%)
 create mode 100644 include/vtkdiy2/thirdparty/fmt/posix.h
 rename include/vtkdiy2/{ => thirdparty}/fmt/printf.h (71%)
 rename include/vtkdiy2/{ => thirdparty}/fmt/ranges.h (58%)
 rename include/vtkdiy2/{ => thirdparty}/fmt/safe-duration-cast.h (100%)
 rename include/vtkdiy2/{ => thirdparty}/itlib/small_vector.hpp (100%)
 rename include/vtkdiy2/{ => thirdparty}/thread/fast_mutex.h (100%)
 create mode 100644 include/vtkdiy2/utils.hpp

diff --git a/README.md b/README.md
index a2ab00c944e..5fe20581168 100644
--- a/README.md
+++ b/README.md
@@ -28,9 +28,15 @@ DIY is a header-only library. It does not need to be built; you can simply
 include it in your project. The examples can be built using `cmake` from the
 top-level directory.
 
+## Development
+
+Development happens in the [DIY repo](https://gitlab.kitware.com/diatomic/diy)
+on Kitware's GitLab. Please submit merge requests there. Issues should
+be submitted in the GitHub repo.
+
 ## Documentation
 
-[Doxygen pages](https://diatomic.github.io/diy)
+[DIY project](https://diatomic.github.io/diy)
 
 ## Example
 
diff --git a/include/vtkdiy2/algorithms.hpp b/include/vtkdiy2/algorithms.hpp
index 13a9b0ee737..588a17835f3 100644
--- a/include/vtkdiy2/algorithms.hpp
+++ b/include/vtkdiy2/algorithms.hpp
@@ -8,10 +8,13 @@
 #include "reduce.hpp"
 #include "reduce-operations.hpp"
 #include "partners/swap.hpp"
+#include "resolve.hpp"
 
 #include "detail/algorithms/sort.hpp"
 #include "detail/algorithms/kdtree.hpp"
 #include "detail/algorithms/kdtree-sampling.hpp"
+#include "detail/algorithms/load-balance-collective.hpp"
+#include "detail/algorithms/load-balance-sampling.hpp"
 
 #include "log.hpp"
 
@@ -86,7 +89,7 @@ namespace diy
 
         typedef     diy::RegularContinuousLink      RCLink;
 
-        for (size_t i = 0; i < master.size(); ++i)
+        for (int i = 0; i < static_cast<int>(master.size()); ++i)
         {
             RCLink* link   = static_cast<RCLink*>(master.link(i));
             *link = RCLink(dim, domain, domain);
@@ -122,7 +125,7 @@ namespace diy
 
         // update master.expected to match the links
         int expected = 0;
-        for (size_t i = 0; i < master.size(); ++i)
+        for (int i = 0; i < static_cast<int>(master.size()); ++i)
             expected += master.link(i)->size_unique();
         master.set_expected(expected);
     }
@@ -146,7 +149,7 @@ namespace diy
 
         typedef     diy::RegularContinuousLink      RCLink;
 
-        for (size_t i = 0; i < master.size(); ++i)
+        for (int i = 0; i < static_cast<int>(master.size()); ++i)
         {
             RCLink* link   = static_cast<RCLink*>(master.link(i));
             *link = RCLink(dim, domain, domain);
@@ -182,10 +185,114 @@ namespace diy
 
         // update master.expected to match the links
         int expected = 0;
-        for (size_t i = 0; i < master.size(); ++i)
+        for (int i = 0; i < static_cast<int>(master.size()); ++i)
             expected += master.link(i)->size_unique();
         master.set_expected(expected);
     }
+
+    template<class B>
+    using LBCallback = std::function<Work(B*, int)>;
+
+    // load balancing using collective method
+    template<class Callback>
+    void load_balance_collective(
+            diy::Master&                master,             // diy master
+            diy::DynamicAssigner&       dynamic_assigner,   // diy dynamic assigner
+            const Callback&             f)                  // callback to get work for a block
+    {
+        // assert that master.destroyer() exists, will be needed for moving blocks
+        if (!master.destroyer())
+        {
+            fmt::print(stderr, "DIY error: Master must have a block destroyer function in order to use load balancing. Please define one.\n");
+            abort();
+        }
+
+        using Block = typename detail::block_traits<Callback>::type;
+        const LBCallback<Block>& f_ = f;
+
+        // compile my work info
+        diy::detail::WorkInfo my_work_info = { master.communicator().rank(), -1, 0, 0, static_cast<int>(master.size()) };
+        for (auto i = 0; i < master.size(); i++)
+        {
+            Block* block = static_cast<Block*>(master.block(i));
+            Work w = f_(block, master.gid(i));
+            my_work_info.proc_work += w;
+            if (my_work_info.top_gid == -1 || my_work_info.top_work < w)
+            {
+                my_work_info.top_gid    = master.gid(i);
+                my_work_info.top_work   = w;
+            }
+        }
+
+        // exchange info about load balance
+        std::vector<diy::detail::WorkInfo>   all_work_info; // work info collected from all mpi processes
+        diy::detail::exchange_work_info(master, my_work_info, all_work_info);
+
+        // decide what to move where
+        std::vector<diy::detail::MoveInfo>   all_move_info; // move info for all moves
+        diy::detail::decide_move_info(all_work_info, all_move_info);
+
+        // move blocks from src to dst proc
+        for (auto i = 0; i < all_move_info.size(); i++)
+            diy::detail::move_block(master, all_move_info[i]);
+
+        // fix links
+        diy::fix_links(master, dynamic_assigner);
+    }
+
+    // load balancing using sampling method
+    template<class Callback>
+    void load_balance_sampling(
+            diy::Master&                master,
+            diy::DynamicAssigner&       dynamic_assigner,   // diy dynamic assigner
+            const Callback&             f,                  // callback to get work for a block
+            float                       sample_frac = 0.5,  // fraction of procs to sample 0.0 < sample_size <= 1.0
+            float                       quantile    = 0.8)  // quantile cutoff above which to move blocks (0.0 - 1.0)
+    {
+        // assert that master.destroyer() exists, will be needed for moving blocks
+        if (!master.destroyer())
+        {
+            fmt::print(stderr, "DIY error: Master must have a block destroyer function in order to use load balancing. Please define one.\n");
+            abort();
+        }
+
+        using Block = typename detail::block_traits<Callback>::type;
+        const LBCallback<Block>& f_ = f;
+
+        // compile my work info
+        diy::detail::WorkInfo my_work_info = { master.communicator().rank(), -1, 0, 0, static_cast<int>(master.size()) };
+        for (auto i = 0; i < master.size(); i++)
+        {
+            Block* block = static_cast<Block*>(master.block(i));
+            Work w = f_(block, master.gid(i));
+            my_work_info.proc_work += w;
+            if (my_work_info.top_gid == -1 || my_work_info.top_work < w)
+            {
+                my_work_info.top_gid    = master.gid(i);
+                my_work_info.top_work   = w;
+            }
+        }
+
+        // "auxiliary" master and decomposer for using rexchange for load balancing, 1 block per process
+        diy::Master                     aux_master(master.communicator(), 1, -1, &diy::detail::AuxBlock::create, &diy::detail::AuxBlock::destroy);
+        diy::ContiguousAssigner         aux_assigner(aux_master.communicator().size(), aux_master.communicator().size());
+        diy::DiscreteBounds aux_domain(1);                               // any fake domain
+        aux_domain.min[0] = 0;
+        aux_domain.max[0] = aux_master.communicator().size() + 1;
+        diy::RegularDecomposer<diy::DiscreteBounds>  aux_decomposer(1, aux_domain, aux_master.communicator().size());
+        aux_decomposer.decompose(aux_master.communicator().rank(), aux_assigner, aux_master);
+
+        // exchange info about load balance
+        std::vector<diy::detail::WorkInfo>   sample_work_info;           // work info collecting from sampling other mpi processes
+        diy::detail::exchange_sample_work_info(master, aux_master, sample_frac, my_work_info, sample_work_info);
+
+        // move blocks
+        diy::detail::move_sample_blocks(master, aux_master, sample_work_info, my_work_info, quantile);
+
+        // fix links
+        diy::fix_links(master, dynamic_assigner);
+    }
+
 }
 
 #endif
diff --git a/include/vtkdiy2/assigner.hpp b/include/vtkdiy2/assigner.hpp
index 955a7a6f2f4..d117919bae5 100644
--- a/include/vtkdiy2/assigner.hpp
+++ b/include/vtkdiy2/assigner.hpp
@@ -109,7 +109,7 @@ namespace diy
                       Assigner(size__, nblocks__),
                       comm_(comm),
                       div_(nblocks__ / size__ + ((nblocks__ % size__) == 0 ? 0 : 1)),   // NB: same size window everywhere means the last rank may allocate extra space
-                      rank_map_(comm_, div_)                                            { rank_map_.lock_all(MPI_MODE_NOCHECK); }
+                      rank_map_(comm_, div_)                                            { rank_map_.lock_all(mpi::nocheck); }
                     ~DynamicAssigner()                                                  { rank_map_.unlock_all(); }
 
       inline
@@ -189,7 +189,7 @@ set_nblocks(int nblocks__)
 
     rank_map_.unlock_all();
     rank_map_ = mpi::window<int>(comm_, div_);
-    rank_map_.lock_all(MPI_MODE_NOCHECK);
+    rank_map_.lock_all(mpi::nocheck);
 }
 
 std::tuple<bool,int>
diff --git a/include/vtkdiy2/collection.hpp b/include/vtkdiy2/collection.hpp
index c327d7ab509..61390f560f4 100644
--- a/include/vtkdiy2/collection.hpp
+++ b/include/vtkdiy2/collection.hpp
@@ -17,10 +17,10 @@ namespace diy
       typedef       std::vector<Element>                        Elements;
       typedef       critical_resource<int, recursive_mutex>     CInt;
 
-      typedef       void* (*Create)();
-      typedef       void  (*Destroy)(void*);
-      typedef       detail::Save                                Save;
-      typedef       detail::Load                                Load;
+      using Create  = std::function<void*()>;
+      using Destroy = std::function<void(void*)>;
+      using Save    = detail::Save;
+      using Load    = detail::Load;
 
     public:
                     Collection(Create               create__,
@@ -40,9 +40,10 @@ namespace diy
       inline void   clear();
 
       int           add(Element e)                  { elements_.push_back(e); external_.push_back(-1); ++(*in_memory_.access()); return static_cast<int>(elements_.size()) - 1; }
-      void*         release(int i)                  { void* e = get(i); elements_[static_cast<size_t>(i)] = 0; return e; }
+      inline void*  release(int i);
 
       void*         find(int i) const               { return elements_[static_cast<size_t>(i)]; }                        // possibly returns 0, if the element is unloaded
+      void* const&  reference(int i) const          { return elements_[static_cast<size_t>(i)]; }
       void*         get(int i)                      { if (!find(i)) load(i); return find(i); }      // loads the element first, and then returns its address
 
       int           available() const               { int i = 0; for (; i < (int)size(); ++i) if (find(i) != 0) break; return i; }
@@ -87,6 +88,23 @@ clear()
   *in_memory_.access() = 0;
 }
 
+void*
+diy::Collection::
+release(int i)
+{
+  void* e = get(i);
+
+  elements_[static_cast<size_t>(i)] = 0;
+  std::swap(elements_[static_cast<size_t>(i)], elements_.back());
+  elements_.pop_back();
+
+  std::swap(external_[static_cast<size_t>(i)], external_.back());
+  external_.pop_back();
+  --(*in_memory_.access());
+
+  return e;
+}
+
 void
 diy::Collection::
 unload(int i)
diff --git a/include/vtkdiy2/coroutine.hpp b/include/vtkdiy2/coroutine.hpp
new file mode 100644
index 00000000000..aecb57ec6a1
--- /dev/null
+++ b/include/vtkdiy2/coroutine.hpp
@@ -0,0 +1,41 @@
+#ifndef DIY_COROUTINE_HPP
+#define DIY_COROUTINE_HPP
+
+/*
+ * Derived from libco v20 (2019-10-16)
+ * author: byuu (https://byuu.org/)
+ * license: ISC
+ */
+
+namespace diy
+{
+
+namespace coroutine
+{
+
+using cothread_t = void*;
+
+
+inline cothread_t  co_active();
+inline cothread_t  co_create(unsigned int, void (*)(void));
+inline void        co_delete(cothread_t);
+inline void        co_switch(cothread_t);
+
+// "global variable" to pass an argument
+inline void*&      argument()
+{
+    static thread_local void* x;
+    return x;
+}
+
+}
+
+}
+
+#if defined(_WIN32)
+#include "coroutine/fiber.hpp"
+#else
+#include "coroutine/sjlj.hpp"
+#endif
+
+#endif
diff --git a/include/vtkdiy2/coroutine/fiber.hpp b/include/vtkdiy2/coroutine/fiber.hpp
new file mode 100644
index 00000000000..c83604cbbaa
--- /dev/null
+++ b/include/vtkdiy2/coroutine/fiber.hpp
@@ -0,0 +1,50 @@
+#include <windows.h>
+
+namespace diy
+{
+
+namespace coroutine
+{
+
+/**********************
+ * "Global varialbes" *
+ **********************/
+inline cothread_t& co_active_()
+{
+    static thread_local cothread_t x = 0;
+    return x;
+}
+/**********************/
+
+static void __stdcall co_thunk(void* coentry) {
+  ((void (*)(void))coentry)();
+}
+
+cothread_t co_active() {
+  if(!co_active_()) {
+    ConvertThreadToFiber(0);
+    co_active_() = GetCurrentFiber();
+  }
+  return co_active_();
+}
+
+cothread_t co_create(unsigned int heapsize, void (*coentry)(void)) {
+  if(!co_active_()) {
+    ConvertThreadToFiber(0);
+    co_active_() = GetCurrentFiber();
+  }
+  return (cothread_t)CreateFiber(heapsize, co_thunk, (void*)coentry);
+}
+
+void co_delete(cothread_t cothread) {
+  DeleteFiber(cothread);
+}
+
+void co_switch(cothread_t cothread) {
+  co_active_() = cothread;
+  SwitchToFiber(cothread);
+}
+
+}
+
+}
diff --git a/include/vtkdiy2/coroutine/sjlj.hpp b/include/vtkdiy2/coroutine/sjlj.hpp
new file mode 100644
index 00000000000..3c116561aa6
--- /dev/null
+++ b/include/vtkdiy2/coroutine/sjlj.hpp
@@ -0,0 +1,129 @@
+/*
+  note this was designed for UNIX systems. Based on ideas expressed in a paper by Ralf Engelschall.
+  for SJLJ on other systems, one would want to rewrite springboard() and co_create() and hack the jmb_buf stack pointer.
+*/
+
+#if __USE_FORTIFY_LEVEL
+#define DIY_USE_FORTIFY_LEVEL __USE_FORTIFY_LEVEL
+#undef __USE_FORTIFY_LEVEL
+#warning "diy::coroutine (sjlj.hpp) cannot be compiled with _FORTIFY_SOURCE; disabling."
+#endif
+
+
+//#define _BSD_SOURCE
+//#define _XOPEN_SOURCE 500
+#include <stdlib.h>
+#include <signal.h>
+#include <setjmp.h>
+
+namespace diy
+{
+
+namespace coroutine
+{
+
+struct cothread_struct
+{
+  sigjmp_buf context;
+  void (*coentry)(void);
+  void* stack;
+};
+
+/**********************
+ * "Global varialbes" *
+ **********************/
+inline cothread_struct& co_primary()
+{
+    static thread_local cothread_struct x;
+    return x;
+}
+
+inline cothread_struct*& creating()
+{
+    static thread_local cothread_struct* x;
+    return x;
+}
+
+inline cothread_struct*& co_running()
+{
+    static thread_local cothread_struct* x = 0;
+    return x;
+}
+/**********************/
+
+static void springboard(int) {
+  if(sigsetjmp(creating()->context, 0)) {
+    co_running()->coentry();
+  }
+}
+
+cothread_t co_active() {
+  if(!co_running()) co_running() = &co_primary();
+  return (cothread_t)co_running();
+}
+
+cothread_t co_create(unsigned int size, void (*coentry)(void)) {
+  if(!co_running()) co_running() = &co_primary();
+
+  cothread_struct* thread = (cothread_struct*)malloc(sizeof(cothread_struct));
+  if(thread) {
+    struct sigaction handler;
+    struct sigaction old_handler;
+
+    stack_t stack;
+    stack_t old_stack;
+
+    thread->coentry = 0;
+    thread->stack = 0;
+
+    stack.ss_flags = 0;
+    stack.ss_size = size;
+    thread->stack = stack.ss_sp = malloc(size);
+    if(stack.ss_sp && !sigaltstack(&stack, &old_stack)) {
+      handler.sa_handler = springboard;
+      handler.sa_flags = SA_ONSTACK;
+      sigemptyset(&handler.sa_mask);
+      creating() = thread;
+
+      if(!sigaction(SIGUSR1, &handler, &old_handler)) {
+        if(!raise(SIGUSR1)) {
+          thread->coentry = coentry;
+        }
+        sigaltstack(&old_stack, 0);
+        sigaction(SIGUSR1, &old_handler, 0);
+      }
+    }
+
+    if(thread->coentry != coentry) {
+      co_delete(thread);
+      thread = 0;
+    }
+  }
+
+  return (cothread_t)thread;
+}
+
+void co_delete(cothread_t cothread) {
+  if(cothread) {
+    if(((cothread_struct*)cothread)->stack) {
+      free(((cothread_struct*)cothread)->stack);
+    }
+    free(cothread);
+  }
+}
+
+void co_switch(cothread_t cothread) {
+  if(!sigsetjmp(co_running()->context, 0)) {
+    co_running() = (cothread_struct*)cothread;
+    siglongjmp(co_running()->context, 1);
+  }
+}
+
+}
+
+}
+
+#if DIY_USE_FORTIFY_LEVEL
+#define __USE_FORTIFY_LEVEL DIY_USE_FORTIFY_LEVEL
+#undef DIY_USE_FORTIFY_LEVEL
+#endif
diff --git a/include/vtkdiy2/critical-resource.hpp b/include/vtkdiy2/critical-resource.hpp
index 45c5ccadc4a..e6dd5047e13 100644
--- a/include/vtkdiy2/critical-resource.hpp
+++ b/include/vtkdiy2/critical-resource.hpp
@@ -17,6 +17,9 @@ namespace diy
       const T&  operator*() const                           { return x_; }
       const T*  operator->() const                          { return &x_; }
 
+      void      lock()                                      { lock_.lock(); }
+      void      unlock()                                    { lock_.unlock(); }
+
     private:
       T&                        x_;
       lock_guard<Mutex>         lock_;
@@ -33,6 +36,8 @@ namespace diy
                         critical_resource()                 {}
                         critical_resource(const T& x):
                             x_(x)                           {}
+                        critical_resource(T&& x):
+                            x_(std::move(x))                {}
 
       accessor          access()                            { return accessor(x_, m_); }
       const_accessor    const_access() const                { return const_accessor(x_, m_); }
diff --git a/include/vtkdiy2/decomposition.hpp b/include/vtkdiy2/decomposition.hpp
index e1d624ca959..12f86437c5c 100644
--- a/include/vtkdiy2/decomposition.hpp
+++ b/include/vtkdiy2/decomposition.hpp
@@ -63,8 +63,8 @@ namespace detail
     static Coordinate   from(int i, int n, Coordinate min, Coordinate max, bool)      { return min + (max - min)/n * i; }
     static Coordinate   to  (int i, int n, Coordinate min, Coordinate max, bool)      { return min + (max - min)/n * (i+1); }
 
-    static int          lower(Coordinate x, int n, Coordinate min, Coordinate max, bool)   { Coordinate width = (max - min)/n; Coordinate res = std::floor((x - min)/width); if (min + res*width == x) return (res - 1); else return res; }
-    static int          upper(Coordinate x, int n, Coordinate min, Coordinate max, bool)   { Coordinate width = (max - min)/n; Coordinate res = std::ceil ((x - min)/width); if (min + res*width == x) return (res + 1); else return res; }
+    static int          lower(Coordinate x, int n, Coordinate min, Coordinate max, bool)   { Coordinate width = (max - min)/n; auto res = static_cast<int>(std::floor((x - min)/width)); if (min + res*width == x) return (res - 1); else return res; }
+    static int          upper(Coordinate x, int n, Coordinate min, Coordinate max, bool)   { Coordinate width = (max - min)/n; auto res = static_cast<int>(std::ceil ((x - min)/width)); if (min + res*width == x) return (res + 1); else return res; }
   };
 }
 
@@ -123,6 +123,7 @@ namespace detail
     template<class Point>
     int             lowest_gid(const Point& p) const;
 
+    DivisionsVector gid_to_coords(int gid) const                                { DivisionsVector coords; gid_to_coords(gid, coords); return coords; }
     void            gid_to_coords(int gid, DivisionsVector& coords) const       { gid_to_coords(gid, coords, divisions); }
     int             coords_to_gid(const DivisionsVector& coords) const          { return coords_to_gid(coords, divisions); }
     void            fill_divisions(std::vector<int>& divisions) const;
@@ -131,8 +132,8 @@ namespace detail
     void            fill_bounds(Bounds& bounds, int gid, bool add_ghosts = false) const;
 
     static bool     all(const std::vector<int>& v, int x);
-    static void     gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions);
-    static int      coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divisions);
+    static void     gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divs);
+    static int      coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divs);
 
     static void     factor(std::vector<unsigned>& factors, int n);
 
@@ -409,25 +410,25 @@ all(const std::vector<int>& v, int x)
 template<class Bounds>
 void
 diy::RegularDecomposer<Bounds>::
-gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divisions)
+gid_to_coords(int gid, DivisionsVector& coords, const DivisionsVector& divs)
 {
-  int dim = static_cast<int>(divisions.size());
-  for (int i = 0; i < dim; ++i)
+  coords.clear();
+  for (int i = 0; i < static_cast<int>(divs.size()); ++i)
   {
-    coords.push_back(gid % divisions[i]);
-    gid /= divisions[i];
+    coords.push_back(gid % divs[i]);
+    gid /= divs[i];
   }
 }
 
 template<class Bounds>
 int
 diy::RegularDecomposer<Bounds>::
-coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divisions)
+coords_to_gid(const DivisionsVector& coords, const DivisionsVector& divs)
 {
   int gid = 0;
   for (int i = static_cast<int>(coords.size()) - 1; i >= 0; --i)
   {
-    gid *= divisions[i];
+    gid *= divs[i];
     gid += coords[i];
   }
   return gid;
@@ -552,8 +553,7 @@ fill_divisions(std::vector<int>& divisions_) const
     }
 
     // iterate over factorization of number of blocks (factors are sorted smallest to largest)
-    // NB: using int instead of size_t because must be negative in order to break out of loop
-    for (int i = factors.size() - 1; i >= 0; --i)
+    for (auto f = factors.rbegin(); f != factors.rend(); ++f)
     {
         // fill in missing divs by dividing dimension w/ largest block size
         // except when this would be illegal (resulting in bounds.max < bounds.min;
@@ -565,19 +565,19 @@ fill_divisions(std::vector<int>& divisions_) const
         // split the dimension with the largest block size (first element in vector)
         Coordinate min =
             detail::BoundsHelper<Bounds>::from(0,
-                                               missing_divs[0].nb * factors[i],
+                                               missing_divs[0].nb * (*f),
                                                domain.min[missing_divs[0].dim],
                                                domain.max[missing_divs[0].dim],
                                                share_face[missing_divs[0].dim]);
         Coordinate max =
             detail::BoundsHelper<Bounds>::to(0,
-                                             missing_divs[0].nb * factors[i],
+                                             missing_divs[0].nb * (*f),
                                              domain.min[missing_divs[0].dim],
                                              domain.max[missing_divs[0].dim],
                                              share_face[missing_divs[0].dim]);
         if (max >= min)
         {
-            missing_divs[0].nb    *= factors[i];
+            missing_divs[0].nb    *= (*f);
             missing_divs[0].b_size = max - min;
         }
         else
diff --git a/include/vtkdiy2/detail/algorithms/kdtree-sampling.hpp b/include/vtkdiy2/detail/algorithms/kdtree-sampling.hpp
index 2a99ced5175..6b3cf382376 100644
--- a/include/vtkdiy2/detail/algorithms/kdtree-sampling.hpp
+++ b/include/vtkdiy2/detail/algorithms/kdtree-sampling.hpp
@@ -74,7 +74,7 @@ operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners
         dim = partners.dim(srp.round() - 1);
 
     if (srp.round() == partners.rounds())
-        update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
+        update_links(b, srp, dim, partners.sub_round((int)srp.round() - 2), (int)partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
     else if (partners.swap_round(srp.round()) && partners.sub_round(srp.round()) < 0)       // link round
     {
         dequeue_exchange(b, srp, dim);         // from the swap round
@@ -92,7 +92,7 @@ operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners
             int prev_dim = dim - 1;
             if (prev_dim < 0)
                 prev_dim += dim_;
-            update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain);    // -1 would be the "uninformative" link round
+            update_links(b, srp, prev_dim, partners.sub_round((int)srp.round() - 2), (int)partners.swap_rounds(), partners.wrap, partners.domain);    // -1 would be the "uninformative" link round
         }
 
         compute_local_samples(b, srp, dim);
@@ -134,7 +134,7 @@ divide_gid(int gid, bool lower, int round, int rounds) const
 template<class Block, class Point>
 void
 diy::detail::KDTreeSamplingPartition<Block,Point>::
-update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
+update_links(Block*, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
 {
     auto        log  = get_logger();
     int         gid  = srp.gid();
@@ -253,7 +253,7 @@ update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int roun
 template<class Block, class Point>
 void
 diy::detail::KDTreeSamplingPartition<Block,Point>::
-split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
+split_to_neighbors(Block*, const diy::ReduceProxy& srp, int) const
 {
     int         lid  = srp.master()->lid(srp.gid());
     RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
@@ -290,7 +290,7 @@ compute_local_samples(Block* b, const diy::ReduceProxy& srp, int dim) const
 template<class Block, class Point>
 void
 diy::detail::KDTreeSamplingPartition<Block,Point>::
-add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
+add_samples(Block*, const diy::ReduceProxy& srp, Samples& samples) const
 {
     // dequeue and combine the samples
     for (int i = 0; i < srp.in_link().size(); ++i)
@@ -307,7 +307,7 @@ add_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
 template<class Block, class Point>
 void
 diy::detail::KDTreeSamplingPartition<Block,Point>::
-receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
+receive_samples(Block*, const diy::ReduceProxy& srp, Samples& samples) const
 {
     srp.dequeue(srp.in_link().target(0).gid, samples);
 }
@@ -315,7 +315,7 @@ receive_samples(Block* b, const diy::ReduceProxy& srp, Samples& samples) const
 template<class Block, class Point>
 void
 diy::detail::KDTreeSamplingPartition<Block,Point>::
-forward_samples(Block* b, const diy::ReduceProxy& srp, const Samples& samples) const
+forward_samples(Block*, const diy::ReduceProxy& srp, const Samples& samples) const
 {
     for (int i = 0; i < srp.out_link().size(); ++i)
         srp.enqueue(srp.out_link().target(i), samples);
diff --git a/include/vtkdiy2/detail/algorithms/kdtree.hpp b/include/vtkdiy2/detail/algorithms/kdtree.hpp
index 2a46c400100..3754c868cb9 100644
--- a/include/vtkdiy2/detail/algorithms/kdtree.hpp
+++ b/include/vtkdiy2/detail/algorithms/kdtree.hpp
@@ -68,10 +68,10 @@ struct diy::detail::KDTreePartners
                         wrap(wrap_),
                         domain(domain_)
   {
-    for (unsigned i = 0; i < swap.rounds(); ++i)
+    for (int i = 0; i < swap.rounds(); ++i)
     {
       // fill histogram rounds
-      for (unsigned j = 0; j < histogram.rounds(); ++j)
+      for (int j = 0; j < histogram.rounds(); ++j)
       {
         rounds_.push_back(std::make_pair(false, j));
         dim_.push_back(i % dim);
@@ -115,7 +115,7 @@ struct diy::detail::KDTreePartners
     else if (swap_round(round) && sub_round(round) < 0)       // link round
         swap.incoming(sub_round(round - 1) + 1, gid, partners, m);
     else if (swap_round(round))
-        histogram.incoming(histogram.rounds(), gid, partners, m);
+        histogram.incoming(static_cast<int>(histogram.rounds()), gid, partners, m);
     else
     {
         if (round > 0 && sub_round(round) == 0)
@@ -177,7 +177,7 @@ operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners
         dim = partners.dim(srp.round() - 1);
 
     if (srp.round() == partners.rounds())
-        update_links(b, srp, dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
+        update_links(b, srp, dim, partners.sub_round((int)srp.round() - 2), (int)partners.swap_rounds(), partners.wrap, partners.domain); // -1 would be the "uninformative" link round
     else if (partners.swap_round(srp.round()) && partners.sub_round(srp.round()) < 0)       // link round
     {
         dequeue_exchange(b, srp, dim);         // from the swap round
@@ -195,7 +195,7 @@ operator()(Block* b, const diy::ReduceProxy& srp, const KDTreePartners& partners
             int prev_dim = dim - 1;
             if (prev_dim < 0)
                 prev_dim += dim_;
-            update_links(b, srp, prev_dim, partners.sub_round(srp.round() - 2), partners.swap_rounds(), partners.wrap, partners.domain);    // -1 would be the "uninformative" link round
+            update_links(b, srp, prev_dim, partners.sub_round((int)srp.round() - 2), (int)partners.swap_rounds(), partners.wrap, partners.domain);    // -1 would be the "uninformative" link round
         }
 
         compute_local_histogram(b, srp, dim);
@@ -229,7 +229,7 @@ divide_gid(int gid, bool lower, int round, int rounds) const
 template<class Block, class Point>
 void
 diy::detail::KDTreePartition<Block,Point>::
-update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
+update_links(Block*, const diy::ReduceProxy& srp, int dim, int round, int rounds, bool wrap, const Bounds& domain) const
 {
     int         gid  = srp.gid();
     int         lid  = srp.master()->lid(gid);
@@ -346,7 +346,7 @@ update_links(Block* b, const diy::ReduceProxy& srp, int dim, int round, int roun
 template<class Block, class Point>
 void
 diy::detail::KDTreePartition<Block,Point>::
-split_to_neighbors(Block* b, const diy::ReduceProxy& srp, int dim) const
+split_to_neighbors(Block*, const diy::ReduceProxy& srp, int) const
 {
     int         lid  = srp.master()->lid(srp.gid());
     RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
@@ -366,20 +366,23 @@ void
 diy::detail::KDTreePartition<Block,Point>::
 compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const
 {
+    auto udim = static_cast<unsigned>(dim);
     int         lid  = srp.master()->lid(srp.gid());
     RCLink*     link = static_cast<RCLink*>(srp.master()->link(lid));
 
     // compute and enqueue local histogram
     Histogram histogram(bins_);
 
-    float   width = (link->core().max[dim] - link->core().min[dim])/bins_;
+    float   width = (link->core().max[udim] - link->core().min[udim])/bins_;
     for (size_t i = 0; i < (b->*points_).size(); ++i)
     {
-        float x = (b->*points_)[i][dim];
-        int loc = (x - link->core().min[dim]) / width;
-        if (loc < 0)
-            throw std::runtime_error(fmt::format("{} {} {}", loc, x, link->core().min[dim]));
-        if (loc >= (int) bins_)
+        float x = (b->*points_)[i][udim];
+        float floc = (x - link->core().min[udim]) / width;
+        if (floc < 0)
+            throw std::runtime_error(fmt::format("{} {} {}", floc, x, link->core().min[udim]));
+
+        auto loc = static_cast<size_t>(floc);
+        if (loc >= bins_)
             loc = bins_ - 1;
         ++(histogram[loc]);
     }
@@ -390,7 +393,7 @@ compute_local_histogram(Block* b, const diy::ReduceProxy& srp, int dim) const
 template<class Block, class Point>
 void
 diy::detail::KDTreePartition<Block,Point>::
-add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
+add_histogram(Block*, const diy::ReduceProxy& srp, Histogram& histogram) const
 {
     // dequeue and add up the histograms
     for (int i = 0; i < srp.in_link().size(); ++i)
@@ -407,7 +410,7 @@ add_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
 template<class Block, class Point>
 void
 diy::detail::KDTreePartition<Block,Point>::
-receive_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) const
+receive_histogram(Block*, const diy::ReduceProxy& srp, Histogram& histogram) const
 {
     srp.dequeue(srp.in_link().target(0).gid, histogram);
 }
@@ -415,7 +418,7 @@ receive_histogram(Block* b, const diy::ReduceProxy& srp, Histogram& histogram) c
 template<class Block, class Point>
 void
 diy::detail::KDTreePartition<Block,Point>::
-forward_histogram(Block* b, const diy::ReduceProxy& srp, const Histogram& histogram) const
+forward_histogram(Block*, const diy::ReduceProxy& srp, const Histogram& histogram) const
 {
     for (int i = 0; i < srp.out_link().size(); ++i)
         srp.enqueue(srp.out_link().target(i), histogram);
@@ -445,22 +448,26 @@ enqueue_exchange(Block* b, const diy::ReduceProxy& srp, int dim, const Histogram
     size_t cur   = 0;
     float  width = (link->core().max[dim] - link->core().min[dim])/bins_;
     float  split = 0;
-    size_t i = 0;
-    for (; i < histogram.size(); ++i)
+
+    // scope-block for variable `i`
     {
-        if (cur + histogram[i] > total/2)
-            break;
-        cur += histogram[i];
+        size_t i = 0;
+        for (; i < histogram.size(); ++i)
+        {
+            if (cur + histogram[i] > total/2)
+                break;
+            cur += histogram[i];
+        }
+        if (i == 0)
+            ++i;
+        else if (i >= histogram.size() - 1)
+            i = histogram.size() - 2;
+        split = link->core().min[dim] + width*i;
+        log->trace("Found split: {} (dim={}) in {} - {}", split, dim, link->core().min[dim], link->core().max[dim]);
     }
-    if (i == 0)
-        ++i;
-    else if (i >= histogram.size() - 1)
-        i = histogram.size() - 2;
-    split = link->core().min[dim] + width*i;
-    log->trace("Found split: {} (dim={}) in {} - {}", split, dim, link->core().min[dim], link->core().max[dim]);
 
     // subset and enqueue
-    std::vector< std::vector<Point> > out_points(srp.out_link().size());
+    std::vector< std::vector<Point> > out_points(static_cast<size_t>(srp.out_link().size()));
     for (size_t i = 0; i < (b->*points_).size(); ++i)
     {
       float x = (b->*points_)[i][dim];
diff --git a/include/vtkdiy2/detail/algorithms/load-balance-collective.hpp b/include/vtkdiy2/detail/algorithms/load-balance-collective.hpp
new file mode 100644
index 00000000000..a22ee3d54bd
--- /dev/null
+++ b/include/vtkdiy2/detail/algorithms/load-balance-collective.hpp
@@ -0,0 +1,125 @@
+#pragma once
+
+#include <queue>
+#include "load-balance.hpp"
+
+namespace diy
+{
+
+namespace detail
+{
+
+// exchange work information among all processes using synchronous collective method
+inline void exchange_work_info(diy::Master&            master,
+                        const WorkInfo&         my_work_info,           // my process' work info
+                        std::vector<WorkInfo>&  all_work_info)          // (output) global work info
+{
+    auto nprocs = master.communicator().size();     // global number of procs
+    all_work_info.resize(nprocs);
+    diy::mpi::detail::all_gather(master.communicator(), &my_work_info, sizeof(WorkInfo), MPI_BYTE, &all_work_info[0]);
+}
+
+// determine move info from work info
+inline void decide_move_info(std::vector<WorkInfo>&        all_work_info,          // global work info
+                      std::vector<MoveInfo>&        all_move_info)          // (output) move info for all moves
+{
+    all_move_info.clear();
+
+    // move all blocks with an approximation to the longest processing time first (LPTF) scheduling algorithm
+    // https://en.wikipedia.org/wiki/Longest-processing-time-first_scheduling
+    // we iteratively move the heaviest block to lightest proc
+    // constrained by only recording the heaviest block for each proc, not all blocks
+
+    // minimum proc_work priority queue, top is min proc_work
+    auto cmp = [&](WorkInfo& a, WorkInfo& b) { return a.proc_work > b.proc_work; };
+    std::priority_queue<WorkInfo, std::vector<WorkInfo>, decltype(cmp)> min_proc_work_q(cmp);
+    for (auto i = 0; i < all_work_info.size(); i++)
+        min_proc_work_q.push(all_work_info[i]);
+
+    // sort all_work_info by descending top_work
+    std::sort(all_work_info.begin(), all_work_info.end(),
+            [&](WorkInfo& a, WorkInfo& b) { return a.top_work > b.top_work; });
+
+    // walk the all_work_info vector in descending top_work order
+    // move the next heaviest block to the lightest proc
+    for (auto i = 0; i < all_work_info.size(); i++)
+    {
+        auto src_work_info = all_work_info[i];                      // heaviest block
+        auto dst_work_info = min_proc_work_q.top();                 // lightest proc
+
+        // sanity check that the move makes sense
+        if (src_work_info.proc_work - dst_work_info.proc_work > src_work_info.top_work &&   // improve load balance
+                src_work_info.proc_rank != dst_work_info.proc_rank &&                       // not self
+                src_work_info.nlids > 1)                                                    // don't leave a proc with no blocks
+        {
+            MoveInfo move_info;
+            move_info.src_proc  = src_work_info.proc_rank;
+            move_info.dst_proc  = dst_work_info.proc_rank;
+            move_info.move_gid  = src_work_info.top_gid;
+            all_move_info.push_back(move_info);
+
+            // update the min_proc_work priority queue
+            auto work_info = min_proc_work_q.top();                 // lightest proc
+            work_info.proc_work += src_work_info.top_work;
+            if (work_info.top_work < src_work_info.top_work)
+            {
+                work_info.top_work = src_work_info.top_work;
+                work_info.top_gid  = src_work_info.top_gid;
+            }
+            min_proc_work_q.pop();
+            min_proc_work_q.push(work_info);
+        }
+    }
+}
+
+// move one block from src to dst proc
+inline void move_block(diy::Master&            master,
+                const MoveInfo&         move_info)
+{
+    // sanity check that source and destination are different
+    if (move_info.src_proc == move_info.dst_proc)
+    {
+        fmt::print(stderr, "Error: move_block(): source and destination are same. This should not happen.\n");
+        abort();
+    }
+
+    if (master.communicator().rank() == move_info.src_proc)
+    {
+        diy::MemoryBuffer bb;
+
+        // move the block from src to dst proc
+        void* send_b = master.block(master.lid(move_info.move_gid));
+        master.saver()(send_b, bb);
+        master.communicator().send(move_info.dst_proc, 0, bb.buffer);
+
+        // move the link for the moving block
+        diy::Link* send_link = master.link(master.lid(move_info.move_gid));
+        diy::LinkFactory::save(bb, send_link);
+        master.communicator().send(move_info.dst_proc, 0, bb.buffer);
+
+        // remove the block from the master
+        int move_lid = master.lid(move_info.move_gid);
+        master.destroyer()(master.release(move_lid));
+    }
+    else if (master.communicator().rank() == move_info.dst_proc)
+    {
+        diy::MemoryBuffer bb;
+
+        // move the block from src to dst proc
+        void* recv_b = master.creator()();
+        master.communicator().recv(move_info.src_proc, 0, bb.buffer);
+        master.loader()(recv_b, bb);
+
+        // move the link for the moving block
+        diy::Link* recv_link;
+        master.communicator().recv(move_info.src_proc, 0, bb.buffer);
+        recv_link = diy::LinkFactory::load(bb);
+
+        // add the block to the master
+        master.add(move_info.move_gid, recv_b, recv_link);
+    }
+}
+
+}   // namespace detail
+
+}   // namespace diy
diff --git a/include/vtkdiy2/detail/algorithms/load-balance-sampling.hpp b/include/vtkdiy2/detail/algorithms/load-balance-sampling.hpp
new file mode 100644
index 00000000000..e4c6f254a6c
--- /dev/null
+++ b/include/vtkdiy2/detail/algorithms/load-balance-sampling.hpp
@@ -0,0 +1,217 @@
+#pragma once
+
+#include "load-balance.hpp"
+
+namespace diy
+{
+
+namespace detail
+{
+
+// send requests for work info
+inline void send_req(AuxBlock*,                                // local block (unused)
+              const diy::Master::ProxyWithLink& cp,     // communication proxy for neighbor blocks
+              std::set<int>& procs)                     // processes to query
+{
+    // send requests for work info to sample_procs
+    int v = 1;                                          // any message will do
+    for (auto proc_iter = procs.begin(); proc_iter != procs.end(); proc_iter++)
+    {
+        int gid    = *proc_iter;
+        int proc   = *proc_iter;
+        diy::BlockID dest_block = {gid, proc};
+        cp.enqueue(dest_block, v);
+    }
+}
+
+// receive requests for work info
+inline void recv_req(AuxBlock*,                                // local block (unused)
+              const diy::Master::ProxyWithLink& cp,     // communication proxy for neighbor blocks
+              std::vector<int>& req_procs)              // processes requesting work info
+{
+    std::vector<int> incoming_gids;
+    cp.incoming(incoming_gids);
+
+    // for anything incoming, dequeue data received in the last exchange
+    for (int i = 0; i < incoming_gids.size(); i++)
+    {
+        int gid = incoming_gids[i];
+        if (cp.incoming(gid).size())
+        {
+            int v;
+            cp.dequeue(gid, v);
+            req_procs.push_back(gid);                   // aux_master has 1 gid per proc, so gid = proc
+        }
+    }
+}
+
+// get work information from a random sample of processes
+inline void exchange_sample_work_info(diy::Master&             master,                 // the real master with multiple blocks per process
+                               diy::Master&             aux_master,             // auxiliary master with 1 block per process for communicating between procs
+                               float                    sample_frac,            // fraction of procs to sample 0.0 < sample_size <= 1.0
+                               const WorkInfo&          my_work_info,           // my process' work info
+                               std::vector<WorkInfo>&   sample_work_info)       // (output) vector of sorted sample work info, sorted by increasing total work per process
+{
+    auto nprocs = master.communicator().size();     // global number of procs
+    auto my_proc = master.communicator().rank();    // rank of my proc
+
+    // pick a random sample of processes, w/o duplicates, and excluding myself
+    int nsamples = static_cast<int>(sample_frac * (nprocs - 1));
+    std::set<int> sample_procs;
+    for (auto i = 0; i < nsamples; i++)
+    {
+        int rand_proc;
+        do
+        {
+            std::uniform_int_distribution<> distrib(0, nprocs - 1);     // inclusive
+            rand_proc = distrib(master.mt_gen);
+        } while (sample_procs.find(rand_proc) != sample_procs.end() || rand_proc == my_proc);
+        sample_procs.insert(rand_proc);
+    }
+
+    // rexchange requests for work info
+    std::vector<int> req_procs;     // requests for work info received from these processes
+    aux_master.foreach([&](AuxBlock* b, const diy::Master::ProxyWithLink& cp)
+            { send_req(b, cp, sample_procs); });
+    aux_master.exchange(true);      // true = remote
+    aux_master.foreach([&](AuxBlock* b, const diy::Master::ProxyWithLink& cp)
+            { recv_req(b, cp, req_procs); });
+
+    // send work info
+    int work_info_tag = 0;
+    std::vector<diy::mpi::request> reqs(req_procs.size());
+    for (auto i = 0; i < req_procs.size(); i++)
+        reqs[i] = mpi::detail::isend(MPI_Comm(master.communicator()), req_procs[i], work_info_tag, &my_work_info, sizeof(WorkInfo), MPI_BYTE);
+
+    // receive work info
+    sample_work_info.resize(nsamples);
+    for (auto i = 0; i < nsamples; i++)
+        mpi::detail::recv(MPI_Comm(master.communicator()), diy::mpi::any_source, work_info_tag, &sample_work_info[i], sizeof(WorkInfo), MPI_BYTE);
+
+    // ensure all the send requests cleared
+    for (auto i = 0; i < req_procs.size(); i++)
+        reqs[i].wait();
+
+    // sort sample_work_info by proc_work
+    std::sort(sample_work_info.begin(), sample_work_info.end(),
+            [&](WorkInfo& a, WorkInfo& b) { return a.proc_work < b.proc_work; });
+}
+
+// send block
+inline void send_block(AuxBlock*,                                              // local block (unused)
+                const diy::Master::ProxyWithLink&   cp,                 // communication proxy for neighbor blocks
+                diy::Master&                        master,             // real master with multiple blocks per process
+                const std::vector<WorkInfo>&        sample_work_info,   // sampled work info
+                const WorkInfo&                     my_work_info,       // my work info
+                float                               quantile)           // quantile cutoff above which to move blocks (0.0 - 1.0)
+{
+    MoveInfo move_info = {-1, -1, -1};
+
+    // my rank's position in the sampled work info, sorted by proc_work
+    int my_work_idx = sample_work_info.size();                                          // index where my work would be in the sample_work
+    for (auto i = 0; i < sample_work_info.size(); i++)
+    {
+        if (my_work_info.proc_work < sample_work_info[i].proc_work)
+        {
+            my_work_idx = i;
+            break;
+        }
+    }
+
+    // send my heaviest block if it passes the quantile cutoff
+    if (my_work_idx >= quantile * sample_work_info.size())
+    {
+        // pick the destination process to be the mirror image of my work location in the samples
+        // ie, the heavier my process, the lighter the destination process
+        int target = sample_work_info.size() - my_work_idx;
+
+        auto src_work_info = my_work_info;
+        auto dst_work_info = sample_work_info[target];
+
+        // sanity check that the move makes sense
+        if (src_work_info.proc_work - dst_work_info.proc_work > src_work_info.top_work &&   // improve load balance
+                src_work_info.proc_rank != dst_work_info.proc_rank &&                       // not self
+                src_work_info.nlids > 1)                                                    // don't leave a proc with no blocks
+        {
+
+            move_info.move_gid = my_work_info.top_gid;
+            move_info.src_proc = my_work_info.proc_rank;
+            move_info.dst_proc = sample_work_info[target].proc_rank;
+
+            // destination in aux_master, where gid = proc
+            diy::BlockID dest_block = {move_info.dst_proc, move_info.dst_proc};
+
+            // enqueue the gid of the moving block
+            cp.enqueue(dest_block, move_info.move_gid);
+
+            // enqueue the block
+            void* send_b = master.block(master.lid(move_info.move_gid));
+            diy::MemoryBuffer bb;
+            master.saver()(send_b, bb);
+            cp.enqueue(dest_block, bb.buffer);
+
+            // enqueue the link for the block
+            diy::Link* send_link = master.link(master.lid(move_info.move_gid));
+            diy::LinkFactory::save(bb, send_link);
+            cp.enqueue(dest_block, bb.buffer);
+
+            // remove the block from the master
+            int move_lid = master.lid(move_info.move_gid);
+            master.destroyer()(master.release(move_lid));
+        }
+    }
+}
+
+// receive block
+inline void recv_block(AuxBlock*,                                      // local block (unused)
+                const diy::Master::ProxyWithLink&   cp,         // communication proxy for neighbor blocks
+                diy::Master&                        master)     // real master with multiple blocks per process
+{
+    std::vector<int> incoming_gids;
+    cp.incoming(incoming_gids);
+
+    // for anything incoming, dequeue data received in the last exchange
+    for (int i = 0; i < incoming_gids.size(); i++)
+    {
+        int gid = incoming_gids[i];
+        if (cp.incoming(gid).size())
+        {
+            // dequeue the gid of the moving block
+            int move_gid;
+            cp.dequeue(gid, move_gid);
+
+            // dequeue the block
+            void* recv_b = master.creator()();
+            diy::MemoryBuffer bb;
+            cp.dequeue(gid, bb.buffer);
+            master.loader()(recv_b, bb);
+
+            // dequeue the link
+            diy::Link* recv_link;
+            cp.dequeue(gid, bb.buffer);
+            recv_link = diy::LinkFactory::load(bb);
+
+            // add block to the master
+            master.add(move_gid, recv_b, recv_link);
+        }
+    }
+}
+
+// move blocks based on sampled work info
+inline void move_sample_blocks(diy::Master&                    master,                 // real master with multiple blocks per process
+                        diy::Master&                    aux_master,             // auxiliary master with 1 block per process for communcating between procs
+                        const std::vector<WorkInfo>&    sample_work_info,       // sampled work info
+                        const WorkInfo&                 my_work_info,           // my work info
+                        float                           quantile)               // quantile cutoff above which to move blocks (0.0 - 1.0)
+{
+    // rexchange moving blocks
+    aux_master.foreach([&](AuxBlock* b, const diy::Master::ProxyWithLink& cp)
+            { send_block(b, cp, master, sample_work_info, my_work_info, quantile); });
+    aux_master.exchange(true);      // true = remote
+    aux_master.foreach([&](AuxBlock* b, const diy::Master::ProxyWithLink& cp)
+            { recv_block(b, cp, master); });
+}
+
+}   // namespace detail
+
+}   // namespace diy
diff --git a/include/vtkdiy2/detail/algorithms/load-balance.hpp b/include/vtkdiy2/detail/algorithms/load-balance.hpp
new file mode 100644
index 00000000000..f394c1c4c9f
--- /dev/null
+++ b/include/vtkdiy2/detail/algorithms/load-balance.hpp
@@ -0,0 +1,38 @@
+#pragma once
+
+namespace diy
+{
+
+namespace detail
+{
+
+// information about work for one process
+struct WorkInfo
+{
+    int     proc_rank;          // mpi rank of this process
+    int     top_gid;            // gid of most expensive block in this process TODO: can be top-k-gids, as long as k is fixed and known by all
+    Work    top_work;           // work of top_gid TODO: can be vector of top-k work, as long as k is fixed and known by all
+    Work    proc_work;          // total work of this process
+    int     nlids;              // local number of blocks in this process
+};
+
+// information about a block that is moving
+struct MoveInfo
+{
+    MoveInfo(): move_gid(-1), src_proc(-1), dst_proc(-1)   {}
+    MoveInfo(int move_gid_, int src_proc_, int dst_proc_) : move_gid(move_gid_), src_proc(src_proc_), dst_proc(dst_proc_) {}
+    int move_gid;
+    int src_proc;
+    int dst_proc;
+};
+
+// auxiliary empty block structure
+struct AuxBlock
+{
+    static void*    create()            { return new AuxBlock; }
+    static void     destroy(void* b)    { delete static_cast<AuxBlock*>(b); }
+};
+
+}   // namespace detail
+
+} // namespace diy
diff --git a/include/vtkdiy2/detail/algorithms/sort.hpp b/include/vtkdiy2/detail/algorithms/sort.hpp
index 3e9ffd82f36..e1c83c76401 100644
--- a/include/vtkdiy2/detail/algorithms/sort.hpp
+++ b/include/vtkdiy2/detail/algorithms/sort.hpp
@@ -85,29 +85,29 @@ struct SampleSort<Block,T,Cmp>::Sampler
                     Sampler(ValuesVector values_, ValuesVector dividers_, const Cmp& cmp_, size_t num_samples_):
                         values(values_), dividers(dividers_), cmp(cmp_), num_samples(num_samples_)    {}
 
-    void            operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners& partners) const
+    void            operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners&) const
     {
         int k_in  = srp.in_link().size();
         int k_out = srp.out_link().size();
 
-        std::vector<T> samples;
+        std::vector<T> samps;
 
         if (k_in == 0)
         {
             // draw random samples
             for (size_t i = 0; i < num_samples; ++i)
-                samples.push_back((b->*values)[std::rand() % (b->*values).size()]);
+                samps.push_back((b->*values)[std::rand() % (b->*values).size()]);
         } else
-            dequeue_values(samples, srp, false);
+            dequeue_values(samps, srp, false);
 
         if (k_out == 0)
         {
             // pick subsamples that separate quantiles
-            std::sort(samples.begin(), samples.end(), cmp);
+            std::sort(samps.begin(), samps.end(), cmp);
             std::vector<T>  subsamples(srp.nblocks() - 1);
-            int step = samples.size() / srp.nblocks();       // NB: subsamples.size() + 1
+            size_t step = samps.size() / srp.nblocks();       // NB: subsamples.size() + 1
             for (size_t i = 0; i < subsamples.size(); ++i)
-                subsamples[i] = samples[(i+1)*step];
+                subsamples[i] = samps[(i+1)*step];
             (b->*dividers).swap(subsamples);
         }
         else
@@ -115,7 +115,7 @@ struct SampleSort<Block,T,Cmp>::Sampler
             for (int i = 0; i < k_out; ++i)
             {
                 MemoryBuffer& out = srp.outgoing(srp.out_link().target(i));
-                save(out, &samples[0], samples.size());
+                save(out, &samps[0], samps.size());
             }
         }
     }
@@ -139,7 +139,7 @@ struct SampleSort<Block,T,Cmp>::Exchanger
             // enqueue values to the correct locations
             for (size_t i = 0; i < (b->*values).size(); ++i)
             {
-                int to = std::lower_bound((b->*samples).begin(), (b->*samples).end(), (b->*values)[i], cmp) - (b->*samples).begin();
+                int to = static_cast<int>(std::lower_bound((b->*samples).begin(), (b->*samples).end(), (b->*values)[i], cmp) - (b->*samples).begin());
                 rp.enqueue(rp.out_link().target(to), (b->*values)[i]);
             }
             (b->*values).clear();
diff --git a/include/vtkdiy2/detail/block_traits.hpp b/include/vtkdiy2/detail/block_traits.hpp
index eb4b7c547d3..984613d94bb 100644
--- a/include/vtkdiy2/detail/block_traits.hpp
+++ b/include/vtkdiy2/detail/block_traits.hpp
@@ -10,7 +10,7 @@ namespace detail
     template<class F>
     struct block_traits
     {
-        typedef typename std::remove_pointer<typename function_traits<F>::template arg<0>::type>::type type;
+        typedef typename std::remove_pointer<typename std::remove_reference<typename function_traits<F>::template arg<0>::type>::type>::type type;
     };
 
     // matches block member functions
diff --git a/include/vtkdiy2/detail/master/collectives.hpp b/include/vtkdiy2/detail/master/collectives.hpp
index 303ba74a6ea..c98aef2a113 100644
--- a/include/vtkdiy2/detail/master/collectives.hpp
+++ b/include/vtkdiy2/detail/master/collectives.hpp
@@ -20,7 +20,7 @@ namespace diy
 
           void  init()                                    { out_ = in_; }
           void  update(const CollectiveOp& other)         { out_ = op_(out_, static_cast<const AllReduceOp&>(other).in_); }
-          void  global(const mpi::communicator& comm)     { T res; mpi::all_reduce(comm, out_, res, op_); out_ = res; }
+          void  global(const mpi::communicator& comm)     { T res{}; mpi::all_reduce(comm, out_, res, op_); out_ = res; }
           void  copy_from(const CollectiveOp& other)      { out_ = static_cast<const AllReduceOp&>(other).out_; }
           void  result_out(void* dest) const              { *reinterpret_cast<T*>(dest) = out_; }
 
diff --git a/include/vtkdiy2/detail/master/communication.hpp b/include/vtkdiy2/detail/master/communication.hpp
index 51cc435b280..1f6b0800fa9 100644
--- a/include/vtkdiy2/detail/master/communication.hpp
+++ b/include/vtkdiy2/detail/master/communication.hpp
@@ -5,11 +5,13 @@ namespace diy
         int from, to;
         int nparts;
         int round;
+        int nblobs;
     };
 
     struct Master::InFlightSend
     {
         std::shared_ptr<MemoryBuffer> message;
+        BinaryBlob                    blob;
         mpi::request                  request;
 
         MessageInfo info;           // for debug purposes
@@ -18,12 +20,18 @@ namespace diy
     struct Master::InFlightRecv
     {
         MemoryBuffer    message;
-        MessageInfo     info { -1, -1, -1, -1 };
+        MessageInfo     info { -1, -1, -1, -1, -1 };
         bool            done = false;
+        MemoryManagement mem;
 
         inline bool     recv(mpi::communicator& comm, const mpi::status& status);
         inline void     place(IncomingRound* in, bool unload, ExternalStorage* storage, IExchangeInfo* iexchange);
-        void            reset()     { *this = InFlightRecv(); }
+        void            reset()
+        {
+            MemoryManagement mem_ = mem;
+            *this = InFlightRecv();
+            mem = mem_;
+        }
     };
 
     struct Master::InFlightRecvsMap: public std::map<int, InFlightRecv>
@@ -63,7 +71,7 @@ namespace diy
         struct mpi_datatype< diy::detail::VectorWindow<T> >
         {
             using VecWin = diy::detail::VectorWindow<T>;
-            static MPI_Datatype         datatype()                { return get_mpi_datatype<T>(); }
+            static diy::mpi::datatype   datatype()                { return get_mpi_datatype<T>(); }
             static const void*          address(const VecWin& x)  { return x.begin; }
             static void*                address(VecWin& x)        { return x.begin; }
             static int                  count(const VecWin& x)    { return static_cast<int>(x.count); }
@@ -111,7 +119,7 @@ recv(mpi::communicator& comm, const mpi::status& status)
 
         result = true;
     }
-    else
+    else if (info.nparts > 0)
     {
         size_t start_idx = message.buffer.size();
         size_t count = status.count<char>();
@@ -124,9 +132,24 @@ recv(mpi::communicator& comm, const mpi::status& status)
         comm.recv(status.source(), status.tag(), window);
 
         info.nparts--;
+    } else if (info.nblobs > 0)
+    {
+        size_t count = status.count<char>();
+        detail::VectorWindow<char> window;
+
+        char* buffer = mem.allocate(info.to, count);
+
+        window.begin = buffer;
+        window.count = count;
+
+        comm.recv(status.source(), status.tag(), window);
+
+        message.save_binary_blob(buffer, count, mem.deallocate);
+
+        info.nblobs--;
     }
 
-    if (info.nparts == 0)
+    if (info.nparts == 0 && info.nblobs == 0)
         done = true;
 
     return result;
@@ -135,35 +158,21 @@ recv(mpi::communicator& comm, const mpi::status& status)
 // once the InFlightRecv is done, place it either out of core or in the appropriate incoming queue
 void
 diy::Master::InFlightRecv::
-place(IncomingRound* in, bool unload, ExternalStorage* storage, IExchangeInfo* iexchange)
+place(IncomingRound* in, bool unload, ExternalStorage* storage, IExchangeInfo*)
 {
-    size_t size     = message.size();
     int from        = info.from;
     int to          = info.to;
-    int external    = -1;
+
+    message.reset();
+
+    auto access = in->map[to][from].access();
+    access->emplace_back(std::move(message));
 
     if (unload)
     {
         get_logger()->debug("Directly unloading queue {} <- {}", to, from);
-        external = storage->put(message);       // unload directly
-    }
-    else if (!iexchange)
-    {
-        in->map[to].queues[from].swap(message);
-        in->map[to].queues[from].reset();       // buffer position = 0
-    }
-    else    // iexchange
-    {
-        auto log = get_logger();
-        log->debug("[{}] Received queue {} <- {}", iexchange->comm.rank(), to, from);
-
-        iexchange->not_done(to);
-        in->map[to].queues[from].append_binary(&message.buffer[0], message.size());        // append instead of overwrite
-
-        iexchange->dec_work();
-        log->debug("[{}] Decrementing work after receiving\n", to);
+        access->back().unload(storage);
     }
-    in->map[to].records[from] = QueueRecord(size, external);
 
     ++(in->received);
 }
diff --git a/include/vtkdiy2/detail/master/execution.hpp b/include/vtkdiy2/detail/master/execution.hpp
index 8f9c0dbb8b0..d74d25cc0f5 100644
--- a/include/vtkdiy2/detail/master/execution.hpp
+++ b/include/vtkdiy2/detail/master/execution.hpp
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 struct diy::Master::ProcessBlock
 {
           ProcessBlock(Master&                    master_,
@@ -61,8 +63,7 @@ struct diy::Master::ProcessBlock
           cmd->execute(skip ? 0 : master.block(i), master.proxy(i));
 
           // no longer need them, so get rid of them
-          current_incoming[gid].queues.clear();
-          current_incoming[gid].records.clear();
+          current_incoming[gid].clear();
       }
 
       if (skip && master.block(i) == 0)
diff --git a/include/vtkdiy2/detail/master/foreach_exchange.hpp b/include/vtkdiy2/detail/master/foreach_exchange.hpp
new file mode 100644
index 00000000000..a4d89a8bec8
--- /dev/null
+++ b/include/vtkdiy2/detail/master/foreach_exchange.hpp
@@ -0,0 +1,105 @@
+struct diy::Master::CoroutineArg
+{
+    int                                                 lid;
+    ProxyWithLink&                                      proxy;
+    coroutine::cothread_t                               main;
+    Master*                                             master;
+    std::function<void(int, const ProxyWithLink&)>      f;
+};
+
+void
+diy::Master::
+launch_process_block_coroutine()
+{
+    using namespace coroutine;
+
+    CoroutineArg*           arg     = static_cast<CoroutineArg*>(argument());
+    int                     lid     = arg->lid;
+    ProxyWithLink&          cp      = arg->proxy;
+    cothread_t              main    = arg->main;
+    // unused
+    //Master*                 master  = arg->master;
+    auto                    f       = arg->f;
+
+    co_switch(main);        // give the argument back
+
+    f(lid, cp);
+    cp.set_done(true);
+    co_switch(main);
+}
+
+template<class F>
+void
+diy::Master::
+foreach_exchange(const F& f, bool remote, unsigned int stack_size)
+{
+    using Block = typename detail::block_traits<F>::type;
+    foreach_exchange_<Block>(f, remote, stack_size);
+}
+
+template<class Block>
+void
+diy::Master::
+foreach_exchange_(const CoroutineCallback<Block>& f, bool remote, unsigned int stack_size)
+{
+    auto scoped = prof.scoped("foreach_exchange");
+    DIY_UNUSED(scoped);
+
+    assert(commands_.empty());  // can't have queued, unexecuted commands left over from ordinary foreach()
+
+    using namespace coroutine;
+
+    // setup coroutines and proxies
+    std::vector<cothread_t>                     coroutines;
+    std::vector<std::unique_ptr<ProxyWithLink>> proxies;
+    std::vector<Block*>                         blocks(size(), nullptr);
+    for (int i = 0; i < size(); ++i)
+    {
+        cothread_t c = co_create(stack_size, &launch_process_block_coroutine);
+        coroutines.push_back(c);
+        proxies.emplace_back(make_unique<ProxyWithLink>(proxy(i)));
+
+        auto trampoline = [&f,&blocks,this](int lid, const ProxyWithLink& cp)
+        {
+            Block* const& b = blocks[lid];
+            f(b, cp);
+        };
+        CoroutineArg arg { i, *proxies.back(), co_active(), this, trampoline };
+        argument() = &arg;
+
+        co_switch(c);
+    }
+
+    int ndone = 0;
+    std::vector<bool>   done(size(), false);
+    while(ndone < size())
+    {
+        // TODO: parallelize the loop using multiple threads
+        for(int i = 0; i < size(); ++i)
+        {
+            if (done[i] == true)
+                continue;
+
+            blocks[i] = get<Block>(i); // load block in case it's out of core
+
+            cothread_t  c  = coroutines[i];
+            auto&       cp = *proxies[i];
+            cp.init();      // reset incoming/outgoing/collectives
+            cp.set_main(co_active());
+            co_switch(c);
+
+            if (cp.done())
+            {
+                done[i] = true;
+                ndone++;
+            }
+        }
+
+        // TODO: should we disable the last exchange; the user could call that one manually
+        // exchange calls execute(), but commands_ are empty, so it should be Ok
+        exchange(remote);
+    }
+
+    for (cothread_t cor : coroutines)
+        co_delete(cor);
+}
diff --git a/include/vtkdiy2/detail/master/iexchange-collective.hpp b/include/vtkdiy2/detail/master/iexchange-collective.hpp
index 575519af724..ad7a9fb2619 100644
--- a/include/vtkdiy2/detail/master/iexchange-collective.hpp
+++ b/include/vtkdiy2/detail/master/iexchange-collective.hpp
@@ -1,18 +1,26 @@
+#include <atomic>
+
 namespace diy
 {
     struct Master::IExchangeInfoCollective: public IExchangeInfo
     {
-      using IExchangeInfo::IExchangeInfo;
+                        IExchangeInfoCollective(mpi::communicator c, stats::Profiler& p):
+                            IExchangeInfo(c, p)
+      {
+          local_work_ = 0;
+          dirty = 0;
+          state = 0;
+      }
 
       inline bool       all_done() override;                    // get global all done status
       inline void       add_work(int work) override;            // add work to global work counter
       inline void       control() override;
 
-      int               local_work_ = 0;
-      int               dirty = 0;
+      std::atomic<int>  local_work_;
+      std::atomic<int>  dirty;
       int               local_dirty, all_dirty;
 
-      int               state = 0;
+      std::atomic<int>  state;
       mpi::request      r;
 
       // debug
diff --git a/include/vtkdiy2/detail/master/iexchange-dud.hpp b/include/vtkdiy2/detail/master/iexchange-dud.hpp
deleted file mode 100644
index 050c2e3a18a..00000000000
--- a/include/vtkdiy2/detail/master/iexchange-dud.hpp
+++ /dev/null
@@ -1,338 +0,0 @@
-namespace diy
-{
-    struct Master::IExchangeInfoDUD: public IExchangeInfo
-    {
-        struct Message
-        {
-            std::array<int,2>   msg;
-            mpi::request        request;
-        };
-
-      using IExchangeInfo::IExchangeInfo;
-
-      inline bool       all_done() override;                             // get global all done status
-      inline void       add_work(int work) override;                     // add work to global work counter
-      inline void       control() override;
-      inline void       update_subtree(int diff);
-      inline bool       process_work_update();
-      inline void       check_for_abort();
-      inline void       abort(int trial);
-      inline void       process_done(int source, int trial);
-      inline void       reset_child_confirmations();
-      int               right_bit(int x) const                  { return ((x ^ (x-1)) + 1) >> 1; }
-
-      void              send(int rk, int  type, int  x)
-      {
-          inflight_.emplace_back();
-          Message& m = inflight_.back();
-          m.msg[0] = type;
-          m.msg[1] = x;
-          m.request = comm.issend(rk, tags::iexchange, m.msg);
-          log->trace("[{}] Sending to {}, type = {}, x = {}", comm.rank(), rk, type, x);
-      }
-      void              recv(int rk, int& type, int& x)
-      {
-          std::array<int,2> msg;
-          comm.recv(rk, tags::iexchange, msg);
-          type = msg[0];
-          x = msg[1];
-          log->trace("[{}] Received from {}, type = {}, x = {}, msg = {}", comm.rank(), rk, type, x);
-      }
-
-      inline bool       nudge();
-      int               parent() const                          { return comm.rank() & (comm.rank() - 1); }     // flip the last 1 to 0
-      inline void       signal_children(int tag, int x);
-      bool              incomplete() const                      { return subtree_work_ > 0 || !inflight_.empty(); }
-      bool              stale() const                           { return subtree_work_ != last_subtree_work_message_ || local_work_ != last_local_work_message_; }
-
-      struct type       { enum {
-                                    work_update = 0,
-                                    done,
-                                    abort
-                                }; };
-      std::unordered_map<int, bool>       done;                 // gid -> done
-
-      int                                 local_work_   = 0, last_local_work_message_   = 0;
-      int                                 subtree_work_ = 0, last_subtree_work_message_ = 0;
-      int                                 down_up_down_ = 0;
-
-      std::list<Message>                  inflight_;
-      int                                 last_trial_ = -1;
-      int                                 child_confirmations = -1;
-
-      // debug
-      bool                                first_dud = true;
-      using IExchangeInfo::prof;
-    };
-}
-
-
-// get global all done status
-bool
-diy::Master::IExchangeInfoDUD::
-all_done()
-{
-    if (down_up_down_ == 3)
-        while (!inflight_.empty()) nudge();     // make sure that all the messages are propagated before we finish
-                                                // if we've decided that we are done, the only outstanding messages
-                                                // can be the done signals to children; nothing else should be in-flight
-
-    return down_up_down_ == 3;
-}
-
-// add arbitrary units of work to global work counter
-void
-diy::Master::IExchangeInfoDUD::
-add_work(int work)
-{
-    int cur_local_work = local_work_;
-    local_work_ += work;
-    assert(local_work_ >= 0);
-
-    log->trace("[{}] Adding work: work = {}, local_work = {}, cur_local_work = {}", comm.rank(), work, local_work_, cur_local_work);
-
-    if ((cur_local_work == 0) ^ (local_work_ == 0))     // changed from or to zero
-    {
-        int diff    = (local_work_ - last_local_work_message_);
-        update_subtree(diff);
-        last_local_work_message_ = local_work_;
-    }
-}
-
-void
-diy::Master::IExchangeInfoDUD::
-update_subtree(int diff)
-{
-    int cur_subtree_work = subtree_work_;
-    subtree_work_ += diff;
-    log->debug("[{}] Updating subtree: diff = {}, subtree_work_ = {}", comm.rank(), diff, subtree_work_);
-    assert(subtree_work_ >= 0);
-
-    if ((cur_subtree_work == 0) ^ (subtree_work_ == 0))     // changed from or to zero
-    {
-        if (comm.rank() != 0)
-        {
-            int subtree_diff = (subtree_work_ - last_subtree_work_message_);
-            log->debug("[{}] Sending subtree update: diff = {}, subtree_diff = {}", comm.rank(), diff, subtree_diff);
-            send(parent(), type::work_update, subtree_diff);
-            last_subtree_work_message_ = subtree_work_;
-            if (down_up_down_ == 1)
-                abort(last_trial_);
-            else if (down_up_down_ == 2)
-                log->warn("[{}] Enqueueing work update after finishing, diff = {}", comm.rank(), subtree_diff);
-                // This is Ok in general: if this happens, somebody else must abort this trial.
-            else if (down_up_down_ == 3)
-                log->critical("[{}] Enqueueing work update after all done, diff = {}", comm.rank(), subtree_diff);
-        } else
-        {
-            assert(down_up_down_ < 2);
-            down_up_down_ = 0;      // if we are updating work on the root, definitely abort the down-up-down protocol
-        }
-    }
-}
-
-void
-diy::Master::IExchangeInfoDUD::
-control()
-{
-    mpi::optional<mpi::status> ostatus = comm.iprobe(mpi::any_source, tags::iexchange);
-    while(ostatus)
-    {
-        int t, x;
-        recv(ostatus->source(), t, x);
-        if (t == type::work_update)
-        {
-            // x = diff
-            log->debug("[{}] subtree update request from {}, diff = {}", comm.rank(), ostatus->source(), x);
-            update_subtree(x);      // for now propagates up the tree verbatim
-        } else if (t == type::abort)
-        {
-            // x = trial
-            assert(x >= -1);
-            abort(x);
-        } else if (t == type::done)
-        {
-            process_done(ostatus->source(), x);
-        }
-        ostatus = comm.iprobe(mpi::any_source, tags::iexchange);
-    }
-
-    // initiate down-up-down protocol
-    if (subtree_work_ == 0 && comm.rank() == 0 && down_up_down_ == 0)
-    {
-        // debug
-        if (first_dud)
-        {
-            prof >> "iexchange-control";        // consensus-time cannot nest in iexchange-control
-            prof << "consensus-time";
-            prof << "iexchange-control";
-            first_dud = false;
-        }
-
-        down_up_down_ = 1;
-        reset_child_confirmations();
-        if (child_confirmations)
-        {
-            signal_children(type::done, ++last_trial_);
-            log->info("Initiated down-up-down, trial = {}", last_trial_);
-        } else // no children
-            down_up_down_ = 3;
-    }
-
-    while(nudge());
-}
-
-void
-diy::Master::IExchangeInfoDUD::
-abort(int trial)
-{
-    if (down_up_down_ == 0) // already aborted
-        return;
-
-    if (trial != last_trial_)
-        return;
-
-    log->warn("[{}] aborting trial {}", comm.rank(), trial);
-    assert(trial >= 0);
-
-    down_up_down_ = 0;
-
-    if (comm.rank() != 0)
-    {
-        send(parent(), type::abort, trial);    // propagate abort
-        if (down_up_down_ >= 2)
-            log->critical("[{}] sending abort after done", comm.rank());
-        last_trial_ = -1;       // all future aborts for this trial will be stale
-    }
-}
-
-void
-diy::Master::IExchangeInfoDUD::
-process_done(int source, int trial)
-{
-    if (trial < -1)
-    {
-        log->critical("[{}] done with source = {}, trial = {}", comm.rank(), source, trial);
-        assert(trial >= -1);
-    }
-
-    while(nudge());     // clear up finished requests: this is necessary since requests may have been received;
-                        // we are now getting responses to them; but they are still listed in our inflight_
-
-    if (source == parent())
-    {
-        if (trial == last_trial_)       // confirmation that we are done
-        {
-            assert(down_up_down_ == 2);
-            log->info("[{}] received done confirmation from parent, trial = {}; incomplete = {}, subtree = {}, stale = {}",
-                      comm.rank(), trial, incomplete(), subtree_work_, stale());
-            down_up_down_ = 3;
-            assert(!incomplete() && !stale());
-        } else
-        {
-            last_trial_ = trial;
-            down_up_down_ = 1;
-
-            // check that there are no changes
-            if (incomplete() || stale())
-                abort(trial);
-        }
-
-        // pass the message to children
-        if (down_up_down_ > 0)
-        {
-            reset_child_confirmations();
-            if (child_confirmations)
-            {
-                log->info("[{}] signalling done to children, trial = {}", comm.rank(), trial);
-                signal_children(type::done, trial);
-            }
-            else if (down_up_down_ < 2)     // no children, signal back to parent right away, unless it was the final done
-            {
-                down_up_down_ = 2;
-                log->info("[{}] signalling done to parent (1), trial = {}, incomplete = {}", comm.rank(), trial, incomplete());
-                send(parent(), type::done, trial);
-            }
-        }
-    } else // signal from a child
-    {
-        if (trial == last_trial_)
-        {
-            int child_mask = right_bit(source);
-            child_confirmations &= ~child_mask;
-            if (child_confirmations == 0)       // done
-            {
-                if (comm.rank() != 0)
-                {
-                    if (incomplete() || stale())        // heard from all the children, but there is something incomplete
-                        abort(trial);
-                    else
-                    {
-                        down_up_down_ = 2;
-                        log->info("[{}] signalling done to parent (2), trial = {}, incomplete = {}", comm.rank(), trial, incomplete());
-                        send(parent(), type::done, trial);
-                    }
-                }
-                else if (down_up_down_ == 1)
-                {
-                    log->info("[{}] received done confirmation from children at root, trial = {}", comm.rank(), trial);
-                    // initiate final down
-                    down_up_down_ = 3;
-                    signal_children(type::done, trial);
-                }
-            }
-        } // else stale trial confirmation, drop
-    }
-}
-
-void
-diy::Master::IExchangeInfoDUD::
-reset_child_confirmations()
-{
-    child_confirmations = 0;
-    int child_mask = 1;
-    int child = child_mask | comm.rank();
-    while (child != comm.rank() && child < comm.size())
-    {
-        child_confirmations |= child_mask;
-
-        child_mask <<= 1;
-        child = child_mask | comm.rank();
-    }
-}
-
-void
-diy::Master::IExchangeInfoDUD::
-signal_children(int tag, int x)
-{
-    int child_mask = 1;
-    int child = child_mask | comm.rank();
-    while (child != comm.rank() && child < comm.size())
-    {
-        send(child, tag, x);
-
-        child_mask <<= 1;
-        child = child_mask | comm.rank();
-    }
-}
-
-bool
-diy::Master::IExchangeInfoDUD::
-nudge()
-{
-  bool success = false;
-  for (auto it = inflight_.begin(); it != inflight_.end();)
-  {
-    mpi::optional<mpi::status> ostatus = it->request.test();
-    if (ostatus)
-    {
-      success = true;
-      it = inflight_.erase(it);
-    }
-    else
-      ++it;
-  }
-  return success;
-}
-
-
diff --git a/include/vtkdiy2/detail/master/iexchange.hpp b/include/vtkdiy2/detail/master/iexchange.hpp
index 38fe255c726..1d10c1b728a 100644
--- a/include/vtkdiy2/detail/master/iexchange.hpp
+++ b/include/vtkdiy2/detail/master/iexchange.hpp
@@ -5,17 +5,11 @@ namespace diy
       using   Clock   = std::chrono::high_resolution_clock;
       using   Time    = Clock::time_point;
 
-                        IExchangeInfo(mpi::communicator comm_, size_t min_queue_size, size_t max_hold_time, bool fine, stats::Profiler& prof_):
-                            comm(comm_),
-                            fine_(fine),
-                            min_queue_size_(min_queue_size),
-                            max_hold_time_(max_hold_time),
-                            prof(prof_)                         { time_stamp_send(); }
+                        IExchangeInfo(mpi::communicator c, stats::Profiler& p):
+                            comm(c),
+                            prof(p)                             {}
       virtual           ~IExchangeInfo()                        {}
 
-      void              not_done(int gid)                       { update_done(gid, false); }
-      inline void       update_done(int gid, bool done_);
-
       virtual bool      all_done() =0;                             // get global all done status
       virtual void      add_work(int work) =0;                     // add work to global work counter
       virtual void      control() =0;
@@ -23,43 +17,12 @@ namespace diy
       void              inc_work()                              { add_work(1); }   // increment work counter
       void              dec_work()                              { add_work(-1); }  // decremnent work counter
 
-      // shortcut
-      void              time_stamp_send()                       { time_last_send = Clock::now(); }
-      bool              hold(size_t queue_size)                 { return queue_size < min_queue_size_ && hold_time() < max_hold_time_; }
-      size_t            hold_time()                             { return std::chrono::duration_cast<std::chrono::milliseconds>(Clock::now() - time_last_send).count(); }
-      bool              fine() const                            { return fine_; }
-
       mpi::communicator                   comm;
-      std::unordered_map<int, bool>       done;                 // gid -> done
-
-      bool                                fine_ = false;
 
       std::shared_ptr<spd::logger>        log = get_logger();
-      Time                                time_last_send;       // time of last send from any queue in send_outgoing_queues()
-
-      size_t                              min_queue_size_;      // minimum short message size (bytes)
-      size_t                              max_hold_time_;       // maximum short message hold time (milliseconds)
-
-      int                                 from_gid = -1;        // gid of current block, for shortcut sending of only this block's queues
-
       stats::Profiler&                    prof;
     };
 }
 
-void
-diy::Master::IExchangeInfo::
-update_done(int gid, bool done_)
-{
-    if (done[gid] != done_)
-    {
-        done[gid] = done_;
-        if (done_)
-            dec_work();
-        else
-            inc_work();
-    }
-}
-
 
-#include "iexchange-dud.hpp"
 #include "iexchange-collective.hpp"
diff --git a/include/vtkdiy2/detail/reduce/all-to-all.hpp b/include/vtkdiy2/detail/reduce/all-to-all.hpp
index 7e30825474f..498917c1db7 100644
--- a/include/vtkdiy2/detail/reduce/all-to-all.hpp
+++ b/include/vtkdiy2/detail/reduce/all-to-all.hpp
@@ -23,31 +23,38 @@ namespace detail
       }
     }
 
-    void operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners& partners) const
+    void operator()(Block* b, const ReduceProxy& srp, const RegularSwapPartners&) const
     {
       int k_in  = srp.in_link().size();
       int k_out = srp.out_link().size();
 
       if (k_in == 0 && k_out == 0)  // special case of a single block
       {
-          ReduceProxy all_srp_out(srp, srp.block(), 0, srp.assigner(), empty_link,         all_neighbors_link);
-          ReduceProxy all_srp_in (srp, srp.block(), 1, srp.assigner(), all_neighbors_link, empty_link);
+          ReduceProxy all_srp(std::move(const_cast<ReduceProxy&>(srp)), srp.block(), 0, srp.assigner(), empty_link, all_neighbors_link);
 
-          op(b, all_srp_out);
-          MemoryBuffer& in_queue = all_srp_in.incoming(all_srp_in.in_link().target(0).gid);
-          in_queue.swap(all_srp_out.outgoing(all_srp_out.out_link().target(0)));
+          op(b, all_srp);
+
+          MemoryBuffer& in_queue = all_srp.incoming(all_srp.out_link().target(0).gid);
+          in_queue.swap(all_srp.outgoing(all_srp.out_link().target(0)));
           in_queue.reset();
+          all_srp.outgoing()->clear();
+
+          // change to incoming proxy
+          all_srp.set_round(1);
+          auto& in_link  = const_cast<Link&>(all_srp.in_link());
+          auto& out_link = const_cast<Link&>(all_srp.out_link());
+          in_link.swap(out_link);
 
-          op(b, all_srp_in);
+          op(b, all_srp);
           return;
       }
 
       if (k_in == 0)                // initial round
       {
-        ReduceProxy all_srp(srp, srp.block(), 0, srp.assigner(), empty_link, all_neighbors_link);
+        ReduceProxy all_srp(std::move(const_cast<ReduceProxy&>(srp)), srp.block(), 0, srp.assigner(), empty_link, all_neighbors_link);
         op(b, all_srp);
 
-        Master::OutgoingQueues all_queues;
+        Master::Proxy::OutgoingQueues all_queues;
         all_queues.swap(*all_srp.outgoing());       // clears out the queues and stores them locally
 
         // enqueue outgoing
@@ -67,10 +74,10 @@ namespace detail
       } else if (k_out == 0)        // final round
       {
         // dequeue incoming + reorder into the correct order
-        ReduceProxy all_srp(srp, srp.block(), 1, srp.assigner(), all_neighbors_link, empty_link);
+        ReduceProxy all_srp(std::move(const_cast<ReduceProxy&>(srp)), srp.block(), 1, srp.assigner(), all_neighbors_link, empty_link);
 
-        Master::IncomingQueues all_incoming;
-        all_incoming.swap(*srp.incoming());
+        Master::Proxy::IncomingQueues all_incoming;
+        all_incoming.swap(*all_srp.incoming());
 
         std::pair<int, int> range;      // all the ranges should be the same
         for (int i = 0; i < k_in; ++i)
diff --git a/include/vtkdiy2/dynamic-point.hpp b/include/vtkdiy2/dynamic-point.hpp
index 67d52b458a8..35dd4b6852d 100644
--- a/include/vtkdiy2/dynamic-point.hpp
+++ b/include/vtkdiy2/dynamic-point.hpp
@@ -7,7 +7,7 @@
 #include <algorithm>
 
 #include "constants.h"
-#include "itlib/small_vector.hpp"
+#include "thirdparty/itlib/small_vector.hpp"
 
 namespace diy
 {
@@ -23,10 +23,10 @@ class DynamicPoint: public itlib::small_vector<Coordinate_, static_size>
         struct rebind       { typedef DynamicPoint<U> type; };
 
     public:
-                            DynamicPoint(int dim, Coordinate x = 0):
+                            DynamicPoint(size_t dim, Coordinate x = 0):
                                 Parent(dim, x)                      {}
         template<class T>   DynamicPoint(const DynamicPoint<T>& p)  { for (size_t i = 0; i < dimension(); ++i) (*this)[i] = p[i]; }
-        template<class T>   DynamicPoint(const T* a, int dim)       { for (size_t i = 0; i < static_cast<size_t>(dim); ++i) (*this)[i] = a[i]; }
+        template<class T>   DynamicPoint(const T* a, size_t dim)       { for (size_t i = 0; i < dim; ++i) (*this)[i] = a[i]; }
         template<class T>   DynamicPoint(const std::vector<T>& a):
                                 Parent(a.begin(), a.end())          {}
                             DynamicPoint(std::initializer_list<Coordinate> lst):
@@ -36,13 +36,13 @@ class DynamicPoint: public itlib::small_vector<Coordinate_, static_size>
                             DynamicPoint(const DynamicPoint&)       =default;
         DynamicPoint&       operator=(const DynamicPoint&)          =default;
 
-        unsigned            dimension() const                       { return Parent::size(); }
+        unsigned            dimension() const                       { return static_cast<unsigned>(Parent::size()); }
 
-        static DynamicPoint zero(int dim)                           { return DynamicPoint(dim, 0); }
-        static DynamicPoint one(int dim)                            { return DynamicPoint(dim, 1); }
+        static DynamicPoint zero(size_t dim)                           { return DynamicPoint(dim, 0); }
+        static DynamicPoint one(size_t dim)                            { return DynamicPoint(dim, 1); }
 
-        DynamicPoint        drop(int dim) const                     { DynamicPoint p(dimension() - 1); size_t c = 0; for (size_t i = 0; i < dimension();   ++i) { if (i == dim) continue; p[c++] = (*this)[i]; } return p; }
-        DynamicPoint        lift(int dim, Coordinate x) const       { DynamicPoint p(dimension() + 1); for (size_t i = 0; i < dimension()+1; ++i) { if (i < dim) p[i] = (*this)[i]; else if (i == dim) p[i] = x; else if (i > dim) p[i] = (*this)[i-1]; } return p; }
+        DynamicPoint        drop(size_t dim) const                     { DynamicPoint p(dimension() - 1); size_t c = 0; for (size_t i = 0; i < dimension(); ++i) { if (i == dim) continue; p[c++] = (*this)[i]; } return p; }
+        DynamicPoint        lift(size_t dim, Coordinate x) const       { DynamicPoint p(dimension() + 1); for (size_t i = 0; i < dimension()+1; ++i) { if (i < dim) p[i] = (*this)[i]; else if (i == dim) p[i] = x; else if (i > dim) p[i] = (*this)[i-1]; } return p; }
 
         using Parent::operator[];
 
diff --git a/include/vtkdiy2/factory.hpp b/include/vtkdiy2/factory.hpp
index 90d4ff37fff..a89a1eb5100 100644
--- a/include/vtkdiy2/factory.hpp
+++ b/include/vtkdiy2/factory.hpp
@@ -26,8 +26,6 @@ class Factory
         template <class T>
         struct Registrar: Base
         {
-            friend T;
-
             static bool registerT()
             {
                 const auto name = typeid(T).name();
@@ -37,17 +35,29 @@ class Factory
                 };
                 return true;
             }
-            static bool registered;
+            static volatile bool registered;
 
             std::string id() const override     { return typeid(T).name(); }
 
+#if defined(__NVCC__)
+	    protected:
+#else
             private:
+              friend T;
+#endif
+#if defined(__INTEL_COMPILER)
+                __attribute__ ((used))
+#endif
                 Registrar(): Base(Key{}) { (void)registered; }
         };
 
-        friend Base;
 
+#if defined(__NVCC__)
+    protected:
+#else
     private:
+      friend Base;
+#endif
         class Key
         {
             Key(){};
@@ -67,7 +77,7 @@ class Factory
 
 template <class Base, class... Args>
 template <class T>
-bool Factory<Base, Args...>::Registrar<T>::registered = Factory<Base, Args...>::Registrar<T>::registerT();
+volatile bool Factory<Base, Args...>::Registrar<T>::registered = Factory<Base, Args...>::Registrar<T>::registerT();
 
 }
 
diff --git a/include/vtkdiy2/fmt/compile.h b/include/vtkdiy2/fmt/compile.h
deleted file mode 100644
index 82625bbc657..00000000000
--- a/include/vtkdiy2/fmt/compile.h
+++ /dev/null
@@ -1,466 +0,0 @@
-// Formatting library for C++ - experimental format string compilation
-//
-// Copyright (c) 2012 - present, Victor Zverovich and fmt contributors
-// All rights reserved.
-//
-// For the license information refer to format.h.
-
-#ifndef FMT_COMPILE_H_
-#define FMT_COMPILE_H_
-
-#include <vector>
-#include "format.h"
-
-FMT_BEGIN_NAMESPACE
-namespace internal {
-
-template <typename Char> struct format_part {
- public:
-  struct named_argument_id {
-    FMT_CONSTEXPR named_argument_id(internal::string_view_metadata id)
-        : id(id) {}
-    internal::string_view_metadata id;
-  };
-
-  struct argument_id {
-    FMT_CONSTEXPR argument_id() : argument_id(0u) {}
-
-    FMT_CONSTEXPR argument_id(unsigned id)
-        : which(which_arg_id::index), val(id) {}
-
-    FMT_CONSTEXPR argument_id(internal::string_view_metadata id)
-        : which(which_arg_id::named_index), val(id) {}
-
-    enum class which_arg_id { index, named_index };
-
-    which_arg_id which;
-
-    union value {
-      FMT_CONSTEXPR value() : index(0u) {}
-      FMT_CONSTEXPR value(unsigned id) : index(id) {}
-      FMT_CONSTEXPR value(internal::string_view_metadata id)
-          : named_index(id) {}
-
-      unsigned index;
-      internal::string_view_metadata named_index;
-    } val;
-  };
-
-  struct specification {
-    FMT_CONSTEXPR specification() : arg_id(0u) {}
-    FMT_CONSTEXPR specification(unsigned id) : arg_id(id) {}
-
-    FMT_CONSTEXPR specification(internal::string_view_metadata id)
-        : arg_id(id) {}
-
-    argument_id arg_id;
-    internal::dynamic_format_specs<Char> parsed_specs;
-  };
-
-  FMT_CONSTEXPR format_part()
-      : which(kind::argument_id), end_of_argument_id(0u), val(0u) {}
-
-  FMT_CONSTEXPR format_part(internal::string_view_metadata text)
-      : which(kind::text), end_of_argument_id(0u), val(text) {}
-
-  FMT_CONSTEXPR format_part(unsigned id)
-      : which(kind::argument_id), end_of_argument_id(0u), val(id) {}
-
-  FMT_CONSTEXPR format_part(named_argument_id arg_id)
-      : which(kind::named_argument_id), end_of_argument_id(0u), val(arg_id) {}
-
-  FMT_CONSTEXPR format_part(specification spec)
-      : which(kind::specification), end_of_argument_id(0u), val(spec) {}
-
-  enum class kind { argument_id, named_argument_id, text, specification };
-
-  kind which;
-  std::size_t end_of_argument_id;
-  union value {
-    FMT_CONSTEXPR value() : arg_id(0u) {}
-    FMT_CONSTEXPR value(unsigned id) : arg_id(id) {}
-    FMT_CONSTEXPR value(named_argument_id named_id)
-        : named_arg_id(named_id.id) {}
-    FMT_CONSTEXPR value(internal::string_view_metadata t) : text(t) {}
-    FMT_CONSTEXPR value(specification s) : spec(s) {}
-    unsigned arg_id;
-    internal::string_view_metadata named_arg_id;
-    internal::string_view_metadata text;
-    specification spec;
-  } val;
-};
-
-template <typename Char, typename PartsContainer>
-class format_preparation_handler : public internal::error_handler {
- private:
-  using part = format_part<Char>;
-
- public:
-  using iterator = typename basic_string_view<Char>::iterator;
-
-  FMT_CONSTEXPR format_preparation_handler(basic_string_view<Char> format,
-                                           PartsContainer& parts)
-      : parts_(parts), format_(format), parse_context_(format) {}
-
-  FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
-    if (begin == end) return;
-    const auto offset = begin - format_.data();
-    const auto size = end - begin;
-    parts_.push_back(part(string_view_metadata(offset, size)));
-  }
-
-  FMT_CONSTEXPR void on_arg_id() {
-    parts_.push_back(part(parse_context_.next_arg_id()));
-  }
-
-  FMT_CONSTEXPR void on_arg_id(unsigned id) {
-    parse_context_.check_arg_id(id);
-    parts_.push_back(part(id));
-  }
-
-  FMT_CONSTEXPR void on_arg_id(basic_string_view<Char> id) {
-    const auto view = string_view_metadata(format_, id);
-    const auto arg_id = typename part::named_argument_id(view);
-    parts_.push_back(part(arg_id));
-  }
-
-  FMT_CONSTEXPR void on_replacement_field(const Char* ptr) {
-    parts_.back().end_of_argument_id = ptr - format_.begin();
-  }
-
-  FMT_CONSTEXPR const Char* on_format_specs(const Char* begin,
-                                            const Char* end) {
-    const auto specs_offset = to_unsigned(begin - format_.begin());
-
-    using parse_context = basic_parse_context<Char>;
-    internal::dynamic_format_specs<Char> parsed_specs;
-    dynamic_specs_handler<parse_context> handler(parsed_specs, parse_context_);
-    begin = parse_format_specs(begin, end, handler);
-
-    if (*begin != '}') on_error("missing '}' in format string");
-
-    auto& last_part = parts_.back();
-    auto specs = last_part.which == part::kind::argument_id
-                     ? typename part::specification(last_part.val.arg_id)
-                     : typename part::specification(last_part.val.named_arg_id);
-    specs.parsed_specs = parsed_specs;
-    last_part = part(specs);
-    last_part.end_of_argument_id = specs_offset;
-    return begin;
-  }
-
- private:
-  PartsContainer& parts_;
-  basic_string_view<Char> format_;
-  basic_parse_context<Char> parse_context_;
-};
-
-template <typename Format, typename PreparedPartsProvider, typename... Args>
-class prepared_format {
- public:
-  using char_type = char_t<Format>;
-  using format_part_t = format_part<char_type>;
-
-  constexpr prepared_format(Format f)
-      : format_(std::move(f)), parts_provider_(to_string_view(format_)) {}
-
-  prepared_format() = delete;
-
-  using context = buffer_context<char_type>;
-
-  template <typename Range, typename Context>
-  auto vformat_to(Range out, basic_format_args<Context> args) const ->
-      typename Context::iterator {
-    const auto format_view = internal::to_string_view(format_);
-    basic_parse_context<char_type> parse_ctx(format_view);
-    Context ctx(out.begin(), args);
-
-    const auto& parts = parts_provider_.parts();
-    for (auto part_it = parts.begin(); part_it != parts.end(); ++part_it) {
-      const auto& part = *part_it;
-      const auto& value = part.val;
-
-      switch (part.which) {
-      case format_part_t::kind::text: {
-        const auto text = value.text.to_view(format_view.data());
-        auto output = ctx.out();
-        auto&& it = internal::reserve(output, text.size());
-        it = std::copy_n(text.begin(), text.size(), it);
-        ctx.advance_to(output);
-      } break;
-
-      case format_part_t::kind::argument_id: {
-        advance_parse_context_to_specification(parse_ctx, part);
-        format_arg<Range>(parse_ctx, ctx, value.arg_id);
-      } break;
-
-      case format_part_t::kind::named_argument_id: {
-        advance_parse_context_to_specification(parse_ctx, part);
-        const auto named_arg_id =
-            value.named_arg_id.to_view(format_view.data());
-        format_arg<Range>(parse_ctx, ctx, named_arg_id);
-      } break;
-      case format_part_t::kind::specification: {
-        const auto& arg_id_value = value.spec.arg_id.val;
-        const auto arg = value.spec.arg_id.which ==
-                                 format_part_t::argument_id::which_arg_id::index
-                             ? ctx.arg(arg_id_value.index)
-                             : ctx.arg(arg_id_value.named_index.to_view(
-                                   to_string_view(format_).data()));
-
-        auto specs = value.spec.parsed_specs;
-
-        handle_dynamic_spec<internal::width_checker>(
-            specs.width, specs.width_ref, ctx, format_view.begin());
-        handle_dynamic_spec<internal::precision_checker>(
-            specs.precision, specs.precision_ref, ctx, format_view.begin());
-
-        check_prepared_specs(specs, arg.type());
-        advance_parse_context_to_specification(parse_ctx, part);
-        ctx.advance_to(
-            visit_format_arg(arg_formatter<Range>(ctx, nullptr, &specs), arg));
-      } break;
-      }
-    }
-
-    return ctx.out();
-  }
-
- private:
-  void advance_parse_context_to_specification(
-      basic_parse_context<char_type>& parse_ctx,
-      const format_part_t& part) const {
-    const auto view = to_string_view(format_);
-    const auto specification_begin = view.data() + part.end_of_argument_id;
-    advance_to(parse_ctx, specification_begin);
-  }
-
-  template <typename Range, typename Context, typename Id>
-  void format_arg(basic_parse_context<char_type>& parse_ctx, Context& ctx,
-                  Id arg_id) const {
-    parse_ctx.check_arg_id(arg_id);
-    const auto stopped_at =
-        visit_format_arg(arg_formatter<Range>(ctx), ctx.arg(arg_id));
-    ctx.advance_to(stopped_at);
-  }
-
-  template <typename Char>
-  void check_prepared_specs(const basic_format_specs<Char>& specs,
-                            internal::type arg_type) const {
-    internal::error_handler h;
-    numeric_specs_checker<internal::error_handler> checker(h, arg_type);
-    if (specs.align == align::numeric) checker.require_numeric_argument();
-    if (specs.sign != sign::none) checker.check_sign();
-    if (specs.alt) checker.require_numeric_argument();
-    if (specs.precision >= 0) checker.check_precision();
-  }
-
- private:
-  Format format_;
-  PreparedPartsProvider parts_provider_;
-};
-
-template <typename Char> struct part_counter {
-  unsigned num_parts = 0;
-
-  FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
-    if (begin != end) ++num_parts;
-  }
-
-  FMT_CONSTEXPR void on_arg_id() { ++num_parts; }
-  FMT_CONSTEXPR void on_arg_id(unsigned) { ++num_parts; }
-  FMT_CONSTEXPR void on_arg_id(basic_string_view<Char>) { ++num_parts; }
-
-  FMT_CONSTEXPR void on_replacement_field(const Char*) {}
-
-  FMT_CONSTEXPR const Char* on_format_specs(const Char* begin,
-                                            const Char* end) {
-    // Find the matching brace.
-    unsigned braces_counter = 0;
-    for (; begin != end; ++begin) {
-      if (*begin == '{') {
-        ++braces_counter;
-      } else if (*begin == '}') {
-        if (braces_counter == 0u) break;
-        --braces_counter;
-      }
-    }
-    return begin;
-  }
-
-  FMT_CONSTEXPR void on_error(const char*) {}
-};
-
-template <typename Format> class compiletime_prepared_parts_type_provider {
- private:
-  using char_type = char_t<Format>;
-
-  static FMT_CONSTEXPR unsigned count_parts() {
-    FMT_CONSTEXPR_DECL const auto text = to_string_view(Format{});
-    part_counter<char_type> counter;
-    internal::parse_format_string</*IS_CONSTEXPR=*/true>(text, counter);
-    return counter.num_parts;
-  }
-
-// Workaround for old compilers. Compiletime parts preparation will not be
-// performed with them anyway.
-#if FMT_USE_CONSTEXPR
-  static FMT_CONSTEXPR_DECL const unsigned number_of_format_parts =
-      compiletime_prepared_parts_type_provider::count_parts();
-#else
-  static const unsigned number_of_format_parts = 0u;
-#endif
-
- public:
-  template <unsigned N> struct format_parts_array {
-    using value_type = format_part<char_type>;
-
-    FMT_CONSTEXPR format_parts_array() : arr{} {}
-
-    FMT_CONSTEXPR value_type& operator[](unsigned ind) { return arr[ind]; }
-
-    FMT_CONSTEXPR const value_type* begin() const { return arr; }
-    FMT_CONSTEXPR const value_type* end() const { return begin() + N; }
-
-   private:
-    value_type arr[N];
-  };
-
-  struct empty {
-    // Parts preparator will search for it
-    using value_type = format_part<char_type>;
-  };
-
-  using type = conditional_t<number_of_format_parts != 0,
-                             format_parts_array<number_of_format_parts>, empty>;
-};
-
-template <typename Parts> class compiletime_prepared_parts_collector {
- private:
-  using format_part = typename Parts::value_type;
-
- public:
-  FMT_CONSTEXPR explicit compiletime_prepared_parts_collector(Parts& parts)
-      : parts_{parts}, counter_{0u} {}
-
-  FMT_CONSTEXPR void push_back(format_part part) { parts_[counter_++] = part; }
-
-  FMT_CONSTEXPR format_part& back() { return parts_[counter_ - 1]; }
-
- private:
-  Parts& parts_;
-  unsigned counter_;
-};
-
-template <typename PartsContainer, typename Char>
-FMT_CONSTEXPR PartsContainer prepare_parts(basic_string_view<Char> format) {
-  PartsContainer parts;
-  internal::parse_format_string</*IS_CONSTEXPR=*/false>(
-      format, format_preparation_handler<Char, PartsContainer>(format, parts));
-  return parts;
-}
-
-template <typename PartsContainer, typename Char>
-FMT_CONSTEXPR PartsContainer
-prepare_compiletime_parts(basic_string_view<Char> format) {
-  using collector = compiletime_prepared_parts_collector<PartsContainer>;
-
-  PartsContainer parts;
-  collector c(parts);
-  internal::parse_format_string</*IS_CONSTEXPR=*/true>(
-      format, format_preparation_handler<Char, collector>(format, c));
-  return parts;
-}
-
-template <typename PartsContainer> class runtime_parts_provider {
- public:
-  runtime_parts_provider() = delete;
-  template <typename Char>
-  runtime_parts_provider(basic_string_view<Char> format)
-      : parts_(prepare_parts<PartsContainer>(format)) {}
-
-  const PartsContainer& parts() const { return parts_; }
-
- private:
-  PartsContainer parts_;
-};
-
-template <typename Format, typename PartsContainer>
-struct compiletime_parts_provider {
-  compiletime_parts_provider() = delete;
-  template <typename Char>
-  FMT_CONSTEXPR compiletime_parts_provider(basic_string_view<Char>) {}
-
-  const PartsContainer& parts() const {
-    static FMT_CONSTEXPR_DECL const PartsContainer prepared_parts =
-        prepare_compiletime_parts<PartsContainer>(
-            internal::to_string_view(Format{}));
-
-    return prepared_parts;
-  }
-};
-}  // namespace internal
-
-#if FMT_USE_CONSTEXPR
-template <typename... Args, typename S,
-          FMT_ENABLE_IF(is_compile_string<S>::value)>
-FMT_CONSTEXPR auto compile(S format_str) -> internal::prepared_format<
-    S,
-    internal::compiletime_parts_provider<
-        S,
-        typename internal::compiletime_prepared_parts_type_provider<S>::type>,
-    Args...> {
-  return format_str;
-}
-#endif
-
-template <typename... Args, typename Char, size_t N>
-auto compile(const Char (&format_str)[N]) -> internal::prepared_format<
-    std::basic_string<Char>,
-    internal::runtime_parts_provider<std::vector<internal::format_part<Char>>>,
-    Args...> {
-  return std::basic_string<Char>(format_str, N - 1);
-}
-
-template <typename CompiledFormat, typename... Args,
-          typename Char = typename CompiledFormat::char_type>
-std::basic_string<Char> format(const CompiledFormat& cf, const Args&... args) {
-  basic_memory_buffer<Char> buffer;
-  using range = internal::buffer_range<Char>;
-  using context = buffer_context<Char>;
-  cf.template vformat_to<range, context>(range(buffer),
-                                         {make_format_args<context>(args...)});
-  return to_string(buffer);
-}
-
-template <typename OutputIt, typename CompiledFormat, typename... Args>
-OutputIt format_to(OutputIt out, const CompiledFormat& cf,
-                   const Args&... args) {
-  using char_type = typename CompiledFormat::char_type;
-  using range = internal::output_range<OutputIt, char_type>;
-  using context = format_context_t<OutputIt, char_type>;
-  return cf.template vformat_to<range, context>(
-      range(out), {make_format_args<context>(args...)});
-}
-
-template <typename OutputIt, typename CompiledFormat, typename... Args,
-          FMT_ENABLE_IF(internal::is_output_iterator<OutputIt>::value)>
-format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
-                                         const CompiledFormat& cf,
-                                         const Args&... args) {
-  auto it =
-      format_to(internal::truncating_iterator<OutputIt>(out, n), cf, args...);
-  return {it.base(), it.count()};
-}
-
-template <typename CompiledFormat, typename... Args>
-std::size_t formatted_size(const CompiledFormat& cf, const Args&... args) {
-  return fmt::format_to(
-             internal::counting_iterator<typename CompiledFormat::char_type>(),
-             cf, args...)
-      .count();
-}
-
-FMT_END_NAMESPACE
-
-#endif  // FMT_COMPILE_H_
diff --git a/include/vtkdiy2/fmt/core.h b/include/vtkdiy2/fmt/core.h
deleted file mode 100644
index 2dc6ed552bb..00000000000
--- a/include/vtkdiy2/fmt/core.h
+++ /dev/null
@@ -1,1420 +0,0 @@
-// Formatting library for C++ - the core API
-//
-// Copyright (c) 2012 - present, Victor Zverovich
-// All rights reserved.
-//
-// For the license information refer to format.h.
-
-#ifndef FMT_CORE_H_
-#define FMT_CORE_H_
-
-#include <cassert>
-#include <cstdio>  // std::FILE
-#include <cstring>
-#include <iterator>
-#include <string>
-#include <type_traits>
-
-// The fmt library version in the form major * 10000 + minor * 100 + patch.
-#define FMT_VERSION 60000
-
-#ifdef __has_feature
-#  define FMT_HAS_FEATURE(x) __has_feature(x)
-#else
-#  define FMT_HAS_FEATURE(x) 0
-#endif
-
-#if defined(__has_include) && !defined(__INTELLISENSE__) && \
-    !(defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1600)
-#  define FMT_HAS_INCLUDE(x) __has_include(x)
-#else
-#  define FMT_HAS_INCLUDE(x) 0
-#endif
-
-#ifdef __has_cpp_attribute
-#  define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
-#else
-#  define FMT_HAS_CPP_ATTRIBUTE(x) 0
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__)
-#  define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#else
-#  define FMT_GCC_VERSION 0
-#endif
-
-#if __cplusplus >= 201103L || defined(__GXX_EXPERIMENTAL_CXX0X__)
-#  define FMT_HAS_GXX_CXX11 FMT_GCC_VERSION
-#else
-#  define FMT_HAS_GXX_CXX11 0
-#endif
-
-#ifdef _MSC_VER
-#  define FMT_MSC_VER _MSC_VER
-#else
-#  define FMT_MSC_VER 0
-#endif
-
-// Check if relaxed C++14 constexpr is supported.
-// GCC doesn't allow throw in constexpr until version 6 (bug 67371).
-#ifndef FMT_USE_CONSTEXPR
-#  define FMT_USE_CONSTEXPR                                           \
-    (FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VER >= 1910 || \
-     (FMT_GCC_VERSION >= 600 && __cplusplus >= 201402L))
-#endif
-#if FMT_USE_CONSTEXPR
-#  define FMT_CONSTEXPR constexpr
-#  define FMT_CONSTEXPR_DECL constexpr
-#else
-#  define FMT_CONSTEXPR inline
-#  define FMT_CONSTEXPR_DECL
-#endif
-
-#ifndef FMT_OVERRIDE
-#  if FMT_HAS_FEATURE(cxx_override) || \
-      (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1900
-#    define FMT_OVERRIDE override
-#  else
-#    define FMT_OVERRIDE
-#  endif
-#endif
-
-// Check if exceptions are disabled.
-#ifndef FMT_EXCEPTIONS
-#  if (defined(__GNUC__) && !defined(__EXCEPTIONS)) || \
-      FMT_MSC_VER && !_HAS_EXCEPTIONS
-#    define FMT_EXCEPTIONS 0
-#  else
-#    define FMT_EXCEPTIONS 1
-#  endif
-#endif
-
-// Define FMT_USE_NOEXCEPT to make fmt use noexcept (C++11 feature).
-#ifndef FMT_USE_NOEXCEPT
-#  define FMT_USE_NOEXCEPT 0
-#endif
-
-#if FMT_USE_NOEXCEPT || FMT_HAS_FEATURE(cxx_noexcept) || \
-    (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1900
-#  define FMT_DETECTED_NOEXCEPT noexcept
-#  define FMT_HAS_CXX11_NOEXCEPT 1
-#else
-#  define FMT_DETECTED_NOEXCEPT throw()
-#  define FMT_HAS_CXX11_NOEXCEPT 0
-#endif
-
-#ifndef FMT_NOEXCEPT
-#  if FMT_EXCEPTIONS || FMT_HAS_CXX11_NOEXCEPT
-#    define FMT_NOEXCEPT FMT_DETECTED_NOEXCEPT
-#  else
-#    define FMT_NOEXCEPT
-#  endif
-#endif
-
-// [[noreturn]] is disabled on MSVC because of bogus unreachable code warnings.
-#if FMT_EXCEPTIONS && FMT_HAS_CPP_ATTRIBUTE(noreturn) && !FMT_MSC_VER
-#  define FMT_NORETURN [[noreturn]]
-#else
-#  define FMT_NORETURN
-#endif
-
-#ifndef FMT_DEPRECATED
-#  if (FMT_HAS_CPP_ATTRIBUTE(deprecated) && __cplusplus >= 201402L) || \
-      FMT_MSC_VER >= 1900
-#    define FMT_DEPRECATED [[deprecated]]
-#  else
-#    if defined(__GNUC__) || defined(__clang__)
-#      define FMT_DEPRECATED __attribute__((deprecated))
-#    elif FMT_MSC_VER
-#      define FMT_DEPRECATED __declspec(deprecated)
-#    else
-#      define FMT_DEPRECATED /* deprecated */
-#    endif
-#  endif
-#endif
-// Workaround broken [[deprecated]] in the Intel compiler.
-#ifdef __INTEL_COMPILER
-#  define FMT_DEPRECATED_ALIAS
-#else
-#  define FMT_DEPRECATED_ALIAS FMT_DEPRECATED
-#endif
-
-#ifndef FMT_BEGIN_NAMESPACE
-#  if FMT_HAS_FEATURE(cxx_inline_namespaces) || FMT_GCC_VERSION >= 404 || \
-      FMT_MSC_VER >= 1900
-#    define FMT_INLINE_NAMESPACE inline namespace
-#    define FMT_END_NAMESPACE \
-      }                       \
-      }
-#  else
-#    define FMT_INLINE_NAMESPACE namespace
-#    define FMT_END_NAMESPACE \
-      }                       \
-      using namespace v6;     \
-      }
-#  endif
-#  define FMT_BEGIN_NAMESPACE \
-    namespace fmt {           \
-    FMT_INLINE_NAMESPACE v6 {
-#endif
-
-#if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
-#  ifdef FMT_EXPORT
-#    define FMT_API __declspec(dllexport)
-#  elif defined(FMT_SHARED)
-#    define FMT_API __declspec(dllimport)
-#    define FMT_EXTERN_TEMPLATE_API FMT_API
-#  endif
-#endif
-#ifndef FMT_API
-#  define FMT_API
-#endif
-#ifndef FMT_EXTERN_TEMPLATE_API
-#  define FMT_EXTERN_TEMPLATE_API
-#endif
-
-#ifndef FMT_HEADER_ONLY
-#  define FMT_EXTERN extern
-#else
-#  define FMT_EXTERN
-#endif
-
-#ifndef FMT_ASSERT
-#  define FMT_ASSERT(condition, message) assert((condition) && message)
-#endif
-
-// libc++ supports string_view in pre-c++17.
-#if (FMT_HAS_INCLUDE(<string_view>) &&                       \
-     (__cplusplus > 201402L || defined(_LIBCPP_VERSION))) || \
-    (defined(_MSVC_LANG) && _MSVC_LANG > 201402L && _MSC_VER >= 1910)
-#  include <string_view>
-#  define FMT_USE_STRING_VIEW
-#elif FMT_HAS_INCLUDE("experimental/string_view") && __cplusplus >= 201402L
-#  include <experimental/string_view>
-#  define FMT_USE_EXPERIMENTAL_STRING_VIEW
-#endif
-
-FMT_BEGIN_NAMESPACE
-
-// Implementations of enable_if_t and other types for pre-C++14 systems.
-template <bool B, class T = void>
-using enable_if_t = typename std::enable_if<B, T>::type;
-template <bool B, class T, class F>
-using conditional_t = typename std::conditional<B, T, F>::type;
-template <bool B> using bool_constant = std::integral_constant<bool, B>;
-template <typename T>
-using remove_reference_t = typename std::remove_reference<T>::type;
-template <typename T>
-using remove_const_t = typename std::remove_const<T>::type;
-
-struct monostate {};
-
-// An enable_if helper to be used in template parameters which results in much
-// shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed
-// to workaround a bug in MSVC 2019 (see #1140 and #1186).
-#define FMT_ENABLE_IF(...) enable_if_t<(__VA_ARGS__), int> = 0
-
-namespace internal {
-
-// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
-template <typename... Ts> struct void_t_impl { using type = void; };
-
-#if defined(FMT_USE_STRING_VIEW)
-template <typename Char> using std_string_view = std::basic_string_view<Char>;
-#elif defined(FMT_USE_EXPERIMENTAL_STRING_VIEW)
-template <typename Char>
-using std_string_view = std::experimental::basic_string_view<Char>;
-#else
-template <typename T> struct std_string_view {};
-#endif
-
-// Casts nonnegative integer to unsigned.
-template <typename Int>
-FMT_CONSTEXPR typename std::make_unsigned<Int>::type to_unsigned(Int value) {
-  FMT_ASSERT(value >= 0, "negative value");
-  return static_cast<typename std::make_unsigned<Int>::type>(value);
-}
-}  // namespace internal
-
-template <typename... Ts>
-using void_t = typename internal::void_t_impl<Ts...>::type;
-
-/**
-  An implementation of ``std::basic_string_view`` for pre-C++17. It provides a
-  subset of the API. ``fmt::basic_string_view`` is used for format strings even
-  if ``std::string_view`` is available to prevent issues when a library is
-  compiled with a different ``-std`` option than the client code (which is not
-  recommended).
- */
-template <typename Char> class basic_string_view {
- private:
-  const Char* data_;
-  size_t size_;
-
- public:
-  using char_type = Char;
-  using iterator = const Char*;
-
-  FMT_CONSTEXPR basic_string_view() FMT_NOEXCEPT : data_(nullptr), size_(0) {}
-
-  /** Constructs a string reference object from a C string and a size. */
-  FMT_CONSTEXPR basic_string_view(const Char* s, size_t count) FMT_NOEXCEPT
-      : data_(s),
-        size_(count) {}
-
-  /**
-    \rst
-    Constructs a string reference object from a C string computing
-    the size with ``std::char_traits<Char>::length``.
-    \endrst
-   */
-  basic_string_view(const Char* s)
-      : data_(s), size_(std::char_traits<Char>::length(s)) {}
-
-  /** Constructs a string reference from a ``std::basic_string`` object. */
-  template <typename Alloc>
-  FMT_CONSTEXPR basic_string_view(const std::basic_string<Char, Alloc>& s)
-      FMT_NOEXCEPT : data_(s.data()),
-                     size_(s.size()) {}
-
-  template <
-      typename S,
-      FMT_ENABLE_IF(std::is_same<S, internal::std_string_view<Char>>::value)>
-  FMT_CONSTEXPR basic_string_view(S s) FMT_NOEXCEPT : data_(s.data()),
-                                                      size_(s.size()) {}
-
-  /** Returns a pointer to the string data. */
-  FMT_CONSTEXPR const Char* data() const { return data_; }
-
-  /** Returns the string size. */
-  FMT_CONSTEXPR size_t size() const { return size_; }
-
-  FMT_CONSTEXPR iterator begin() const { return data_; }
-  FMT_CONSTEXPR iterator end() const { return data_ + size_; }
-
-  FMT_CONSTEXPR void remove_prefix(size_t n) {
-    data_ += n;
-    size_ -= n;
-  }
-
-  // Lexicographically compare this string reference to other.
-  int compare(basic_string_view other) const {
-    size_t str_size = size_ < other.size_ ? size_ : other.size_;
-    int result = std::char_traits<Char>::compare(data_, other.data_, str_size);
-    if (result == 0)
-      result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1);
-    return result;
-  }
-
-  friend bool operator==(basic_string_view lhs, basic_string_view rhs) {
-    return lhs.compare(rhs) == 0;
-  }
-  friend bool operator!=(basic_string_view lhs, basic_string_view rhs) {
-    return lhs.compare(rhs) != 0;
-  }
-  friend bool operator<(basic_string_view lhs, basic_string_view rhs) {
-    return lhs.compare(rhs) < 0;
-  }
-  friend bool operator<=(basic_string_view lhs, basic_string_view rhs) {
-    return lhs.compare(rhs) <= 0;
-  }
-  friend bool operator>(basic_string_view lhs, basic_string_view rhs) {
-    return lhs.compare(rhs) > 0;
-  }
-  friend bool operator>=(basic_string_view lhs, basic_string_view rhs) {
-    return lhs.compare(rhs) >= 0;
-  }
-};
-
-using string_view = basic_string_view<char>;
-using wstring_view = basic_string_view<wchar_t>;
-
-#ifndef __cpp_char8_t
-// A UTF-8 code unit type.
-enum char8_t : unsigned char {};
-#endif
-
-/** Specifies if ``T`` is a character type. Can be specialized by users. */
-template <typename T> struct is_char : std::false_type {};
-template <> struct is_char<char> : std::true_type {};
-template <> struct is_char<wchar_t> : std::true_type {};
-template <> struct is_char<char8_t> : std::true_type {};
-template <> struct is_char<char16_t> : std::true_type {};
-template <> struct is_char<char32_t> : std::true_type {};
-
-/**
-  \rst
-  Returns a string view of `s`. In order to add custom string type support to
-  {fmt} provide an overload of `to_string_view` for it in the same namespace as
-  the type for the argument-dependent lookup to work.
-
-  **Example**::
-
-    namespace my_ns {
-    inline string_view to_string_view(const my_string& s) {
-      return {s.data(), s.length()};
-    }
-    }
-    std::string message = fmt::format(my_string("The answer is {}"), 42);
-  \endrst
- */
-template <typename Char, FMT_ENABLE_IF(is_char<Char>::value)>
-inline basic_string_view<Char> to_string_view(const Char* s) {
-  return s;
-}
-
-template <typename Char, typename Traits, typename Allocator>
-inline basic_string_view<Char> to_string_view(
-    const std::basic_string<Char, Traits, Allocator>& s) {
-  return {s.data(), s.size()};
-}
-
-template <typename Char>
-inline basic_string_view<Char> to_string_view(basic_string_view<Char> s) {
-  return s;
-}
-
-template <typename Char,
-          FMT_ENABLE_IF(!std::is_empty<internal::std_string_view<Char>>::value)>
-inline basic_string_view<Char> to_string_view(
-    internal::std_string_view<Char> s) {
-  return s;
-}
-
-// A base class for compile-time strings. It is defined in the fmt namespace to
-// make formatting functions visible via ADL, e.g. format(fmt("{}"), 42).
-struct compile_string {};
-
-template <typename S>
-struct is_compile_string : std::is_base_of<compile_string, S> {};
-
-template <typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
-constexpr basic_string_view<typename S::char_type> to_string_view(const S& s) {
-  return s;
-}
-
-namespace internal {
-void to_string_view(...);
-using fmt::v6::to_string_view;
-
-// Specifies whether S is a string type convertible to fmt::basic_string_view.
-// It should be a constexpr function but MSVC 2017 fails to compile it in
-// enable_if and MSVC 2015 fails to compile it as an alias template.
-template <typename S>
-struct is_string : std::is_class<decltype(to_string_view(std::declval<S>()))> {
-};
-
-template <typename S, typename = void> struct char_t_impl {};
-template <typename S> struct char_t_impl<S, enable_if_t<is_string<S>::value>> {
-  using result = decltype(to_string_view(std::declval<S>()));
-  using type = typename result::char_type;
-};
-
-struct error_handler {
-  FMT_CONSTEXPR error_handler() {}
-  FMT_CONSTEXPR error_handler(const error_handler&) {}
-
-  // This function is intentionally not constexpr to give a compile-time error.
-  FMT_NORETURN FMT_API void on_error(const char* message);
-};
-}  // namespace internal
-
-/** String's character type. */
-template <typename S> using char_t = typename internal::char_t_impl<S>::type;
-
-// Parsing context consisting of a format string range being parsed and an
-// argument counter for automatic indexing.
-template <typename Char, typename ErrorHandler = internal::error_handler>
-class basic_parse_context : private ErrorHandler {
- private:
-  basic_string_view<Char> format_str_;
-  int next_arg_id_;
-
- public:
-  using char_type = Char;
-  using iterator = typename basic_string_view<Char>::iterator;
-
-  explicit FMT_CONSTEXPR basic_parse_context(basic_string_view<Char> format_str,
-                                             ErrorHandler eh = ErrorHandler())
-      : ErrorHandler(eh), format_str_(format_str), next_arg_id_(0) {}
-
-  // Returns an iterator to the beginning of the format string range being
-  // parsed.
-  FMT_CONSTEXPR iterator begin() const FMT_NOEXCEPT {
-    return format_str_.begin();
-  }
-
-  // Returns an iterator past the end of the format string range being parsed.
-  FMT_CONSTEXPR iterator end() const FMT_NOEXCEPT { return format_str_.end(); }
-
-  // Advances the begin iterator to ``it``.
-  FMT_CONSTEXPR void advance_to(iterator it) {
-    format_str_.remove_prefix(internal::to_unsigned(it - begin()));
-  }
-
-  // Returns the next argument index.
-  FMT_CONSTEXPR int next_arg_id() {
-    if (next_arg_id_ >= 0) return next_arg_id_++;
-    on_error("cannot switch from manual to automatic argument indexing");
-    return 0;
-  }
-
-  FMT_CONSTEXPR bool check_arg_id(int) {
-    if (next_arg_id_ > 0) {
-      on_error("cannot switch from automatic to manual argument indexing");
-      return false;
-    }
-    next_arg_id_ = -1;
-    return true;
-  }
-
-  FMT_CONSTEXPR void check_arg_id(basic_string_view<Char>) {}
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    ErrorHandler::on_error(message);
-  }
-
-  FMT_CONSTEXPR ErrorHandler error_handler() const { return *this; }
-};
-
-using format_parse_context = basic_parse_context<char>;
-using wformat_parse_context = basic_parse_context<wchar_t>;
-
-using parse_context FMT_DEPRECATED_ALIAS = basic_parse_context<char>;
-using wparse_context FMT_DEPRECATED_ALIAS = basic_parse_context<wchar_t>;
-
-template <typename Context> class basic_format_arg;
-template <typename Context> class basic_format_args;
-
-// A formatter for objects of type T.
-template <typename T, typename Char = char, typename Enable = void>
-struct formatter {
-  // A deleted default constructor indicates a disabled formatter.
-  formatter() = delete;
-};
-
-template <typename T, typename Char, typename Enable = void>
-struct FMT_DEPRECATED convert_to_int
-    : bool_constant<!std::is_arithmetic<T>::value &&
-                    std::is_convertible<T, int>::value> {};
-
-namespace internal {
-
-// Specifies if T has an enabled formatter specialization. A type can be
-// formattable even if it doesn't have a formatter e.g. via a conversion.
-template <typename T, typename Context>
-using has_formatter =
-    std::is_constructible<typename Context::template formatter_type<T>>;
-
-/** A contiguous memory buffer with an optional growing ability. */
-template <typename T> class buffer {
- private:
-  buffer(const buffer&) = delete;
-  void operator=(const buffer&) = delete;
-
-  T* ptr_;
-  std::size_t size_;
-  std::size_t capacity_;
-
- protected:
-  // Don't initialize ptr_ since it is not accessed to save a few cycles.
-  buffer(std::size_t sz) FMT_NOEXCEPT : size_(sz), capacity_(sz) {}
-
-  buffer(T* p = nullptr, std::size_t sz = 0, std::size_t cap = 0) FMT_NOEXCEPT
-      : ptr_(p),
-        size_(sz),
-        capacity_(cap) {}
-
-  /** Sets the buffer data and capacity. */
-  void set(T* buf_data, std::size_t buf_capacity) FMT_NOEXCEPT {
-    ptr_ = buf_data;
-    capacity_ = buf_capacity;
-  }
-
-  /** Increases the buffer capacity to hold at least *capacity* elements. */
-  virtual void grow(std::size_t capacity) = 0;
-
- public:
-  using value_type = T;
-  using const_reference = const T&;
-
-  virtual ~buffer() {}
-
-  T* begin() FMT_NOEXCEPT { return ptr_; }
-  T* end() FMT_NOEXCEPT { return ptr_ + size_; }
-
-  /** Returns the size of this buffer. */
-  std::size_t size() const FMT_NOEXCEPT { return size_; }
-
-  /** Returns the capacity of this buffer. */
-  std::size_t capacity() const FMT_NOEXCEPT { return capacity_; }
-
-  /** Returns a pointer to the buffer data. */
-  T* data() FMT_NOEXCEPT { return ptr_; }
-
-  /** Returns a pointer to the buffer data. */
-  const T* data() const FMT_NOEXCEPT { return ptr_; }
-
-  /**
-    Resizes the buffer. If T is a POD type new elements may not be initialized.
-   */
-  void resize(std::size_t new_size) {
-    reserve(new_size);
-    size_ = new_size;
-  }
-
-  /** Clears this buffer. */
-  void clear() { size_ = 0; }
-
-  /** Reserves space to store at least *capacity* elements. */
-  void reserve(std::size_t new_capacity) {
-    if (new_capacity > capacity_) grow(new_capacity);
-  }
-
-  void push_back(const T& value) {
-    reserve(size_ + 1);
-    ptr_[size_++] = value;
-  }
-
-  /** Appends data to the end of the buffer. */
-  template <typename U> void append(const U* begin, const U* end);
-
-  T& operator[](std::size_t index) { return ptr_[index]; }
-  const T& operator[](std::size_t index) const { return ptr_[index]; }
-};
-
-// A container-backed buffer.
-template <typename Container>
-class container_buffer : public buffer<typename Container::value_type> {
- private:
-  Container& container_;
-
- protected:
-  void grow(std::size_t capacity) FMT_OVERRIDE {
-    container_.resize(capacity);
-    this->set(&container_[0], capacity);
-  }
-
- public:
-  explicit container_buffer(Container& c)
-      : buffer<typename Container::value_type>(c.size()), container_(c) {}
-};
-
-// Extracts a reference to the container from back_insert_iterator.
-template <typename Container>
-inline Container& get_container(std::back_insert_iterator<Container> it) {
-  using bi_iterator = std::back_insert_iterator<Container>;
-  struct accessor : bi_iterator {
-    accessor(bi_iterator iter) : bi_iterator(iter) {}
-    using bi_iterator::container;
-  };
-  return *accessor(it).container;
-}
-
-template <typename T, typename Char = char, typename Enable = void>
-struct fallback_formatter {
-  fallback_formatter() = delete;
-};
-
-// Specifies if T has an enabled fallback_formatter specialization.
-template <typename T, typename Context>
-using has_fallback_formatter =
-    std::is_constructible<fallback_formatter<T, typename Context::char_type>>;
-
-template <typename Char> struct named_arg_base;
-template <typename T, typename Char> struct named_arg;
-
-enum type {
-  none_type,
-  named_arg_type,
-  // Integer types should go first,
-  int_type,
-  uint_type,
-  long_long_type,
-  ulong_long_type,
-  bool_type,
-  char_type,
-  last_integer_type = char_type,
-  // followed by floating-point types.
-  double_type,
-  long_double_type,
-  last_numeric_type = long_double_type,
-  cstring_type,
-  string_type,
-  pointer_type,
-  custom_type
-};
-
-// Maps core type T to the corresponding type enum constant.
-template <typename T, typename Char>
-struct type_constant : std::integral_constant<type, custom_type> {};
-
-#define FMT_TYPE_CONSTANT(Type, constant) \
-  template <typename Char>                \
-  struct type_constant<Type, Char> : std::integral_constant<type, constant> {}
-
-FMT_TYPE_CONSTANT(const named_arg_base<Char>&, named_arg_type);
-FMT_TYPE_CONSTANT(int, int_type);
-FMT_TYPE_CONSTANT(unsigned, uint_type);
-FMT_TYPE_CONSTANT(long long, long_long_type);
-FMT_TYPE_CONSTANT(unsigned long long, ulong_long_type);
-FMT_TYPE_CONSTANT(bool, bool_type);
-FMT_TYPE_CONSTANT(Char, char_type);
-FMT_TYPE_CONSTANT(double, double_type);
-FMT_TYPE_CONSTANT(long double, long_double_type);
-FMT_TYPE_CONSTANT(const Char*, cstring_type);
-FMT_TYPE_CONSTANT(basic_string_view<Char>, string_type);
-FMT_TYPE_CONSTANT(const void*, pointer_type);
-
-FMT_CONSTEXPR bool is_integral(type t) {
-  FMT_ASSERT(t != named_arg_type, "invalid argument type");
-  return t > none_type && t <= last_integer_type;
-}
-
-FMT_CONSTEXPR bool is_arithmetic(type t) {
-  FMT_ASSERT(t != named_arg_type, "invalid argument type");
-  return t > none_type && t <= last_numeric_type;
-}
-
-template <typename Char> struct string_value {
-  const Char* data;
-  std::size_t size;
-};
-
-template <typename Context> struct custom_value {
-  using parse_context = basic_parse_context<typename Context::char_type>;
-  const void* value;
-  void (*format)(const void* arg, parse_context& parse_ctx, Context& ctx);
-};
-
-// A formatting argument value.
-template <typename Context> class value {
- public:
-  using char_type = typename Context::char_type;
-
-  union {
-    int int_value;
-    unsigned uint_value;
-    long long long_long_value;
-    unsigned long long ulong_long_value;
-    bool bool_value;
-    char_type char_value;
-    double double_value;
-    long double long_double_value;
-    const void* pointer;
-    string_value<char_type> string;
-    custom_value<Context> custom;
-    const named_arg_base<char_type>* named_arg;
-  };
-
-  FMT_CONSTEXPR value(int val = 0) : int_value(val) {}
-  FMT_CONSTEXPR value(unsigned val) : uint_value(val) {}
-  value(long long val) : long_long_value(val) {}
-  value(unsigned long long val) : ulong_long_value(val) {}
-  value(double val) : double_value(val) {}
-  value(long double val) : long_double_value(val) {}
-  value(bool val) : bool_value(val) {}
-  value(char_type val) : char_value(val) {}
-  value(const char_type* val) { string.data = val; }
-  value(basic_string_view<char_type> val) {
-    string.data = val.data();
-    string.size = val.size();
-  }
-  value(const void* val) : pointer(val) {}
-
-  template <typename T> value(const T& val) {
-    custom.value = &val;
-    // Get the formatter type through the context to allow different contexts
-    // have different extension points, e.g. `formatter<T>` for `format` and
-    // `printf_formatter<T>` for `printf`.
-    custom.format = format_custom_arg<
-        T, conditional_t<has_formatter<T, Context>::value,
-                         typename Context::template formatter_type<T>,
-                         fallback_formatter<T, char_type>>>;
-  }
-
-  value(const named_arg_base<char_type>& val) { named_arg = &val; }
-
- private:
-  // Formats an argument of a custom type, such as a user-defined class.
-  template <typename T, typename Formatter>
-  static void format_custom_arg(const void* arg,
-                                basic_parse_context<char_type>& parse_ctx,
-                                Context& ctx) {
-    Formatter f;
-    parse_ctx.advance_to(f.parse(parse_ctx));
-    ctx.advance_to(f.format(*static_cast<const T*>(arg), ctx));
-  }
-};
-
-template <typename Context, typename T>
-FMT_CONSTEXPR basic_format_arg<Context> make_arg(const T& value);
-
-// To minimize the number of types we need to deal with, long is translated
-// either to int or to long long depending on its size.
-enum { long_short = sizeof(long) == sizeof(int) };
-using long_type = conditional_t<long_short, int, long long>;
-using ulong_type = conditional_t<long_short, unsigned, unsigned long long>;
-
-// Maps formatting arguments to core types.
-template <typename Context> struct arg_mapper {
-  using char_type = typename Context::char_type;
-
-  FMT_CONSTEXPR int map(signed char val) { return val; }
-  FMT_CONSTEXPR unsigned map(unsigned char val) { return val; }
-  FMT_CONSTEXPR int map(short val) { return val; }
-  FMT_CONSTEXPR unsigned map(unsigned short val) { return val; }
-  FMT_CONSTEXPR int map(int val) { return val; }
-  FMT_CONSTEXPR unsigned map(unsigned val) { return val; }
-  FMT_CONSTEXPR long_type map(long val) { return val; }
-  FMT_CONSTEXPR ulong_type map(unsigned long val) { return val; }
-  FMT_CONSTEXPR long long map(long long val) { return val; }
-  FMT_CONSTEXPR unsigned long long map(unsigned long long val) { return val; }
-  FMT_CONSTEXPR bool map(bool val) { return val; }
-
-  template <typename T, FMT_ENABLE_IF(is_char<T>::value)>
-  FMT_CONSTEXPR char_type map(T val) {
-    static_assert(
-        std::is_same<T, char>::value || std::is_same<T, char_type>::value,
-        "mixing character types is disallowed");
-    return val;
-  }
-
-  FMT_CONSTEXPR double map(float val) { return static_cast<double>(val); }
-  FMT_CONSTEXPR double map(double val) { return val; }
-  FMT_CONSTEXPR long double map(long double val) { return val; }
-
-  FMT_CONSTEXPR const char_type* map(char_type* val) { return val; }
-  FMT_CONSTEXPR const char_type* map(const char_type* val) { return val; }
-  template <typename T, FMT_ENABLE_IF(is_string<T>::value)>
-  FMT_CONSTEXPR basic_string_view<char_type> map(const T& val) {
-    static_assert(std::is_same<char_type, char_t<T>>::value,
-                  "mixing character types is disallowed");
-    return to_string_view(val);
-  }
-  template <typename T,
-            FMT_ENABLE_IF(
-                std::is_constructible<basic_string_view<char_type>, T>::value &&
-                !is_string<T>::value)>
-  FMT_CONSTEXPR basic_string_view<char_type> map(const T& val) {
-    return basic_string_view<char_type>(val);
-  }
-  FMT_CONSTEXPR const char* map(const signed char* val) {
-    static_assert(std::is_same<char_type, char>::value, "invalid string type");
-    return reinterpret_cast<const char*>(val);
-  }
-  FMT_CONSTEXPR const char* map(const unsigned char* val) {
-    static_assert(std::is_same<char_type, char>::value, "invalid string type");
-    return reinterpret_cast<const char*>(val);
-  }
-
-  FMT_CONSTEXPR const void* map(void* val) { return val; }
-  FMT_CONSTEXPR const void* map(const void* val) { return val; }
-  FMT_CONSTEXPR const void* map(std::nullptr_t val) { return val; }
-  template <typename T> FMT_CONSTEXPR int map(const T*) {
-    // Formatting of arbitrary pointers is disallowed. If you want to output
-    // a pointer cast it to "void *" or "const void *". In particular, this
-    // forbids formatting of "[const] volatile char *" which is printed as bool
-    // by iostreams.
-    static_assert(!sizeof(T), "formatting of non-void pointers is disallowed");
-    return 0;
-  }
-
-  template <typename T,
-            FMT_ENABLE_IF(std::is_enum<T>::value &&
-                          !has_formatter<T, Context>::value &&
-                          !has_fallback_formatter<T, Context>::value)>
-  FMT_CONSTEXPR int map(const T& val) {
-    return static_cast<int>(val);
-  }
-  template <typename T,
-            FMT_ENABLE_IF(!is_string<T>::value && !is_char<T>::value &&
-                          (has_formatter<T, Context>::value ||
-                           has_fallback_formatter<T, Context>::value))>
-  FMT_CONSTEXPR const T& map(const T& val) {
-    return val;
-  }
-
-  template <typename T>
-  FMT_CONSTEXPR const named_arg_base<char_type>& map(
-      const named_arg<T, char_type>& val) {
-    auto arg = make_arg<Context>(val.value);
-    std::memcpy(val.data, &arg, sizeof(arg));
-    return val;
-  }
-};
-
-// A type constant after applying arg_mapper<Context>.
-template <typename T, typename Context>
-using mapped_type_constant =
-    type_constant<decltype(arg_mapper<Context>().map(std::declval<T>())),
-                  typename Context::char_type>;
-
-// Maximum number of arguments with packed types.
-enum { max_packed_args = 15 };
-enum : unsigned long long { is_unpacked_bit = 1ull << 63 };
-
-template <typename Context> class arg_map;
-}  // namespace internal
-
-// A formatting argument. It is a trivially copyable/constructible type to
-// allow storage in basic_memory_buffer.
-template <typename Context> class basic_format_arg {
- private:
-  internal::value<Context> value_;
-  internal::type type_;
-
-  template <typename ContextType, typename T>
-  friend FMT_CONSTEXPR basic_format_arg<ContextType> internal::make_arg(
-      const T& value);
-
-  template <typename Visitor, typename Ctx>
-  friend FMT_CONSTEXPR auto visit_format_arg(Visitor&& vis,
-                                             const basic_format_arg<Ctx>& arg)
-      -> decltype(vis(0));
-
-  friend class basic_format_args<Context>;
-  friend class internal::arg_map<Context>;
-
-  using char_type = typename Context::char_type;
-
- public:
-  class handle {
-   public:
-    explicit handle(internal::custom_value<Context> custom) : custom_(custom) {}
-
-    void format(basic_parse_context<char_type>& parse_ctx, Context& ctx) const {
-      custom_.format(custom_.value, parse_ctx, ctx);
-    }
-
-   private:
-    internal::custom_value<Context> custom_;
-  };
-
-  FMT_CONSTEXPR basic_format_arg() : type_(internal::none_type) {}
-
-  FMT_CONSTEXPR explicit operator bool() const FMT_NOEXCEPT {
-    return type_ != internal::none_type;
-  }
-
-  internal::type type() const { return type_; }
-
-  bool is_integral() const { return internal::is_integral(type_); }
-  bool is_arithmetic() const { return internal::is_arithmetic(type_); }
-};
-
-/**
-  \rst
-  Visits an argument dispatching to the appropriate visit method based on
-  the argument type. For example, if the argument type is ``double`` then
-  ``vis(value)`` will be called with the value of type ``double``.
-  \endrst
- */
-template <typename Visitor, typename Context>
-FMT_CONSTEXPR auto visit_format_arg(Visitor&& vis,
-                                    const basic_format_arg<Context>& arg)
-    -> decltype(vis(0)) {
-  using char_type = typename Context::char_type;
-  switch (arg.type_) {
-  case internal::none_type:
-    break;
-  case internal::named_arg_type:
-    FMT_ASSERT(false, "invalid argument type");
-    break;
-  case internal::int_type:
-    return vis(arg.value_.int_value);
-  case internal::uint_type:
-    return vis(arg.value_.uint_value);
-  case internal::long_long_type:
-    return vis(arg.value_.long_long_value);
-  case internal::ulong_long_type:
-    return vis(arg.value_.ulong_long_value);
-  case internal::bool_type:
-    return vis(arg.value_.bool_value);
-  case internal::char_type:
-    return vis(arg.value_.char_value);
-  case internal::double_type:
-    return vis(arg.value_.double_value);
-  case internal::long_double_type:
-    return vis(arg.value_.long_double_value);
-  case internal::cstring_type:
-    return vis(arg.value_.string.data);
-  case internal::string_type:
-    return vis(basic_string_view<char_type>(arg.value_.string.data,
-                                            arg.value_.string.size));
-  case internal::pointer_type:
-    return vis(arg.value_.pointer);
-  case internal::custom_type:
-    return vis(typename basic_format_arg<Context>::handle(arg.value_.custom));
-  }
-  return vis(monostate());
-}
-
-namespace internal {
-// A map from argument names to their values for named arguments.
-template <typename Context> class arg_map {
- private:
-  arg_map(const arg_map&) = delete;
-  void operator=(const arg_map&) = delete;
-
-  using char_type = typename Context::char_type;
-
-  struct entry {
-    basic_string_view<char_type> name;
-    basic_format_arg<Context> arg;
-  };
-
-  entry* map_;
-  unsigned size_;
-
-  void push_back(value<Context> val) {
-    const auto& named = *val.named_arg;
-    map_[size_] = {named.name, named.template deserialize<Context>()};
-    ++size_;
-  }
-
- public:
-  arg_map() : map_(nullptr), size_(0) {}
-  void init(const basic_format_args<Context>& args);
-  ~arg_map() { delete[] map_; }
-
-  basic_format_arg<Context> find(basic_string_view<char_type> name) const {
-    // The list is unsorted, so just return the first matching name.
-    for (entry *it = map_, *end = map_ + size_; it != end; ++it) {
-      if (it->name == name) return it->arg;
-    }
-    return {};
-  }
-};
-
-// A type-erased reference to an std::locale to avoid heavy <locale> include.
-class locale_ref {
- private:
-  const void* locale_;  // A type-erased pointer to std::locale.
-
- public:
-  locale_ref() : locale_(nullptr) {}
-  template <typename Locale> explicit locale_ref(const Locale& loc);
-
-  template <typename Locale> Locale get() const;
-};
-
-template <typename> constexpr unsigned long long encode_types() { return 0; }
-
-template <typename Context, typename Arg, typename... Args>
-constexpr unsigned long long encode_types() {
-  return mapped_type_constant<Arg, Context>::value |
-         (encode_types<Context, Args...>() << 4);
-}
-
-template <typename Context, typename T>
-FMT_CONSTEXPR basic_format_arg<Context> make_arg(const T& value) {
-  basic_format_arg<Context> arg;
-  arg.type_ = mapped_type_constant<T, Context>::value;
-  arg.value_ = arg_mapper<Context>().map(value);
-  return arg;
-}
-
-template <bool IS_PACKED, typename Context, typename T,
-          FMT_ENABLE_IF(IS_PACKED)>
-inline value<Context> make_arg(const T& val) {
-  return arg_mapper<Context>().map(val);
-}
-
-template <bool IS_PACKED, typename Context, typename T,
-          FMT_ENABLE_IF(!IS_PACKED)>
-inline basic_format_arg<Context> make_arg(const T& value) {
-  return make_arg<Context>(value);
-}
-}  // namespace internal
-
-// Formatting context.
-template <typename OutputIt, typename Char> class basic_format_context {
- public:
-  /** The character type for the output. */
-  using char_type = Char;
-
- private:
-  OutputIt out_;
-  basic_format_args<basic_format_context> args_;
-  internal::arg_map<basic_format_context> map_;
-  internal::locale_ref loc_;
-
-  basic_format_context(const basic_format_context&) = delete;
-  void operator=(const basic_format_context&) = delete;
-
- public:
-  using iterator = OutputIt;
-  using format_arg = basic_format_arg<basic_format_context>;
-  template <typename T> using formatter_type = formatter<T, char_type>;
-
-  /**
-   Constructs a ``basic_format_context`` object. References to the arguments are
-   stored in the object so make sure they have appropriate lifetimes.
-   */
-  basic_format_context(OutputIt out,
-                       basic_format_args<basic_format_context> ctx_args,
-                       internal::locale_ref loc = internal::locale_ref())
-      : out_(out), args_(ctx_args), loc_(loc) {}
-
-  format_arg arg(int id) const { return args_.get(id); }
-
-  // Checks if manual indexing is used and returns the argument with the
-  // specified name.
-  format_arg arg(basic_string_view<char_type> name);
-
-  internal::error_handler error_handler() { return {}; }
-  void on_error(const char* message) { error_handler().on_error(message); }
-
-  // Returns an iterator to the beginning of the output range.
-  iterator out() { return out_; }
-
-  // Advances the begin iterator to ``it``.
-  void advance_to(iterator it) { out_ = it; }
-
-  internal::locale_ref locale() { return loc_; }
-};
-
-template <typename Char>
-using buffer_context =
-    basic_format_context<std::back_insert_iterator<internal::buffer<Char>>,
-                         Char>;
-using format_context = buffer_context<char>;
-using wformat_context = buffer_context<wchar_t>;
-
-/**
-  \rst
-  An array of references to arguments. It can be implicitly converted into
-  `~fmt::basic_format_args` for passing into type-erased formatting functions
-  such as `~fmt::vformat`.
-  \endrst
- */
-template <typename Context, typename... Args> class format_arg_store {
- private:
-  static const size_t num_args = sizeof...(Args);
-  static const bool is_packed = num_args < internal::max_packed_args;
-
-  using value_type = conditional_t<is_packed, internal::value<Context>,
-                                   basic_format_arg<Context>>;
-
-  // If the arguments are not packed, add one more element to mark the end.
-  value_type data_[num_args + (num_args == 0 ? 1 : 0)];
-
-  friend class basic_format_args<Context>;
-
- public:
-  static constexpr unsigned long long types =
-      is_packed ? internal::encode_types<Context, Args...>()
-                : internal::is_unpacked_bit | num_args;
-  FMT_DEPRECATED static constexpr unsigned long long TYPES = types;
-
-  format_arg_store(const Args&... args)
-      : data_{internal::make_arg<is_packed, Context>(args)...} {}
-};
-
-/**
-  \rst
-  Constructs an `~fmt::format_arg_store` object that contains references to
-  arguments and can be implicitly converted to `~fmt::format_args`. `Context`
-  can be omitted in which case it defaults to `~fmt::context`.
-  See `~fmt::arg` for lifetime considerations.
-  \endrst
- */
-template <typename Context = format_context, typename... Args>
-inline format_arg_store<Context, Args...> make_format_args(
-    const Args&... args) {
-  return {args...};
-}
-
-/** Formatting arguments. */
-template <typename Context> class basic_format_args {
- public:
-  using size_type = int;
-  using format_arg = basic_format_arg<Context>;
-
- private:
-  // To reduce compiled code size per formatting function call, types of first
-  // max_packed_args arguments are passed in the types_ field.
-  unsigned long long types_;
-  union {
-    // If the number of arguments is less than max_packed_args, the argument
-    // values are stored in values_, otherwise they are stored in args_.
-    // This is done to reduce compiled code size as storing larger objects
-    // may require more code (at least on x86-64) even if the same amount of
-    // data is actually copied to stack. It saves ~10% on the bloat test.
-    const internal::value<Context>* values_;
-    const format_arg* args_;
-  };
-
-  bool is_packed() const { return (types_ & internal::is_unpacked_bit) == 0; }
-
-  internal::type type(int index) const {
-    int shift = index * 4;
-    return static_cast<internal::type>((types_ & (0xfull << shift)) >> shift);
-  }
-
-  friend class internal::arg_map<Context>;
-
-  void set_data(const internal::value<Context>* values) { values_ = values; }
-  void set_data(const format_arg* args) { args_ = args; }
-
-  format_arg do_get(int index) const {
-    format_arg arg;
-    if (!is_packed()) {
-      auto num_args = max_size();
-      if (index < num_args) arg = args_[index];
-      return arg;
-    }
-    if (index > internal::max_packed_args) return arg;
-    arg.type_ = type(index);
-    if (arg.type_ == internal::none_type) return arg;
-    internal::value<Context>& val = arg.value_;
-    val = values_[index];
-    return arg;
-  }
-
- public:
-  basic_format_args() : types_(0) {}
-
-  /**
-   \rst
-   Constructs a `basic_format_args` object from `~fmt::format_arg_store`.
-   \endrst
-   */
-  template <typename... Args>
-  basic_format_args(const format_arg_store<Context, Args...>& store)
-      : types_(static_cast<unsigned long long>(store.types)) {
-    set_data(store.data_);
-  }
-
-  /**
-   \rst
-   Constructs a `basic_format_args` object from a dynamic set of arguments.
-   \endrst
-   */
-  basic_format_args(const format_arg* args, int count)
-      : types_(internal::is_unpacked_bit | internal::to_unsigned(count)) {
-    set_data(args);
-  }
-
-  /** Returns the argument at specified index. */
-  format_arg get(int index) const {
-    format_arg arg = do_get(index);
-    if (arg.type_ == internal::named_arg_type)
-      arg = arg.value_.named_arg->template deserialize<Context>();
-    return arg;
-  }
-
-  int max_size() const {
-    unsigned long long max_packed = internal::max_packed_args;
-    return static_cast<int>(is_packed() ? max_packed
-                                        : types_ & ~internal::is_unpacked_bit);
-  }
-};
-
-/** An alias to ``basic_format_args<context>``. */
-// It is a separate type rather than an alias to make symbols readable.
-struct format_args : basic_format_args<format_context> {
-  template <typename... Args>
-  format_args(Args&&... args)
-      : basic_format_args<format_context>(std::forward<Args>(args)...) {}
-};
-struct wformat_args : basic_format_args<wformat_context> {
-  template <typename... Args>
-  wformat_args(Args&&... args)
-      : basic_format_args<wformat_context>(std::forward<Args>(args)...) {}
-};
-
-template <typename Container> struct is_contiguous : std::false_type {};
-
-template <typename Char>
-struct is_contiguous<std::basic_string<Char>> : std::true_type {};
-
-template <typename Char>
-struct is_contiguous<internal::buffer<Char>> : std::true_type {};
-
-namespace internal {
-
-template <typename OutputIt>
-struct is_contiguous_back_insert_iterator : std::false_type {};
-template <typename Container>
-struct is_contiguous_back_insert_iterator<std::back_insert_iterator<Container>>
-    : is_contiguous<Container> {};
-
-template <typename Char> struct named_arg_base {
-  basic_string_view<Char> name;
-
-  // Serialized value<context>.
-  mutable char data[sizeof(basic_format_arg<buffer_context<Char>>)];
-
-  named_arg_base(basic_string_view<Char> nm) : name(nm) {}
-
-  template <typename Context> basic_format_arg<Context> deserialize() const {
-    basic_format_arg<Context> arg;
-    std::memcpy(&arg, data, sizeof(basic_format_arg<Context>));
-    return arg;
-  }
-};
-
-template <typename T, typename Char> struct named_arg : named_arg_base<Char> {
-  const T& value;
-
-  named_arg(basic_string_view<Char> name, const T& val)
-      : named_arg_base<Char>(name), value(val) {}
-};
-
-template <typename..., typename S, FMT_ENABLE_IF(!is_compile_string<S>::value)>
-inline void check_format_string(const S&) {
-#if defined(FMT_ENFORCE_COMPILE_STRING)
-  static_assert(is_compile_string<S>::value,
-                "FMT_ENFORCE_COMPILE_STRING requires all format strings to "
-                "utilize FMT_STRING() or fmt().");
-#endif
-}
-template <typename..., typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
-void check_format_string(S);
-
-struct view {};
-template <bool...> struct bool_pack;
-template <bool... Args>
-using all_true =
-    std::is_same<bool_pack<Args..., true>, bool_pack<true, Args...>>;
-
-template <typename... Args, typename S, typename Char = char_t<S>>
-inline format_arg_store<buffer_context<Char>, remove_reference_t<Args>...>
-make_args_checked(const S& format_str,
-                  const remove_reference_t<Args>&... args) {
-  static_assert(all_true<(!std::is_base_of<view, remove_reference_t<Args>>() ||
-                          !std::is_reference<Args>())...>::value,
-                "passing views as lvalues is disallowed");
-  check_format_string<remove_const_t<remove_reference_t<Args>>...>(format_str);
-  return {args...};
-}
-
-template <typename Char>
-std::basic_string<Char> vformat(basic_string_view<Char> format_str,
-                                basic_format_args<buffer_context<Char>> args);
-
-template <typename Char>
-typename buffer_context<Char>::iterator vformat_to(
-    buffer<Char>& buf, basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<Char>> args);
-}  // namespace internal
-
-/**
-  \rst
-  Returns a named argument to be used in a formatting function.
-
-  The named argument holds a reference and does not extend the lifetime
-  of its arguments.
-  Consequently, a dangling reference can accidentally be created.
-  The user should take care to only pass this function temporaries when
-  the named argument is itself a temporary, as per the following example.
-
-  **Example**::
-
-    fmt::print("Elapsed time: {s:.2f} seconds", fmt::arg("s", 1.23));
-  \endrst
- */
-template <typename S, typename T, typename Char = char_t<S>>
-inline internal::named_arg<T, Char> arg(const S& name, const T& arg) {
-  static_assert(internal::is_string<S>::value, "");
-  return {name, arg};
-}
-
-// Disable nested named arguments, e.g. ``arg("a", arg("b", 42))``.
-template <typename S, typename T, typename Char>
-void arg(S, internal::named_arg<T, Char>) = delete;
-
-/** Formats a string and writes the output to ``out``. */
-// GCC 8 and earlier cannot handle std::back_insert_iterator<Container> with
-// vformat_to<ArgFormatter>(...) overload, so SFINAE on iterator type instead.
-template <typename OutputIt, typename S, typename Char = char_t<S>,
-          FMT_ENABLE_IF(
-              internal::is_contiguous_back_insert_iterator<OutputIt>::value)>
-OutputIt vformat_to(OutputIt out, const S& format_str,
-                    basic_format_args<buffer_context<Char>> args) {
-  using container = remove_reference_t<decltype(internal::get_container(out))>;
-  internal::container_buffer<container> buf((internal::get_container(out)));
-  internal::vformat_to(buf, to_string_view(format_str), args);
-  return out;
-}
-
-template <typename Container, typename S, typename... Args,
-          FMT_ENABLE_IF(
-              is_contiguous<Container>::value&& internal::is_string<S>::value)>
-inline std::back_insert_iterator<Container> format_to(
-    std::back_insert_iterator<Container> out, const S& format_str,
-    Args&&... args) {
-  return vformat_to(
-      out, to_string_view(format_str),
-      {internal::make_args_checked<Args...>(format_str, args...)});
-}
-
-template <typename S, typename Char = char_t<S>>
-inline std::basic_string<Char> vformat(
-    const S& format_str, basic_format_args<buffer_context<Char>> args) {
-  return internal::vformat(to_string_view(format_str), args);
-}
-
-/**
-  \rst
-  Formats arguments and returns the result as a string.
-
-  **Example**::
-
-    #include <fmt/core.h>
-    std::string message = fmt::format("The answer is {}", 42);
-  \endrst
-*/
-// Pass char_t as a default template parameter instead of using
-// std::basic_string<char_t<S>> to reduce the symbol size.
-template <typename S, typename... Args, typename Char = char_t<S>>
-inline std::basic_string<Char> format(const S& format_str, Args&&... args) {
-  return internal::vformat(
-      to_string_view(format_str),
-      {internal::make_args_checked<Args...>(format_str, args...)});
-}
-
-FMT_API void vprint(std::FILE* f, string_view format_str, format_args args);
-FMT_API void vprint(std::FILE* f, wstring_view format_str, wformat_args args);
-
-/**
-  \rst
-  Prints formatted data to the file *f*. For wide format strings,
-  *f* should be in wide-oriented mode set via ``fwide(f, 1)`` or
-  ``_setmode(_fileno(f), _O_U8TEXT)`` on Windows.
-
-  **Example**::
-
-    fmt::print(stderr, "Don't {}!", "panic");
-  \endrst
- */
-template <typename S, typename... Args,
-          FMT_ENABLE_IF(internal::is_string<S>::value)>
-inline void print(std::FILE* f, const S& format_str, Args&&... args) {
-  vprint(f, to_string_view(format_str),
-         internal::make_args_checked<Args...>(format_str, args...));
-}
-
-FMT_API void vprint(string_view format_str, format_args args);
-FMT_API void vprint(wstring_view format_str, wformat_args args);
-
-/**
-  \rst
-  Prints formatted data to ``stdout``.
-
-  **Example**::
-
-    fmt::print("Elapsed time: {0:.2f} seconds", 1.23);
-  \endrst
- */
-template <typename S, typename... Args,
-          FMT_ENABLE_IF(internal::is_string<S>::value)>
-inline void print(const S& format_str, Args&&... args) {
-  vprint(to_string_view(format_str),
-         internal::make_args_checked<Args...>(format_str, args...));
-}
-FMT_END_NAMESPACE
-
-#endif  // FMT_CORE_H_
diff --git a/include/vtkdiy2/fmt/format-inl.h b/include/vtkdiy2/fmt/format-inl.h
deleted file mode 100644
index 147062fe5d3..00000000000
--- a/include/vtkdiy2/fmt/format-inl.h
+++ /dev/null
@@ -1,1000 +0,0 @@
-// Formatting library for C++
-//
-// Copyright (c) 2012 - 2016, Victor Zverovich
-// All rights reserved.
-//
-// For the license information refer to format.h.
-
-#ifndef FMT_FORMAT_INL_H_
-#define FMT_FORMAT_INL_H_
-
-#include "format.h"
-
-#include <string.h>
-
-#include <cctype>
-#include <cerrno>
-#include <climits>
-#include <cmath>
-#include <cstdarg>
-#include <cstddef>  // for std::ptrdiff_t
-#include <cstring>  // for std::memmove
-#include <cwchar>
-#if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
-#  include <locale>
-#endif
-
-#if FMT_USE_WINDOWS_H
-#  if !defined(FMT_HEADER_ONLY) && !defined(WIN32_LEAN_AND_MEAN)
-#    define WIN32_LEAN_AND_MEAN
-#  endif
-#  if defined(NOMINMAX) || defined(FMT_WIN_MINMAX)
-#    include <windows.h>
-#  else
-#    define NOMINMAX
-#    include <windows.h>
-#    undef NOMINMAX
-#  endif
-#endif
-
-#if FMT_EXCEPTIONS
-#  define FMT_TRY try
-#  define FMT_CATCH(x) catch (x)
-#else
-#  define FMT_TRY if (true)
-#  define FMT_CATCH(x) if (false)
-#endif
-
-#ifdef _MSC_VER
-#  pragma warning(push)
-#  pragma warning(disable : 4127)  // conditional expression is constant
-#  pragma warning(disable : 4702)  // unreachable code
-// Disable deprecation warning for strerror. The latter is not called but
-// MSVC fails to detect it.
-#  pragma warning(disable : 4996)
-#endif
-
-// Dummy implementations of strerror_r and strerror_s called if corresponding
-// system functions are not available.
-inline fmt::internal::null<> strerror_r(int, char*, ...) {
-  return fmt::internal::null<>();
-}
-inline fmt::internal::null<> strerror_s(char*, std::size_t, ...) {
-  return fmt::internal::null<>();
-}
-
-FMT_BEGIN_NAMESPACE
-namespace internal {
-
-#ifndef _MSC_VER
-#  define FMT_SNPRINTF snprintf
-#else  // _MSC_VER
-inline int fmt_snprintf(char* buffer, size_t size, const char* format, ...) {
-  va_list args;
-  va_start(args, format);
-  int result = vsnprintf_s(buffer, size, _TRUNCATE, format, args);
-  va_end(args);
-  return result;
-}
-#  define FMT_SNPRINTF fmt_snprintf
-#endif  // _MSC_VER
-
-using format_func = void (*)(internal::buffer<char>&, int, string_view);
-
-// Portable thread-safe version of strerror.
-// Sets buffer to point to a string describing the error code.
-// This can be either a pointer to a string stored in buffer,
-// or a pointer to some static immutable string.
-// Returns one of the following values:
-//   0      - success
-//   ERANGE - buffer is not large enough to store the error message
-//   other  - failure
-// Buffer should be at least of size 1.
-FMT_FUNC int safe_strerror(int error_code, char*& buffer,
-                           std::size_t buffer_size) FMT_NOEXCEPT {
-  FMT_ASSERT(buffer != nullptr && buffer_size != 0, "invalid buffer");
-
-  class dispatcher {
-   private:
-    int error_code_;
-    char*& buffer_;
-    std::size_t buffer_size_;
-
-    // A noop assignment operator to avoid bogus warnings.
-    void operator=(const dispatcher&) {}
-
-    // Handle the result of XSI-compliant version of strerror_r.
-    int handle(int result) {
-      // glibc versions before 2.13 return result in errno.
-      return result == -1 ? errno : result;
-    }
-
-    // Handle the result of GNU-specific version of strerror_r.
-    int handle(char* message) {
-      // If the buffer is full then the message is probably truncated.
-      if (message == buffer_ && strlen(buffer_) == buffer_size_ - 1)
-        return ERANGE;
-      buffer_ = message;
-      return 0;
-    }
-
-    // Handle the case when strerror_r is not available.
-    int handle(internal::null<>) {
-      return fallback(strerror_s(buffer_, buffer_size_, error_code_));
-    }
-
-    // Fallback to strerror_s when strerror_r is not available.
-    int fallback(int result) {
-      // If the buffer is full then the message is probably truncated.
-      return result == 0 && strlen(buffer_) == buffer_size_ - 1 ? ERANGE
-                                                                : result;
-    }
-
-#if !FMT_MSC_VER
-    // Fallback to strerror if strerror_r and strerror_s are not available.
-    int fallback(internal::null<>) {
-      errno = 0;
-      buffer_ = strerror(error_code_);
-      return errno;
-    }
-#endif
-
-   public:
-    dispatcher(int err_code, char*& buf, std::size_t buf_size)
-        : error_code_(err_code), buffer_(buf), buffer_size_(buf_size) {}
-
-    int run() { return handle(strerror_r(error_code_, buffer_, buffer_size_)); }
-  };
-  return dispatcher(error_code, buffer, buffer_size).run();
-}
-
-FMT_FUNC void format_error_code(internal::buffer<char>& out, int error_code,
-                                string_view message) FMT_NOEXCEPT {
-  // Report error code making sure that the output fits into
-  // inline_buffer_size to avoid dynamic memory allocation and potential
-  // bad_alloc.
-  out.resize(0);
-  static const char SEP[] = ": ";
-  static const char ERROR_STR[] = "error ";
-  // Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
-  std::size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2;
-  auto abs_value = static_cast<uint32_or_64_t<int>>(error_code);
-  if (internal::is_negative(error_code)) {
-    abs_value = 0 - abs_value;
-    ++error_code_size;
-  }
-  error_code_size += internal::to_unsigned(internal::count_digits(abs_value));
-  internal::writer w(out);
-  if (message.size() <= inline_buffer_size - error_code_size) {
-    w.write(message);
-    w.write(SEP);
-  }
-  w.write(ERROR_STR);
-  w.write(error_code);
-  assert(out.size() <= inline_buffer_size);
-}
-
-// A wrapper around fwrite that throws on error.
-FMT_FUNC void fwrite_fully(const void* ptr, size_t size, size_t count,
-                           FILE* stream) {
-  size_t written = std::fwrite(ptr, size, count, stream);
-  if (written < count) {
-    FMT_THROW(system_error(errno, "cannot write to file"));
-  }
-}
-
-FMT_FUNC void report_error(format_func func, int error_code,
-                           string_view message) FMT_NOEXCEPT {
-  memory_buffer full_message;
-  func(full_message, error_code, message);
-  // Don't use fwrite_fully because the latter may throw.
-  (void)std::fwrite(full_message.data(), full_message.size(), 1, stderr);
-  std::fputc('\n', stderr);
-}
-}  // namespace internal
-
-#if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
-namespace internal {
-
-template <typename Locale>
-locale_ref::locale_ref(const Locale& loc) : locale_(&loc) {
-  static_assert(std::is_same<Locale, std::locale>::value, "");
-}
-
-template <typename Locale> Locale locale_ref::get() const {
-  static_assert(std::is_same<Locale, std::locale>::value, "");
-  return locale_ ? *static_cast<const std::locale*>(locale_) : std::locale();
-}
-
-template <typename Char> FMT_FUNC Char thousands_sep_impl(locale_ref loc) {
-  return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
-      .thousands_sep();
-}
-template <typename Char> FMT_FUNC Char decimal_point_impl(locale_ref loc) {
-  return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
-      .decimal_point();
-}
-}  // namespace internal
-#else
-template <typename Char>
-FMT_FUNC Char internal::thousands_sep_impl(locale_ref) {
-  return FMT_STATIC_THOUSANDS_SEPARATOR;
-}
-template <typename Char>
-FMT_FUNC Char internal::decimal_point_impl(locale_ref) {
-  return '.';
-}
-#endif
-
-FMT_API FMT_FUNC format_error::~format_error() FMT_NOEXCEPT {}
-FMT_API FMT_FUNC system_error::~system_error() FMT_NOEXCEPT {}
-
-FMT_FUNC void system_error::init(int err_code, string_view format_str,
-                                 format_args args) {
-  error_code_ = err_code;
-  memory_buffer buffer;
-  format_system_error(buffer, err_code, vformat(format_str, args));
-  std::runtime_error& base = *this;
-  base = std::runtime_error(to_string(buffer));
-}
-
-namespace internal {
-
-template <> FMT_FUNC int count_digits<4>(internal::fallback_uintptr n) {
-  // Assume little endian; pointer formatting is implementation-defined anyway.
-  int i = static_cast<int>(sizeof(void*)) - 1;
-  while (i > 0 && n.value[i] == 0) --i;
-  auto char_digits = std::numeric_limits<unsigned char>::digits / 4;
-  return i >= 0 ? i * char_digits + count_digits<4, unsigned>(n.value[i]) : 1;
-}
-
-template <typename T>
-int format_float(char* buf, std::size_t size, const char* format, int precision,
-                 T value) {
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-  if (precision > 100000)
-    throw std::runtime_error(
-        "fuzz mode - avoid large allocation inside snprintf");
-#endif
-  // Suppress the warning about nonliteral format string.
-  auto snprintf_ptr = FMT_SNPRINTF;
-  return precision < 0 ? snprintf_ptr(buf, size, format, value)
-                       : snprintf_ptr(buf, size, format, precision, value);
-}
-
-template <typename T>
-const char basic_data<T>::digits[] =
-    "0001020304050607080910111213141516171819"
-    "2021222324252627282930313233343536373839"
-    "4041424344454647484950515253545556575859"
-    "6061626364656667686970717273747576777879"
-    "8081828384858687888990919293949596979899";
-
-template <typename T>
-const char basic_data<T>::hex_digits[] = "0123456789abcdef";
-
-#define FMT_POWERS_OF_10(factor)                                             \
-  factor * 10, factor * 100, factor * 1000, factor * 10000, factor * 100000, \
-      factor * 1000000, factor * 10000000, factor * 100000000,               \
-      factor * 1000000000
-
-template <typename T>
-const uint64_t basic_data<T>::powers_of_10_64[] = {
-    1, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ull),
-    10000000000000000000ull};
-
-template <typename T>
-const uint32_t basic_data<T>::zero_or_powers_of_10_32[] = {0,
-                                                           FMT_POWERS_OF_10(1)};
-
-template <typename T>
-const uint64_t basic_data<T>::zero_or_powers_of_10_64[] = {
-    0, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ull),
-    10000000000000000000ull};
-
-// Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340.
-// These are generated by support/compute-powers.py.
-template <typename T>
-const uint64_t basic_data<T>::pow10_significands[] = {
-    0xfa8fd5a0081c0288, 0xbaaee17fa23ebf76, 0x8b16fb203055ac76,
-    0xcf42894a5dce35ea, 0x9a6bb0aa55653b2d, 0xe61acf033d1a45df,
-    0xab70fe17c79ac6ca, 0xff77b1fcbebcdc4f, 0xbe5691ef416bd60c,
-    0x8dd01fad907ffc3c, 0xd3515c2831559a83, 0x9d71ac8fada6c9b5,
-    0xea9c227723ee8bcb, 0xaecc49914078536d, 0x823c12795db6ce57,
-    0xc21094364dfb5637, 0x9096ea6f3848984f, 0xd77485cb25823ac7,
-    0xa086cfcd97bf97f4, 0xef340a98172aace5, 0xb23867fb2a35b28e,
-    0x84c8d4dfd2c63f3b, 0xc5dd44271ad3cdba, 0x936b9fcebb25c996,
-    0xdbac6c247d62a584, 0xa3ab66580d5fdaf6, 0xf3e2f893dec3f126,
-    0xb5b5ada8aaff80b8, 0x87625f056c7c4a8b, 0xc9bcff6034c13053,
-    0x964e858c91ba2655, 0xdff9772470297ebd, 0xa6dfbd9fb8e5b88f,
-    0xf8a95fcf88747d94, 0xb94470938fa89bcf, 0x8a08f0f8bf0f156b,
-    0xcdb02555653131b6, 0x993fe2c6d07b7fac, 0xe45c10c42a2b3b06,
-    0xaa242499697392d3, 0xfd87b5f28300ca0e, 0xbce5086492111aeb,
-    0x8cbccc096f5088cc, 0xd1b71758e219652c, 0x9c40000000000000,
-    0xe8d4a51000000000, 0xad78ebc5ac620000, 0x813f3978f8940984,
-    0xc097ce7bc90715b3, 0x8f7e32ce7bea5c70, 0xd5d238a4abe98068,
-    0x9f4f2726179a2245, 0xed63a231d4c4fb27, 0xb0de65388cc8ada8,
-    0x83c7088e1aab65db, 0xc45d1df942711d9a, 0x924d692ca61be758,
-    0xda01ee641a708dea, 0xa26da3999aef774a, 0xf209787bb47d6b85,
-    0xb454e4a179dd1877, 0x865b86925b9bc5c2, 0xc83553c5c8965d3d,
-    0x952ab45cfa97a0b3, 0xde469fbd99a05fe3, 0xa59bc234db398c25,
-    0xf6c69a72a3989f5c, 0xb7dcbf5354e9bece, 0x88fcf317f22241e2,
-    0xcc20ce9bd35c78a5, 0x98165af37b2153df, 0xe2a0b5dc971f303a,
-    0xa8d9d1535ce3b396, 0xfb9b7cd9a4a7443c, 0xbb764c4ca7a44410,
-    0x8bab8eefb6409c1a, 0xd01fef10a657842c, 0x9b10a4e5e9913129,
-    0xe7109bfba19c0c9d, 0xac2820d9623bf429, 0x80444b5e7aa7cf85,
-    0xbf21e44003acdd2d, 0x8e679c2f5e44ff8f, 0xd433179d9c8cb841,
-    0x9e19db92b4e31ba9, 0xeb96bf6ebadf77d9, 0xaf87023b9bf0ee6b,
-};
-
-// Binary exponents of pow(10, k), for k = -348, -340, ..., 340, corresponding
-// to significands above.
-template <typename T>
-const int16_t basic_data<T>::pow10_exponents[] = {
-    -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980, -954,
-    -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,  -688, -661,
-    -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,  -422,  -396, -369,
-    -343,  -316,  -289,  -263,  -236,  -210,  -183,  -157,  -130,  -103, -77,
-    -50,   -24,   3,     30,    56,    83,    109,   136,   162,   189,  216,
-    242,   269,   295,   322,   348,   375,   402,   428,   455,   481,  508,
-    534,   561,   588,   614,   641,   667,   694,   720,   747,   774,  800,
-    827,   853,   880,   907,   933,   960,   986,   1013,  1039,  1066};
-
-template <typename T>
-const char basic_data<T>::foreground_color[] = "\x1b[38;2;";
-template <typename T>
-const char basic_data<T>::background_color[] = "\x1b[48;2;";
-template <typename T> const char basic_data<T>::reset_color[] = "\x1b[0m";
-template <typename T> const wchar_t basic_data<T>::wreset_color[] = L"\x1b[0m";
-
-template <typename T> struct bits {
-  static FMT_CONSTEXPR_DECL const int value =
-      static_cast<int>(sizeof(T) * std::numeric_limits<unsigned char>::digits);
-};
-
-// A handmade floating-point number f * pow(2, e).
-class fp {
- private:
-  using significand_type = uint64_t;
-
-  // All sizes are in bits.
-  // Subtract 1 to account for an implicit most significant bit in the
-  // normalized form.
-  static FMT_CONSTEXPR_DECL const int double_significand_size =
-      std::numeric_limits<double>::digits - 1;
-  static FMT_CONSTEXPR_DECL const uint64_t implicit_bit =
-      1ull << double_significand_size;
-
- public:
-  significand_type f;
-  int e;
-
-  static FMT_CONSTEXPR_DECL const int significand_size =
-      bits<significand_type>::value;
-
-  fp() : f(0), e(0) {}
-  fp(uint64_t f_val, int e_val) : f(f_val), e(e_val) {}
-
-  // Constructs fp from an IEEE754 double. It is a template to prevent compile
-  // errors on platforms where double is not IEEE754.
-  template <typename Double> explicit fp(Double d) {
-    // Assume double is in the format [sign][exponent][significand].
-    using limits = std::numeric_limits<Double>;
-    const int exponent_size =
-        bits<Double>::value - double_significand_size - 1;  // -1 for sign
-    const uint64_t significand_mask = implicit_bit - 1;
-    const uint64_t exponent_mask = (~0ull >> 1) & ~significand_mask;
-    const int exponent_bias = (1 << exponent_size) - limits::max_exponent - 1;
-    auto u = bit_cast<uint64_t>(d);
-    auto biased_e = (u & exponent_mask) >> double_significand_size;
-    f = u & significand_mask;
-    if (biased_e != 0)
-      f += implicit_bit;
-    else
-      biased_e = 1;  // Subnormals use biased exponent 1 (min exponent).
-    e = static_cast<int>(biased_e - exponent_bias - double_significand_size);
-  }
-
-  // Normalizes the value converted from double and multiplied by (1 << SHIFT).
-  template <int SHIFT = 0> void normalize() {
-    // Handle subnormals.
-    auto shifted_implicit_bit = implicit_bit << SHIFT;
-    while ((f & shifted_implicit_bit) == 0) {
-      f <<= 1;
-      --e;
-    }
-    // Subtract 1 to account for hidden bit.
-    auto offset = significand_size - double_significand_size - SHIFT - 1;
-    f <<= offset;
-    e -= offset;
-  }
-
-  // Compute lower and upper boundaries (m^- and m^+ in the Grisu paper), where
-  // a boundary is a value half way between the number and its predecessor
-  // (lower) or successor (upper). The upper boundary is normalized and lower
-  // has the same exponent but may be not normalized.
-  void compute_boundaries(fp& lower, fp& upper) const {
-    lower =
-        f == implicit_bit ? fp((f << 2) - 1, e - 2) : fp((f << 1) - 1, e - 1);
-    upper = fp((f << 1) + 1, e - 1);
-    upper.normalize<1>();  // 1 is to account for the exponent shift above.
-    lower.f <<= lower.e - upper.e;
-    lower.e = upper.e;
-  }
-};
-
-// Returns an fp number representing x - y. Result may not be normalized.
-inline fp operator-(fp x, fp y) {
-  FMT_ASSERT(x.f >= y.f && x.e == y.e, "invalid operands");
-  return fp(x.f - y.f, x.e);
-}
-
-// Computes an fp number r with r.f = x.f * y.f / pow(2, 64) rounded to nearest
-// with half-up tie breaking, r.e = x.e + y.e + 64. Result may not be
-// normalized.
-FMT_FUNC fp operator*(fp x, fp y) {
-  int exp = x.e + y.e + 64;
-#if FMT_USE_INT128
-  auto product = static_cast<__uint128_t>(x.f) * y.f;
-  auto f = static_cast<uint64_t>(product >> 64);
-  if ((static_cast<uint64_t>(product) & (1ULL << 63)) != 0) ++f;
-  return fp(f, exp);
-#else
-  // Multiply 32-bit parts of significands.
-  uint64_t mask = (1ULL << 32) - 1;
-  uint64_t a = x.f >> 32, b = x.f & mask;
-  uint64_t c = y.f >> 32, d = y.f & mask;
-  uint64_t ac = a * c, bc = b * c, ad = a * d, bd = b * d;
-  // Compute mid 64-bit of result and round.
-  uint64_t mid = (bd >> 32) + (ad & mask) + (bc & mask) + (1U << 31);
-  return fp(ac + (ad >> 32) + (bc >> 32) + (mid >> 32), exp);
-#endif
-}
-
-// Returns cached power (of 10) c_k = c_k.f * pow(2, c_k.e) such that its
-// (binary) exponent satisfies min_exponent <= c_k.e <= min_exponent + 28.
-FMT_FUNC fp get_cached_power(int min_exponent, int& pow10_exponent) {
-  const double one_over_log2_10 = 0.30102999566398114;  // 1 / log2(10)
-  int index = static_cast<int>(
-      std::ceil((min_exponent + fp::significand_size - 1) * one_over_log2_10));
-  // Decimal exponent of the first (smallest) cached power of 10.
-  const int first_dec_exp = -348;
-  // Difference between 2 consecutive decimal exponents in cached powers of 10.
-  const int dec_exp_step = 8;
-  index = (index - first_dec_exp - 1) / dec_exp_step + 1;
-  pow10_exponent = first_dec_exp + index * dec_exp_step;
-  return fp(data::pow10_significands[index], data::pow10_exponents[index]);
-}
-
-enum round_direction { unknown, up, down };
-
-// Given the divisor (normally a power of 10), the remainder = v % divisor for
-// some number v and the error, returns whether v should be rounded up, down, or
-// whether the rounding direction can't be determined due to error.
-// error should be less than divisor / 2.
-inline round_direction get_round_direction(uint64_t divisor, uint64_t remainder,
-                                           uint64_t error) {
-  FMT_ASSERT(remainder < divisor, "");  // divisor - remainder won't overflow.
-  FMT_ASSERT(error < divisor, "");      // divisor - error won't overflow.
-  FMT_ASSERT(error < divisor - error, "");  // error * 2 won't overflow.
-  // Round down if (remainder + error) * 2 <= divisor.
-  if (remainder <= divisor - remainder && error * 2 <= divisor - remainder * 2)
-    return down;
-  // Round up if (remainder - error) * 2 >= divisor.
-  if (remainder >= error &&
-      remainder - error >= divisor - (remainder - error)) {
-    return up;
-  }
-  return unknown;
-}
-
-namespace digits {
-enum result {
-  more,  // Generate more digits.
-  done,  // Done generating digits.
-  error  // Digit generation cancelled due to an error.
-};
-}
-
-// Generates output using the Grisu digit-gen algorithm.
-// error: the size of the region (lower, upper) outside of which numbers
-// definitely do not round to value (Delta in Grisu3).
-template <typename Handler>
-digits::result grisu_gen_digits(fp value, uint64_t error, int& exp,
-                                Handler& handler) {
-  fp one(1ull << -value.e, value.e);
-  // The integral part of scaled value (p1 in Grisu) = value / one. It cannot be
-  // zero because it contains a product of two 64-bit numbers with MSB set (due
-  // to normalization) - 1, shifted right by at most 60 bits.
-  uint32_t integral = static_cast<uint32_t>(value.f >> -one.e);
-  FMT_ASSERT(integral != 0, "");
-  FMT_ASSERT(integral == value.f >> -one.e, "");
-  // The fractional part of scaled value (p2 in Grisu) c = value % one.
-  uint64_t fractional = value.f & (one.f - 1);
-  exp = count_digits(integral);  // kappa in Grisu.
-  // Divide by 10 to prevent overflow.
-  auto result = handler.on_start(data::powers_of_10_64[exp - 1] << -one.e,
-                                 value.f / 10, error * 10, exp);
-  if (result != digits::more) return result;
-  // Generate digits for the integral part. This can produce up to 10 digits.
-  do {
-    uint32_t digit = 0;
-    // This optimization by miloyip reduces the number of integer divisions by
-    // one per iteration.
-    switch (exp) {
-    case 10:
-      digit = integral / 1000000000;
-      integral %= 1000000000;
-      break;
-    case 9:
-      digit = integral / 100000000;
-      integral %= 100000000;
-      break;
-    case 8:
-      digit = integral / 10000000;
-      integral %= 10000000;
-      break;
-    case 7:
-      digit = integral / 1000000;
-      integral %= 1000000;
-      break;
-    case 6:
-      digit = integral / 100000;
-      integral %= 100000;
-      break;
-    case 5:
-      digit = integral / 10000;
-      integral %= 10000;
-      break;
-    case 4:
-      digit = integral / 1000;
-      integral %= 1000;
-      break;
-    case 3:
-      digit = integral / 100;
-      integral %= 100;
-      break;
-    case 2:
-      digit = integral / 10;
-      integral %= 10;
-      break;
-    case 1:
-      digit = integral;
-      integral = 0;
-      break;
-    default:
-      FMT_ASSERT(false, "invalid number of digits");
-    }
-    --exp;
-    uint64_t remainder =
-        (static_cast<uint64_t>(integral) << -one.e) + fractional;
-    result = handler.on_digit(static_cast<char>('0' + digit),
-                              data::powers_of_10_64[exp] << -one.e, remainder,
-                              error, exp, true);
-    if (result != digits::more) return result;
-  } while (exp > 0);
-  // Generate digits for the fractional part.
-  for (;;) {
-    fractional *= 10;
-    error *= 10;
-    char digit =
-        static_cast<char>('0' + static_cast<char>(fractional >> -one.e));
-    fractional &= one.f - 1;
-    --exp;
-    result = handler.on_digit(digit, one.f, fractional, error, exp, false);
-    if (result != digits::more) return result;
-  }
-}
-
-// The fixed precision digit handler.
-struct fixed_handler {
-  char* buf;
-  int size;
-  int precision;
-  int exp10;
-  bool fixed;
-
-  digits::result on_start(uint64_t divisor, uint64_t remainder, uint64_t error,
-                          int& exp) {
-    // Non-fixed formats require at least one digit and no precision adjustment.
-    if (!fixed) return digits::more;
-    // Adjust fixed precision by exponent because it is relative to decimal
-    // point.
-    precision += exp + exp10;
-    // Check if precision is satisfied just by leading zeros, e.g.
-    // format("{:.2f}", 0.001) gives "0.00" without generating any digits.
-    if (precision > 0) return digits::more;
-    if (precision < 0) return digits::done;
-    auto dir = get_round_direction(divisor, remainder, error);
-    if (dir == unknown) return digits::error;
-    buf[size++] = dir == up ? '1' : '0';
-    return digits::done;
-  }
-
-  digits::result on_digit(char digit, uint64_t divisor, uint64_t remainder,
-                          uint64_t error, int, bool integral) {
-    FMT_ASSERT(remainder < divisor, "");
-    buf[size++] = digit;
-    if (size < precision) return digits::more;
-    if (!integral) {
-      // Check if error * 2 < divisor with overflow prevention.
-      // The check is not needed for the integral part because error = 1
-      // and divisor > (1 << 32) there.
-      if (error >= divisor || error >= divisor - error) return digits::error;
-    } else {
-      FMT_ASSERT(error == 1 && divisor > 2, "");
-    }
-    auto dir = get_round_direction(divisor, remainder, error);
-    if (dir != up) return dir == down ? digits::done : digits::error;
-    ++buf[size - 1];
-    for (int i = size - 1; i > 0 && buf[i] > '9'; --i) {
-      buf[i] = '0';
-      ++buf[i - 1];
-    }
-    if (buf[0] > '9') {
-      buf[0] = '1';
-      buf[size++] = '0';
-    }
-    return digits::done;
-  }
-};
-
-// The shortest representation digit handler.
-template <int GRISU_VERSION> struct grisu_shortest_handler {
-  char* buf;
-  int size;
-  // Distance between scaled value and upper bound (wp_W in Grisu3).
-  uint64_t diff;
-
-  digits::result on_start(uint64_t, uint64_t, uint64_t, int&) {
-    return digits::more;
-  }
-
-  // Decrement the generated number approaching value from above.
-  void round(uint64_t d, uint64_t divisor, uint64_t& remainder,
-             uint64_t error) {
-    while (
-        remainder < d && error - remainder >= divisor &&
-        (remainder + divisor < d || d - remainder >= remainder + divisor - d)) {
-      --buf[size - 1];
-      remainder += divisor;
-    }
-  }
-
-  // Implements Grisu's round_weed.
-  digits::result on_digit(char digit, uint64_t divisor, uint64_t remainder,
-                          uint64_t error, int exp, bool integral) {
-    buf[size++] = digit;
-    if (remainder >= error) return digits::more;
-    if (GRISU_VERSION != 3) {
-      uint64_t d = integral ? diff : diff * data::powers_of_10_64[-exp];
-      round(d, divisor, remainder, error);
-      return digits::done;
-    }
-    uint64_t unit = integral ? 1 : data::powers_of_10_64[-exp];
-    uint64_t up = (diff - 1) * unit;  // wp_Wup
-    round(up, divisor, remainder, error);
-    uint64_t down = (diff + 1) * unit;  // wp_Wdown
-    if (remainder < down && error - remainder >= divisor &&
-        (remainder + divisor < down ||
-         down - remainder > remainder + divisor - down)) {
-      return digits::error;
-    }
-    return 2 * unit <= remainder && remainder <= error - 4 * unit
-               ? digits::done
-               : digits::error;
-  }
-};
-
-template <typename Double,
-          enable_if_t<(sizeof(Double) == sizeof(uint64_t)), int>>
-FMT_API bool grisu_format(Double value, buffer<char>& buf, int precision,
-                          unsigned options, int& exp) {
-  FMT_ASSERT(value >= 0, "value is negative");
-  bool fixed = (options & grisu_options::fixed) != 0;
-  if (value <= 0) {  // <= instead of == to silence a warning.
-    if (precision <= 0 || !fixed) {
-      exp = 0;
-      buf.push_back('0');
-    } else {
-      exp = -precision;
-      buf.resize(precision);
-      std::uninitialized_fill_n(buf.data(), precision, '0');
-    }
-    return true;
-  }
-
-  fp fp_value(value);
-  const int min_exp = -60;  // alpha in Grisu.
-  int cached_exp10 = 0;     // K in Grisu.
-  if (precision != -1) {
-    if (precision > 17) return false;
-    fp_value.normalize();
-    auto cached_pow = get_cached_power(
-        min_exp - (fp_value.e + fp::significand_size), cached_exp10);
-    fp_value = fp_value * cached_pow;
-    fixed_handler handler{buf.data(), 0, precision, -cached_exp10, fixed};
-    if (grisu_gen_digits(fp_value, 1, exp, handler) == digits::error)
-      return false;
-    buf.resize(to_unsigned(handler.size));
-  } else {
-    fp lower, upper;  // w^- and w^+ in the Grisu paper.
-    fp_value.compute_boundaries(lower, upper);
-    // Find a cached power of 10 such that multiplying upper by it will bring
-    // the exponent in the range [min_exp, -32].
-    auto cached_pow = get_cached_power(  // \tilde{c}_{-k} in Grisu.
-        min_exp - (upper.e + fp::significand_size), cached_exp10);
-    fp_value.normalize();
-    fp_value = fp_value * cached_pow;
-    lower = lower * cached_pow;  // \tilde{M}^- in Grisu.
-    upper = upper * cached_pow;  // \tilde{M}^+ in Grisu.
-    assert(min_exp <= upper.e && upper.e <= -32);
-    auto result = digits::result();
-    int size = 0;
-    if ((options & grisu_options::grisu3) != 0) {
-      --lower.f;  // \tilde{M}^- - 1 ulp -> M^-_{\downarrow}.
-      ++upper.f;  // \tilde{M}^+ + 1 ulp -> M^+_{\uparrow}.
-      // Numbers outside of (lower, upper) definitely do not round to value.
-      grisu_shortest_handler<3> handler{buf.data(), 0, (upper - fp_value).f};
-      result = grisu_gen_digits(upper, upper.f - lower.f, exp, handler);
-      size = handler.size;
-    } else {
-      ++lower.f;  // \tilde{M}^- + 1 ulp -> M^-_{\uparrow}.
-      --upper.f;  // \tilde{M}^+ - 1 ulp -> M^+_{\downarrow}.
-      grisu_shortest_handler<2> handler{buf.data(), 0, (upper - fp_value).f};
-      result = grisu_gen_digits(upper, upper.f - lower.f, exp, handler);
-      size = handler.size;
-    }
-    if (result == digits::error) return false;
-    buf.resize(to_unsigned(size));
-  }
-  exp -= cached_exp10;
-  return true;
-}
-
-template <typename Double>
-char* sprintf_format(Double value, internal::buffer<char>& buf,
-                     sprintf_specs specs) {
-  // Buffer capacity must be non-zero, otherwise MSVC's vsnprintf_s will fail.
-  FMT_ASSERT(buf.capacity() != 0, "empty buffer");
-
-  // Build format string.
-  enum { max_format_size = 10 };  // longest format: %#-*.*Lg
-  char format[max_format_size];
-  char* format_ptr = format;
-  *format_ptr++ = '%';
-  if (specs.alt || !specs.type) *format_ptr++ = '#';
-  if (specs.precision >= 0) {
-    *format_ptr++ = '.';
-    *format_ptr++ = '*';
-  }
-  if (std::is_same<Double, long double>::value) *format_ptr++ = 'L';
-
-  char type = specs.type;
-
-  if (type == '%')
-    type = 'f';
-  else if (type == 0 || type == 'n')
-    type = 'g';
-#if FMT_MSC_VER
-  if (type == 'F') {
-    // MSVC's printf doesn't support 'F'.
-    type = 'f';
-  }
-#endif
-  *format_ptr++ = type;
-  *format_ptr = '\0';
-
-  // Format using snprintf.
-  char* start = nullptr;
-  char* decimal_point_pos = nullptr;
-  for (;;) {
-    std::size_t buffer_size = buf.capacity();
-    start = &buf[0];
-    int result =
-        format_float(start, buffer_size, format, specs.precision, value);
-    if (result >= 0) {
-      unsigned n = internal::to_unsigned(result);
-      if (n < buf.capacity()) {
-        // Find the decimal point.
-        auto p = buf.data(), end = p + n;
-        if (*p == '+' || *p == '-') ++p;
-        if (specs.type != 'a' && specs.type != 'A') {
-          while (p < end && *p >= '0' && *p <= '9') ++p;
-          if (p < end && *p != 'e' && *p != 'E') {
-            decimal_point_pos = p;
-            if (!specs.type) {
-              // Keep only one trailing zero after the decimal point.
-              ++p;
-              if (*p == '0') ++p;
-              while (p != end && *p >= '1' && *p <= '9') ++p;
-              char* where = p;
-              while (p != end && *p == '0') ++p;
-              if (p == end || *p < '0' || *p > '9') {
-                if (p != end) std::memmove(where, p, to_unsigned(end - p));
-                n -= static_cast<unsigned>(p - where);
-              }
-            }
-          }
-        }
-        buf.resize(n);
-        break;  // The buffer is large enough - continue with formatting.
-      }
-      buf.reserve(n + 1);
-    } else {
-      // If result is negative we ask to increase the capacity by at least 1,
-      // but as std::vector, the buffer grows exponentially.
-      buf.reserve(buf.capacity() + 1);
-    }
-  }
-  return decimal_point_pos;
-}
-}  // namespace internal
-
-#if FMT_USE_WINDOWS_H
-
-FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
-  static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
-  if (s.size() > INT_MAX)
-    FMT_THROW(windows_error(ERROR_INVALID_PARAMETER, ERROR_MSG));
-  int s_size = static_cast<int>(s.size());
-  if (s_size == 0) {
-    // MultiByteToWideChar does not support zero length, handle separately.
-    buffer_.resize(1);
-    buffer_[0] = 0;
-    return;
-  }
-
-  int length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(),
-                                   s_size, nullptr, 0);
-  if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
-  buffer_.resize(length + 1);
-  length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size,
-                               &buffer_[0], length);
-  if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
-  buffer_[length] = 0;
-}
-
-FMT_FUNC internal::utf16_to_utf8::utf16_to_utf8(wstring_view s) {
-  if (int error_code = convert(s)) {
-    FMT_THROW(windows_error(error_code,
-                            "cannot convert string from UTF-16 to UTF-8"));
-  }
-}
-
-FMT_FUNC int internal::utf16_to_utf8::convert(wstring_view s) {
-  if (s.size() > INT_MAX) return ERROR_INVALID_PARAMETER;
-  int s_size = static_cast<int>(s.size());
-  if (s_size == 0) {
-    // WideCharToMultiByte does not support zero length, handle separately.
-    buffer_.resize(1);
-    buffer_[0] = 0;
-    return 0;
-  }
-
-  int length = WideCharToMultiByte(CP_UTF8, 0, s.data(), s_size, nullptr, 0,
-                                   nullptr, nullptr);
-  if (length == 0) return GetLastError();
-  buffer_.resize(length + 1);
-  length = WideCharToMultiByte(CP_UTF8, 0, s.data(), s_size, &buffer_[0],
-                               length, nullptr, nullptr);
-  if (length == 0) return GetLastError();
-  buffer_[length] = 0;
-  return 0;
-}
-
-FMT_FUNC void windows_error::init(int err_code, string_view format_str,
-                                  format_args args) {
-  error_code_ = err_code;
-  memory_buffer buffer;
-  internal::format_windows_error(buffer, err_code, vformat(format_str, args));
-  std::runtime_error& base = *this;
-  base = std::runtime_error(to_string(buffer));
-}
-
-FMT_FUNC void internal::format_windows_error(internal::buffer<char>& out,
-                                             int error_code,
-                                             string_view message) FMT_NOEXCEPT {
-  FMT_TRY {
-    wmemory_buffer buf;
-    buf.resize(inline_buffer_size);
-    for (;;) {
-      wchar_t* system_message = &buf[0];
-      int result = FormatMessageW(
-          FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr,
-          error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), system_message,
-          static_cast<uint32_t>(buf.size()), nullptr);
-      if (result != 0) {
-        utf16_to_utf8 utf8_message;
-        if (utf8_message.convert(system_message) == ERROR_SUCCESS) {
-          internal::writer w(out);
-          w.write(message);
-          w.write(": ");
-          w.write(utf8_message);
-          return;
-        }
-        break;
-      }
-      if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
-        break;  // Can't get error message, report error code instead.
-      buf.resize(buf.size() * 2);
-    }
-  }
-  FMT_CATCH(...) {}
-  format_error_code(out, error_code, message);
-}
-
-#endif  // FMT_USE_WINDOWS_H
-
-FMT_FUNC void format_system_error(internal::buffer<char>& out, int error_code,
-                                  string_view message) FMT_NOEXCEPT {
-  FMT_TRY {
-    memory_buffer buf;
-    buf.resize(inline_buffer_size);
-    for (;;) {
-      char* system_message = &buf[0];
-      int result =
-          internal::safe_strerror(error_code, system_message, buf.size());
-      if (result == 0) {
-        internal::writer w(out);
-        w.write(message);
-        w.write(": ");
-        w.write(system_message);
-        return;
-      }
-      if (result != ERANGE)
-        break;  // Can't get error message, report error code instead.
-      buf.resize(buf.size() * 2);
-    }
-  }
-  FMT_CATCH(...) {}
-  format_error_code(out, error_code, message);
-}
-
-FMT_FUNC void internal::error_handler::on_error(const char* message) {
-  FMT_THROW(format_error(message));
-}
-
-FMT_FUNC void report_system_error(int error_code,
-                                  fmt::string_view message) FMT_NOEXCEPT {
-  report_error(format_system_error, error_code, message);
-}
-
-#if FMT_USE_WINDOWS_H
-FMT_FUNC void report_windows_error(int error_code,
-                                   fmt::string_view message) FMT_NOEXCEPT {
-  report_error(internal::format_windows_error, error_code, message);
-}
-#endif
-
-FMT_FUNC void vprint(std::FILE* f, string_view format_str, format_args args) {
-  memory_buffer buffer;
-  internal::vformat_to(buffer, format_str,
-                       basic_format_args<buffer_context<char>>(args));
-  internal::fwrite_fully(buffer.data(), 1, buffer.size(), f);
-}
-
-FMT_FUNC void vprint(std::FILE* f, wstring_view format_str, wformat_args args) {
-  wmemory_buffer buffer;
-  internal::vformat_to(buffer, format_str, args);
-  buffer.push_back(L'\0');
-  if (std::fputws(buffer.data(), f) == -1) {
-    FMT_THROW(system_error(errno, "cannot write to file"));
-  }
-}
-
-FMT_FUNC void vprint(string_view format_str, format_args args) {
-  vprint(stdout, format_str, args);
-}
-
-FMT_FUNC void vprint(wstring_view format_str, wformat_args args) {
-  vprint(stdout, format_str, args);
-}
-
-FMT_END_NAMESPACE
-
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-
-#endif  // FMT_FORMAT_INL_H_
diff --git a/include/vtkdiy2/fmt/format.h b/include/vtkdiy2/fmt/format.h
deleted file mode 100644
index dcf4a399814..00000000000
--- a/include/vtkdiy2/fmt/format.h
+++ /dev/null
@@ -1,3602 +0,0 @@
-/*
- Formatting library for C++
-
- Copyright (c) 2012 - present, Victor Zverovich
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- --- Optional exception to the license ---
-
- As an exception, if, as a result of your compiling your source code, portions
- of this Software are embedded into a machine-executable object form of such
- source code, you may redistribute such embedded portions in such object form
- without including the above copyright and permission notices.
- */
-
-#ifndef FMT_FORMAT_H_
-#define FMT_FORMAT_H_
-
-#define FMT_HEADER_ONLY     // Added by diy for header-only usage
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <cstdint>
-#include <cstring>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <stdexcept>
-
-#include "core.h"
-
-#ifdef __clang__
-#  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
-#else
-#  define FMT_CLANG_VERSION 0
-#endif
-
-#ifdef __INTEL_COMPILER
-#  define FMT_ICC_VERSION __INTEL_COMPILER
-#elif defined(__ICL)
-#  define FMT_ICC_VERSION __ICL
-#else
-#  define FMT_ICC_VERSION 0
-#endif
-
-#ifdef __NVCC__
-#  define FMT_CUDA_VERSION (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__)
-#else
-#  define FMT_CUDA_VERSION 0
-#endif
-
-#ifdef __has_builtin
-#  define FMT_HAS_BUILTIN(x) __has_builtin(x)
-#else
-#  define FMT_HAS_BUILTIN(x) 0
-#endif
-
-#ifndef FMT_THROW
-#  if FMT_EXCEPTIONS
-#    if FMT_MSC_VER
-FMT_BEGIN_NAMESPACE
-namespace internal {
-template <typename Exception> inline void do_throw(const Exception& x) {
-  // Silence unreachable code warnings in MSVC because these are nearly
-  // impossible to fix in a generic code.
-  volatile bool b = true;
-  if (b) throw x;
-}
-}  // namespace internal
-FMT_END_NAMESPACE
-#      define FMT_THROW(x) fmt::internal::do_throw(x)
-#    else
-#      define FMT_THROW(x) throw x
-#    endif
-#  else
-#    define FMT_THROW(x)              \
-      do {                            \
-        static_cast<void>(sizeof(x)); \
-        assert(false);                \
-      } while (false)
-#  endif
-#endif
-
-#ifndef FMT_USE_USER_DEFINED_LITERALS
-// For Intel and NVIDIA compilers both they and the system gcc/msc support UDLs.
-#  if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 407 ||      \
-       FMT_MSC_VER >= 1900) &&                                              \
-      (!(FMT_ICC_VERSION || FMT_CUDA_VERSION) || FMT_ICC_VERSION >= 1500 || \
-       FMT_CUDA_VERSION >= 700)
-#    define FMT_USE_USER_DEFINED_LITERALS 1
-#  else
-#    define FMT_USE_USER_DEFINED_LITERALS 0
-#  endif
-#endif
-
-#ifndef FMT_USE_UDL_TEMPLATE
-// EDG front end based compilers (icc, nvcc) do not support UDL templates yet
-// and GCC 9 warns about them.
-#  if FMT_USE_USER_DEFINED_LITERALS && FMT_ICC_VERSION == 0 && \
-      FMT_CUDA_VERSION == 0 &&                                 \
-      ((FMT_GCC_VERSION >= 600 && FMT_GCC_VERSION <= 900 &&    \
-        __cplusplus >= 201402L) ||                             \
-       FMT_CLANG_VERSION >= 304)
-#    define FMT_USE_UDL_TEMPLATE 1
-#  else
-#    define FMT_USE_UDL_TEMPLATE 0
-#  endif
-#endif
-
-#ifdef FMT_USE_INT128
-// Do nothing.
-#elif defined(__SIZEOF_INT128__)
-#  define FMT_USE_INT128 1
-#else
-#  define FMT_USE_INT128 0
-#endif
-
-// __builtin_clz is broken in clang with Microsoft CodeGen:
-// https://github.com/fmtlib/fmt/issues/519
-#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_clz)) && !FMT_MSC_VER
-#  define FMT_BUILTIN_CLZ(n) __builtin_clz(n)
-#endif
-#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_clzll)) && !FMT_MSC_VER
-#  define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
-#endif
-
-// Some compilers masquerade as both MSVC and GCC-likes or otherwise support
-// __builtin_clz and __builtin_clzll, so only define FMT_BUILTIN_CLZ using the
-// MSVC intrinsics if the clz and clzll builtins are not available.
-#if FMT_MSC_VER && !defined(FMT_BUILTIN_CLZLL) && !defined(_MANAGED)
-#  include <intrin.h>  // _BitScanReverse, _BitScanReverse64
-
-FMT_BEGIN_NAMESPACE
-namespace internal {
-// Avoid Clang with Microsoft CodeGen's -Wunknown-pragmas warning.
-#  ifndef __clang__
-#    pragma intrinsic(_BitScanReverse)
-#  endif
-inline uint32_t clz(uint32_t x) {
-  unsigned long r = 0;
-  _BitScanReverse(&r, x);
-
-  assert(x != 0);
-  // Static analysis complains about using uninitialized data
-  // "r", but the only way that can happen is if "x" is 0,
-  // which the callers guarantee to not happen.
-#  pragma warning(suppress : 6102)
-  return 31 - r;
-}
-#  define FMT_BUILTIN_CLZ(n) fmt::internal::clz(n)
-
-#  if defined(_WIN64) && !defined(__clang__)
-#    pragma intrinsic(_BitScanReverse64)
-#  endif
-
-inline uint32_t clzll(uint64_t x) {
-  unsigned long r = 0;
-#  ifdef _WIN64
-  _BitScanReverse64(&r, x);
-#  else
-  // Scan the high 32 bits.
-  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32))) return 63 - (r + 32);
-
-  // Scan the low 32 bits.
-  _BitScanReverse(&r, static_cast<uint32_t>(x));
-#  endif
-
-  assert(x != 0);
-  // Static analysis complains about using uninitialized data
-  // "r", but the only way that can happen is if "x" is 0,
-  // which the callers guarantee to not happen.
-#  pragma warning(suppress : 6102)
-  return 63 - r;
-}
-#  define FMT_BUILTIN_CLZLL(n) fmt::internal::clzll(n)
-}  // namespace internal
-FMT_END_NAMESPACE
-#endif
-
-FMT_BEGIN_NAMESPACE
-namespace internal {
-
-// A fallback implementation of uintptr_t for systems that lack it.
-struct fallback_uintptr {
-  unsigned char value[sizeof(void*)];
-};
-#ifdef UINTPTR_MAX
-using uintptr_t = ::uintptr_t;
-#else
-using uintptr_t = fallback_uintptr;
-#endif
-
-// An equivalent of `*reinterpret_cast<Dest*>(&source)` that doesn't produce
-// undefined behavior (e.g. due to type aliasing).
-// Example: uint64_t d = bit_cast<uint64_t>(2.718);
-template <typename Dest, typename Source>
-inline Dest bit_cast(const Source& source) {
-  static_assert(sizeof(Dest) == sizeof(Source), "size mismatch");
-  Dest dest;
-  std::memcpy(&dest, &source, sizeof(dest));
-  return dest;
-}
-
-// An approximation of iterator_t for pre-C++20 systems.
-template <typename T>
-using iterator_t = decltype(std::begin(std::declval<T&>()));
-
-// Detect the iterator category of *any* given type in a SFINAE-friendly way.
-// Unfortunately, older implementations of std::iterator_traits are not safe
-// for use in a SFINAE-context.
-template <typename It, typename Enable = void>
-struct iterator_category : std::false_type {};
-
-template <typename T> struct iterator_category<T*> {
-  using type = std::random_access_iterator_tag;
-};
-
-template <typename It>
-struct iterator_category<It, void_t<typename It::iterator_category>> {
-  using type = typename It::iterator_category;
-};
-
-// Detect if *any* given type models the OutputIterator concept.
-template <typename It> class is_output_iterator {
-  // Check for mutability because all iterator categories derived from
-  // std::input_iterator_tag *may* also meet the requirements of an
-  // OutputIterator, thereby falling into the category of 'mutable iterators'
-  // [iterator.requirements.general] clause 4. The compiler reveals this
-  // property only at the point of *actually dereferencing* the iterator!
-  template <typename U>
-  static decltype(*(std::declval<U>())) test(std::input_iterator_tag);
-  template <typename U> static char& test(std::output_iterator_tag);
-  template <typename U> static const char& test(...);
-
-  using type = decltype(test<It>(typename iterator_category<It>::type{}));
-
- public:
-  static const bool value = !std::is_const<remove_reference_t<type>>::value;
-};
-
-// A workaround for std::string not having mutable data() until C++17.
-template <typename Char> inline Char* get_data(std::basic_string<Char>& s) {
-  return &s[0];
-}
-template <typename Container>
-inline typename Container::value_type* get_data(Container& c) {
-  return c.data();
-}
-
-#ifdef _SECURE_SCL
-// Make a checked iterator to avoid MSVC warnings.
-template <typename T> using checked_ptr = stdext::checked_array_iterator<T*>;
-template <typename T> checked_ptr<T> make_checked(T* p, std::size_t size) {
-  return {p, size};
-}
-#else
-template <typename T> using checked_ptr = T*;
-template <typename T> inline T* make_checked(T* p, std::size_t) { return p; }
-#endif
-
-template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
-inline checked_ptr<typename Container::value_type> reserve(
-    std::back_insert_iterator<Container>& it, std::size_t n) {
-  Container& c = get_container(it);
-  std::size_t size = c.size();
-  c.resize(size + n);
-  return make_checked(get_data(c) + size, n);
-}
-
-template <typename Iterator>
-inline Iterator& reserve(Iterator& it, std::size_t) {
-  return it;
-}
-
-// An output iterator that counts the number of objects written to it and
-// discards them.
-template <typename T> class counting_iterator {
- private:
-  std::size_t count_;
-  mutable T blackhole_;
-
- public:
-  using iterator_category = std::output_iterator_tag;
-  using value_type = T;
-  using difference_type = std::ptrdiff_t;
-  using pointer = T*;
-  using reference = T&;
-  using _Unchecked_type = counting_iterator;  // Mark iterator as checked.
-
-  counting_iterator() : count_(0) {}
-
-  std::size_t count() const { return count_; }
-
-  counting_iterator& operator++() {
-    ++count_;
-    return *this;
-  }
-
-  counting_iterator operator++(int) {
-    auto it = *this;
-    ++*this;
-    return it;
-  }
-
-  T& operator*() const { return blackhole_; }
-};
-
-template <typename OutputIt> class truncating_iterator_base {
- protected:
-  OutputIt out_;
-  std::size_t limit_;
-  std::size_t count_;
-
-  truncating_iterator_base(OutputIt out, std::size_t limit)
-      : out_(out), limit_(limit), count_(0) {}
-
- public:
-  using iterator_category = std::output_iterator_tag;
-  using difference_type = void;
-  using pointer = void;
-  using reference = void;
-  using _Unchecked_type =
-      truncating_iterator_base;  // Mark iterator as checked.
-
-  OutputIt base() const { return out_; }
-  std::size_t count() const { return count_; }
-};
-
-// An output iterator that truncates the output and counts the number of objects
-// written to it.
-template <typename OutputIt,
-          typename Enable = typename std::is_void<
-              typename std::iterator_traits<OutputIt>::value_type>::type>
-class truncating_iterator;
-
-template <typename OutputIt>
-class truncating_iterator<OutputIt, std::false_type>
-    : public truncating_iterator_base<OutputIt> {
-  using traits = std::iterator_traits<OutputIt>;
-
-  mutable typename traits::value_type blackhole_;
-
- public:
-  using value_type = typename traits::value_type;
-
-  truncating_iterator(OutputIt out, std::size_t limit)
-      : truncating_iterator_base<OutputIt>(out, limit) {}
-
-  truncating_iterator& operator++() {
-    if (this->count_++ < this->limit_) ++this->out_;
-    return *this;
-  }
-
-  truncating_iterator operator++(int) {
-    auto it = *this;
-    ++*this;
-    return it;
-  }
-
-  value_type& operator*() const {
-    return this->count_ < this->limit_ ? *this->out_ : blackhole_;
-  }
-};
-
-template <typename OutputIt>
-class truncating_iterator<OutputIt, std::true_type>
-    : public truncating_iterator_base<OutputIt> {
- public:
-  using value_type = typename OutputIt::container_type::value_type;
-
-  truncating_iterator(OutputIt out, std::size_t limit)
-      : truncating_iterator_base<OutputIt>(out, limit) {}
-
-  truncating_iterator& operator=(value_type val) {
-    if (this->count_++ < this->limit_) this->out_ = val;
-    return *this;
-  }
-
-  truncating_iterator& operator++() { return *this; }
-  truncating_iterator& operator++(int) { return *this; }
-  truncating_iterator& operator*() { return *this; }
-};
-
-// A range with the specified output iterator and value type.
-template <typename OutputIt, typename T = typename OutputIt::value_type>
-class output_range {
- private:
-  OutputIt it_;
-
- public:
-  using value_type = T;
-  using iterator = OutputIt;
-  struct sentinel {};
-
-  explicit output_range(OutputIt it) : it_(it) {}
-  OutputIt begin() const { return it_; }
-  sentinel end() const { return {}; }  // Sentinel is not used yet.
-};
-
-// A range with an iterator appending to a buffer.
-template <typename T>
-class buffer_range
-    : public output_range<std::back_insert_iterator<buffer<T>>, T> {
- public:
-  using iterator = std::back_insert_iterator<buffer<T>>;
-  using output_range<iterator, T>::output_range;
-  buffer_range(buffer<T>& buf)
-      : output_range<iterator, T>(std::back_inserter(buf)) {}
-};
-
-template <typename Char>
-inline size_t count_code_points(basic_string_view<Char> s) {
-  return s.size();
-}
-
-// Counts the number of code points in a UTF-8 string.
-inline size_t count_code_points(basic_string_view<char8_t> s) {
-  const char8_t* data = s.data();
-  size_t num_code_points = 0;
-  for (size_t i = 0, size = s.size(); i != size; ++i) {
-    if ((data[i] & 0xc0) != 0x80) ++num_code_points;
-  }
-  return num_code_points;
-}
-
-inline char8_t to_char8_t(char c) { return static_cast<char8_t>(c); }
-
-template <typename InputIt, typename OutChar>
-using needs_conversion = bool_constant<
-    std::is_same<typename std::iterator_traits<InputIt>::value_type,
-                 char>::value &&
-    std::is_same<OutChar, char8_t>::value>;
-
-template <typename OutChar, typename InputIt, typename OutputIt,
-          FMT_ENABLE_IF(!needs_conversion<InputIt, OutChar>::value)>
-OutputIt copy_str(InputIt begin, InputIt end, OutputIt it) {
-  return std::copy(begin, end, it);
-}
-
-template <typename OutChar, typename InputIt, typename OutputIt,
-          FMT_ENABLE_IF(needs_conversion<InputIt, OutChar>::value)>
-OutputIt copy_str(InputIt begin, InputIt end, OutputIt it) {
-  return std::transform(begin, end, it, to_char8_t);
-}
-
-#ifndef FMT_USE_GRISU
-#  define FMT_USE_GRISU 0
-#endif
-
-template <typename T> constexpr bool use_grisu() {
-  return FMT_USE_GRISU && std::numeric_limits<double>::is_iec559 &&
-         sizeof(T) <= sizeof(double);
-}
-
-template <typename T>
-template <typename U>
-void buffer<T>::append(const U* begin, const U* end) {
-  std::size_t new_size = size_ + to_unsigned(end - begin);
-  reserve(new_size);
-  std::uninitialized_copy(begin, end, make_checked(ptr_, capacity_) + size_);
-  size_ = new_size;
-}
-}  // namespace internal
-
-// A UTF-8 string view.
-class u8string_view : public basic_string_view<char8_t> {
- public:
-  u8string_view(const char* s)
-      : basic_string_view<char8_t>(reinterpret_cast<const char8_t*>(s)) {}
-  u8string_view(const char* s, size_t count) FMT_NOEXCEPT
-      : basic_string_view<char8_t>(reinterpret_cast<const char8_t*>(s), count) {
-  }
-};
-
-#if FMT_USE_USER_DEFINED_LITERALS
-inline namespace literals {
-inline u8string_view operator"" _u(const char* s, std::size_t n) {
-  return {s, n};
-}
-}  // namespace literals
-#endif
-
-// The number of characters to store in the basic_memory_buffer object itself
-// to avoid dynamic memory allocation.
-enum { inline_buffer_size = 500 };
-
-/**
-  \rst
-  A dynamically growing memory buffer for trivially copyable/constructible types
-  with the first ``SIZE`` elements stored in the object itself.
-
-  You can use one of the following type aliases for common character types:
-
-  +----------------+------------------------------+
-  | Type           | Definition                   |
-  +================+==============================+
-  | memory_buffer  | basic_memory_buffer<char>    |
-  +----------------+------------------------------+
-  | wmemory_buffer | basic_memory_buffer<wchar_t> |
-  +----------------+------------------------------+
-
-  **Example**::
-
-     fmt::memory_buffer out;
-     format_to(out, "The answer is {}.", 42);
-
-  This will append the following output to the ``out`` object:
-
-  .. code-block:: none
-
-     The answer is 42.
-
-  The output can be converted to an ``std::string`` with ``to_string(out)``.
-  \endrst
- */
-template <typename T, std::size_t SIZE = inline_buffer_size,
-          typename Allocator = std::allocator<T>>
-class basic_memory_buffer : private Allocator, public internal::buffer<T> {
- private:
-  T store_[SIZE];
-
-  // Deallocate memory allocated by the buffer.
-  void deallocate() {
-    T* data = this->data();
-    if (data != store_) Allocator::deallocate(data, this->capacity());
-  }
-
- protected:
-  void grow(std::size_t size) FMT_OVERRIDE;
-
- public:
-  using value_type = T;
-  using const_reference = const T&;
-
-  explicit basic_memory_buffer(const Allocator& alloc = Allocator())
-      : Allocator(alloc) {
-    this->set(store_, SIZE);
-  }
-  ~basic_memory_buffer() { deallocate(); }
-
- private:
-  // Move data from other to this buffer.
-  void move(basic_memory_buffer& other) {
-    Allocator &this_alloc = *this, &other_alloc = other;
-    this_alloc = std::move(other_alloc);
-    T* data = other.data();
-    std::size_t size = other.size(), capacity = other.capacity();
-    if (data == other.store_) {
-      this->set(store_, capacity);
-      std::uninitialized_copy(other.store_, other.store_ + size,
-                              internal::make_checked(store_, capacity));
-    } else {
-      this->set(data, capacity);
-      // Set pointer to the inline array so that delete is not called
-      // when deallocating.
-      other.set(other.store_, 0);
-    }
-    this->resize(size);
-  }
-
- public:
-  /**
-    \rst
-    Constructs a :class:`fmt::basic_memory_buffer` object moving the content
-    of the other object to it.
-    \endrst
-   */
-  basic_memory_buffer(basic_memory_buffer&& other) { move(other); }
-
-  /**
-    \rst
-    Moves the content of the other ``basic_memory_buffer`` object to this one.
-    \endrst
-   */
-  basic_memory_buffer& operator=(basic_memory_buffer&& other) {
-    assert(this != &other);
-    deallocate();
-    move(other);
-    return *this;
-  }
-
-  // Returns a copy of the allocator associated with this buffer.
-  Allocator get_allocator() const { return *this; }
-};
-
-template <typename T, std::size_t SIZE, typename Allocator>
-void basic_memory_buffer<T, SIZE, Allocator>::grow(std::size_t size) {
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-  if (size > 1000) throw std::runtime_error("fuzz mode - won't grow that much");
-#endif
-  std::size_t old_capacity = this->capacity();
-  std::size_t new_capacity = old_capacity + old_capacity / 2;
-  if (size > new_capacity) new_capacity = size;
-  T* old_data = this->data();
-  T* new_data = std::allocator_traits<Allocator>::allocate(*this, new_capacity);
-  // The following code doesn't throw, so the raw pointer above doesn't leak.
-  std::uninitialized_copy(old_data, old_data + this->size(),
-                          internal::make_checked(new_data, new_capacity));
-  this->set(new_data, new_capacity);
-  // deallocate must not throw according to the standard, but even if it does,
-  // the buffer already uses the new storage and will deallocate it in
-  // destructor.
-  if (old_data != store_) Allocator::deallocate(old_data, old_capacity);
-}
-
-using memory_buffer = basic_memory_buffer<char>;
-using wmemory_buffer = basic_memory_buffer<wchar_t>;
-
-/** A formatting error such as invalid format string. */
-class FMT_API format_error : public std::runtime_error {
- public:
-  explicit format_error(const char* message) : std::runtime_error(message) {}
-  explicit format_error(const std::string& message)
-      : std::runtime_error(message) {}
-  ~format_error() FMT_NOEXCEPT;
-};
-
-namespace internal {
-
-// Returns true if value is negative, false otherwise.
-// Same as `value < 0` but doesn't produce warnings if T is an unsigned type.
-template <typename T, FMT_ENABLE_IF(std::numeric_limits<T>::is_signed)>
-FMT_CONSTEXPR bool is_negative(T value) {
-  return value < 0;
-}
-template <typename T, FMT_ENABLE_IF(!std::numeric_limits<T>::is_signed)>
-FMT_CONSTEXPR bool is_negative(T) {
-  return false;
-}
-
-// Smallest of uint32_t and uint64_t that is large enough to represent all
-// values of T.
-template <typename T>
-using uint32_or_64_t =
-    conditional_t<std::numeric_limits<T>::digits <= 32, uint32_t, uint64_t>;
-
-// Static data is placed in this class template for the header-only config.
-template <typename T = void> struct FMT_EXTERN_TEMPLATE_API basic_data {
-  static const uint64_t powers_of_10_64[];
-  static const uint32_t zero_or_powers_of_10_32[];
-  static const uint64_t zero_or_powers_of_10_64[];
-  static const uint64_t pow10_significands[];
-  static const int16_t pow10_exponents[];
-  static const char digits[];
-  static const char hex_digits[];
-  static const char foreground_color[];
-  static const char background_color[];
-  static const char reset_color[5];
-  static const wchar_t wreset_color[5];
-};
-
-FMT_EXTERN template struct basic_data<void>;
-
-// This is a struct rather than an alias to avoid shadowing warnings in gcc.
-struct data : basic_data<> {};
-
-#ifdef FMT_BUILTIN_CLZLL
-// Returns the number of decimal digits in n. Leading zeros are not counted
-// except for n == 0 in which case count_digits returns 1.
-inline int count_digits(uint64_t n) {
-  // Based on http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10
-  // and the benchmark https://github.com/localvoid/cxx-benchmark-count-digits.
-  int t = (64 - FMT_BUILTIN_CLZLL(n | 1)) * 1233 >> 12;
-  return t - (n < data::zero_or_powers_of_10_64[t]) + 1;
-}
-#else
-// Fallback version of count_digits used when __builtin_clz is not available.
-inline int count_digits(uint64_t n) {
-  int count = 1;
-  for (;;) {
-    // Integer division is slow so do it for a group of four digits instead
-    // of for every digit. The idea comes from the talk by Alexandrescu
-    // "Three Optimization Tips for C++". See speed-test for a comparison.
-    if (n < 10) return count;
-    if (n < 100) return count + 1;
-    if (n < 1000) return count + 2;
-    if (n < 10000) return count + 3;
-    n /= 10000u;
-    count += 4;
-  }
-}
-#endif
-
-// Counts the number of digits in n. BITS = log2(radix).
-template <unsigned BITS, typename UInt> inline int count_digits(UInt n) {
-  int num_digits = 0;
-  do {
-    ++num_digits;
-  } while ((n >>= BITS) != 0);
-  return num_digits;
-}
-
-template <> int count_digits<4>(internal::fallback_uintptr n);
-
-#if FMT_HAS_CPP_ATTRIBUTE(always_inline)
-#  define FMT_ALWAYS_INLINE __attribute__((always_inline))
-#else
-#  define FMT_ALWAYS_INLINE
-#endif
-
-template <typename Handler>
-inline char* lg(uint32_t n, Handler h) FMT_ALWAYS_INLINE;
-
-// Computes g = floor(log10(n)) and calls h.on<g>(n);
-template <typename Handler> inline char* lg(uint32_t n, Handler h) {
-  return n < 100 ? n < 10 ? h.template on<0>(n) : h.template on<1>(n)
-                 : n < 1000000
-                       ? n < 10000 ? n < 1000 ? h.template on<2>(n)
-                                              : h.template on<3>(n)
-                                   : n < 100000 ? h.template on<4>(n)
-                                                : h.template on<5>(n)
-                       : n < 100000000 ? n < 10000000 ? h.template on<6>(n)
-                                                      : h.template on<7>(n)
-                                       : n < 1000000000 ? h.template on<8>(n)
-                                                        : h.template on<9>(n);
-}
-
-// An lg handler that formats a decimal number.
-// Usage: lg(n, decimal_formatter(buffer));
-class decimal_formatter {
- private:
-  char* buffer_;
-
-  void write_pair(unsigned N, uint32_t index) {
-    std::memcpy(buffer_ + N, data::digits + index * 2, 2);
-  }
-
- public:
-  explicit decimal_formatter(char* buf) : buffer_(buf) {}
-
-  template <unsigned N> char* on(uint32_t u) {
-    if (N == 0) {
-      *buffer_ = static_cast<char>(u) + '0';
-    } else if (N == 1) {
-      write_pair(0, u);
-    } else {
-      // The idea of using 4.32 fixed-point numbers is based on
-      // https://github.com/jeaiii/itoa
-      unsigned n = N - 1;
-      unsigned a = n / 5 * n * 53 / 16;
-      uint64_t t =
-          ((1ULL << (32 + a)) / data::zero_or_powers_of_10_32[n] + 1 - n / 9);
-      t = ((t * u) >> a) + n / 5 * 4;
-      write_pair(0, t >> 32);
-      for (unsigned i = 2; i < N; i += 2) {
-        t = 100ULL * static_cast<uint32_t>(t);
-        write_pair(i, t >> 32);
-      }
-      if (N % 2 == 0) {
-        buffer_[N] =
-            static_cast<char>((10ULL * static_cast<uint32_t>(t)) >> 32) + '0';
-      }
-    }
-    return buffer_ += N + 1;
-  }
-};
-
-#ifdef FMT_BUILTIN_CLZ
-// Optional version of count_digits for better performance on 32-bit platforms.
-inline int count_digits(uint32_t n) {
-  int t = (32 - FMT_BUILTIN_CLZ(n | 1)) * 1233 >> 12;
-  return t - (n < data::zero_or_powers_of_10_32[t]) + 1;
-}
-#endif
-
-template <typename Char> FMT_API Char thousands_sep_impl(locale_ref loc);
-template <typename Char> inline Char thousands_sep(locale_ref loc) {
-  return Char(thousands_sep_impl<char>(loc));
-}
-template <> inline wchar_t thousands_sep(locale_ref loc) {
-  return thousands_sep_impl<wchar_t>(loc);
-}
-
-template <typename Char> FMT_API Char decimal_point_impl(locale_ref loc);
-template <typename Char> inline Char decimal_point(locale_ref loc) {
-  return Char(decimal_point_impl<char>(loc));
-}
-template <> inline wchar_t decimal_point(locale_ref loc) {
-  return decimal_point_impl<wchar_t>(loc);
-}
-
-// Formats a decimal unsigned integer value writing into buffer.
-// add_thousands_sep is called after writing each char to add a thousands
-// separator if necessary.
-template <typename UInt, typename Char, typename F>
-inline Char* format_decimal(Char* buffer, UInt value, int num_digits,
-                            F add_thousands_sep) {
-  FMT_ASSERT(num_digits >= 0, "invalid digit count");
-  buffer += num_digits;
-  Char* end = buffer;
-  while (value >= 100) {
-    // Integer division is slow so do it for a group of two digits instead
-    // of for every digit. The idea comes from the talk by Alexandrescu
-    // "Three Optimization Tips for C++". See speed-test for a comparison.
-    unsigned index = static_cast<unsigned>((value % 100) * 2);
-    value /= 100;
-    *--buffer = static_cast<Char>(data::digits[index + 1]);
-    add_thousands_sep(buffer);
-    *--buffer = static_cast<Char>(data::digits[index]);
-    add_thousands_sep(buffer);
-  }
-  if (value < 10) {
-    *--buffer = static_cast<Char>('0' + value);
-    return end;
-  }
-  unsigned index = static_cast<unsigned>(value * 2);
-  *--buffer = static_cast<Char>(data::digits[index + 1]);
-  add_thousands_sep(buffer);
-  *--buffer = static_cast<Char>(data::digits[index]);
-  return end;
-}
-
-template <typename Char, typename UInt, typename Iterator, typename F>
-inline Iterator format_decimal(Iterator out, UInt value, int num_digits,
-                               F add_thousands_sep) {
-  FMT_ASSERT(num_digits >= 0, "invalid digit count");
-  // Buffer should be large enough to hold all digits (<= digits10 + 1).
-  enum { max_size = std::numeric_limits<UInt>::digits10 + 1 };
-  Char buffer[max_size + max_size / 3];
-  auto end = format_decimal(buffer, value, num_digits, add_thousands_sep);
-  return internal::copy_str<Char>(buffer, end, out);
-}
-
-template <typename Char, typename It, typename UInt>
-inline It format_decimal(It out, UInt value, int num_digits) {
-  return format_decimal<Char>(out, value, num_digits, [](Char*) {});
-}
-
-template <unsigned BASE_BITS, typename Char, typename UInt>
-inline Char* format_uint(Char* buffer, UInt value, int num_digits,
-                         bool upper = false) {
-  buffer += num_digits;
-  Char* end = buffer;
-  do {
-    const char* digits = upper ? "0123456789ABCDEF" : data::hex_digits;
-    unsigned digit = (value & ((1 << BASE_BITS) - 1));
-    *--buffer = static_cast<Char>(BASE_BITS < 4 ? static_cast<char>('0' + digit)
-                                                : digits[digit]);
-  } while ((value >>= BASE_BITS) != 0);
-  return end;
-}
-
-template <unsigned BASE_BITS, typename Char>
-Char* format_uint(Char* buffer, internal::fallback_uintptr n, int num_digits,
-                  bool = false) {
-  auto char_digits = std::numeric_limits<unsigned char>::digits / 4;
-  int start = (num_digits + char_digits - 1) / char_digits - 1;
-  if (int start_digits = num_digits % char_digits) {
-    unsigned value = n.value[start--];
-    buffer = format_uint<BASE_BITS>(buffer, value, start_digits);
-  }
-  for (; start >= 0; --start) {
-    unsigned value = n.value[start];
-    buffer += char_digits;
-    auto p = buffer;
-    for (int i = 0; i < char_digits; ++i) {
-      unsigned digit = (value & ((1 << BASE_BITS) - 1));
-      *--p = static_cast<Char>(data::hex_digits[digit]);
-      value >>= BASE_BITS;
-    }
-  }
-  return buffer;
-}
-
-template <unsigned BASE_BITS, typename Char, typename It, typename UInt>
-inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
-  // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1).
-  char buffer[std::numeric_limits<UInt>::digits / BASE_BITS + 1];
-  format_uint<BASE_BITS>(buffer, value, num_digits, upper);
-  return internal::copy_str<Char>(buffer, buffer + num_digits, out);
-}
-
-#ifndef _WIN32
-#  define FMT_USE_WINDOWS_H 0
-#elif !defined(FMT_USE_WINDOWS_H)
-#  define FMT_USE_WINDOWS_H 1
-#endif
-
-// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
-// All the functionality that relies on it will be disabled too.
-#if FMT_USE_WINDOWS_H
-// A converter from UTF-8 to UTF-16.
-// It is only provided for Windows since other systems support UTF-8 natively.
-class utf8_to_utf16 {
- private:
-  wmemory_buffer buffer_;
-
- public:
-  FMT_API explicit utf8_to_utf16(string_view s);
-  operator wstring_view() const { return wstring_view(&buffer_[0], size()); }
-  size_t size() const { return buffer_.size() - 1; }
-  const wchar_t* c_str() const { return &buffer_[0]; }
-  std::wstring str() const { return std::wstring(&buffer_[0], size()); }
-};
-
-// A converter from UTF-16 to UTF-8.
-// It is only provided for Windows since other systems support UTF-8 natively.
-class utf16_to_utf8 {
- private:
-  memory_buffer buffer_;
-
- public:
-  utf16_to_utf8() {}
-  FMT_API explicit utf16_to_utf8(wstring_view s);
-  operator string_view() const { return string_view(&buffer_[0], size()); }
-  size_t size() const { return buffer_.size() - 1; }
-  const char* c_str() const { return &buffer_[0]; }
-  std::string str() const { return std::string(&buffer_[0], size()); }
-
-  // Performs conversion returning a system error code instead of
-  // throwing exception on conversion error. This method may still throw
-  // in case of memory allocation error.
-  FMT_API int convert(wstring_view s);
-};
-
-FMT_API void format_windows_error(fmt::internal::buffer<char>& out,
-                                  int error_code,
-                                  fmt::string_view message) FMT_NOEXCEPT;
-#endif
-
-template <typename T = void> struct null {};
-
-// Workaround an array initialization issue in gcc 4.8.
-template <typename Char> struct fill_t {
- private:
-  Char data_[6];
-
- public:
-  FMT_CONSTEXPR Char& operator[](size_t index) { return data_[index]; }
-  FMT_CONSTEXPR const Char& operator[](size_t index) const {
-    return data_[index];
-  }
-
-  static FMT_CONSTEXPR fill_t<Char> make() {
-    auto fill = fill_t<Char>();
-    fill[0] = Char(' ');
-    return fill;
-  }
-};
-}  // namespace internal
-
-// We cannot use enum classes as bit fields because of a gcc bug
-// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414.
-namespace align {
-enum type { none, left, right, center, numeric };
-}
-using align_t = align::type;
-
-namespace sign {
-enum type { none, minus, plus, space };
-}
-using sign_t = sign::type;
-
-// Format specifiers for built-in and string types.
-template <typename Char> struct basic_format_specs {
-  int width;
-  int precision;
-  char type;
-  align_t align : 4;
-  sign_t sign : 3;
-  bool alt : 1;  // Alternate form ('#').
-  internal::fill_t<Char> fill;
-
-  constexpr basic_format_specs()
-      : width(0),
-        precision(-1),
-        type(0),
-        align(align::none),
-        sign(sign::none),
-        alt(false),
-        fill(internal::fill_t<Char>::make()) {}
-};
-
-using format_specs = basic_format_specs<char>;
-
-namespace internal {
-
-// Writes the exponent exp in the form "[+-]d{2,3}" to buffer.
-template <typename Char, typename It> It write_exponent(int exp, It it) {
-  FMT_ASSERT(-1000 < exp && exp < 1000, "exponent out of range");
-  if (exp < 0) {
-    *it++ = static_cast<Char>('-');
-    exp = -exp;
-  } else {
-    *it++ = static_cast<Char>('+');
-  }
-  if (exp >= 100) {
-    *it++ = static_cast<Char>(static_cast<char>('0' + exp / 100));
-    exp %= 100;
-  }
-  const char* d = data::digits + exp * 2;
-  *it++ = static_cast<Char>(d[0]);
-  *it++ = static_cast<Char>(d[1]);
-  return it;
-}
-
-struct gen_digits_params {
-  int num_digits;
-  bool fixed;
-  bool upper;
-  bool trailing_zeros;
-};
-
-// The number is given as v = digits * pow(10, exp).
-template <typename Char, typename It>
-It grisu_prettify(const char* digits, int size, int exp, It it,
-                  gen_digits_params params, Char decimal_point) {
-  // pow(10, full_exp - 1) <= v <= pow(10, full_exp).
-  int full_exp = size + exp;
-  if (!params.fixed) {
-    // Insert a decimal point after the first digit and add an exponent.
-    *it++ = static_cast<Char>(*digits);
-    if (size > 1) *it++ = decimal_point;
-    exp += size - 1;
-    it = copy_str<Char>(digits + 1, digits + size, it);
-    if (size < params.num_digits)
-      it = std::fill_n(it, params.num_digits - size, static_cast<Char>('0'));
-    *it++ = static_cast<Char>(params.upper ? 'E' : 'e');
-    return write_exponent<Char>(exp, it);
-  }
-  if (size <= full_exp) {
-    // 1234e7 -> 12340000000[.0+]
-    it = copy_str<Char>(digits, digits + size, it);
-    it = std::fill_n(it, full_exp - size, static_cast<Char>('0'));
-    int num_zeros = (std::max)(params.num_digits - full_exp, 1);
-    if (params.trailing_zeros) {
-      *it++ = decimal_point;
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-      if (num_zeros > 1000)
-        throw std::runtime_error("fuzz mode - avoiding excessive cpu use");
-#endif
-      it = std::fill_n(it, num_zeros, static_cast<Char>('0'));
-    }
-  } else if (full_exp > 0) {
-    // 1234e-2 -> 12.34[0+]
-    it = copy_str<Char>(digits, digits + full_exp, it);
-    if (!params.trailing_zeros) {
-      // Remove trailing zeros.
-      while (size > full_exp && digits[size - 1] == '0') --size;
-      if (size != full_exp) *it++ = decimal_point;
-      return copy_str<Char>(digits + full_exp, digits + size, it);
-    }
-    *it++ = decimal_point;
-    it = copy_str<Char>(digits + full_exp, digits + size, it);
-    if (params.num_digits > size) {
-      // Add trailing zeros.
-      int num_zeros = params.num_digits - size;
-      it = std::fill_n(it, num_zeros, static_cast<Char>('0'));
-    }
-  } else {
-    // 1234e-6 -> 0.001234
-    *it++ = static_cast<Char>('0');
-    int num_zeros = -full_exp;
-    if (params.num_digits >= 0 && params.num_digits < num_zeros)
-      num_zeros = params.num_digits;
-    if (!params.trailing_zeros)
-      while (size > 0 && digits[size - 1] == '0') --size;
-    if (num_zeros != 0 || size != 0) {
-      *it++ = decimal_point;
-      it = std::fill_n(it, num_zeros, static_cast<Char>('0'));
-      it = copy_str<Char>(digits, digits + size, it);
-    }
-  }
-  return it;
-}
-
-namespace grisu_options {
-enum { fixed = 1, grisu3 = 2 };
-}
-
-// Formats value using the Grisu algorithm:
-// https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf
-template <typename Double, FMT_ENABLE_IF(sizeof(Double) == sizeof(uint64_t))>
-FMT_API bool grisu_format(Double, buffer<char>&, int, unsigned, int&);
-template <typename Double, FMT_ENABLE_IF(sizeof(Double) != sizeof(uint64_t))>
-inline bool grisu_format(Double, buffer<char>&, int, unsigned, int&) {
-  return false;
-}
-
-struct sprintf_specs {
-  int precision;
-  char type;
-  bool alt : 1;
-
-  template <typename Char>
-  constexpr sprintf_specs(basic_format_specs<Char> specs)
-      : precision(specs.precision), type(specs.type), alt(specs.alt) {}
-
-  constexpr bool has_precision() const { return precision >= 0; }
-};
-
-template <typename Double>
-char* sprintf_format(Double, internal::buffer<char>&, sprintf_specs);
-
-template <typename Handler>
-FMT_CONSTEXPR void handle_int_type_spec(char spec, Handler&& handler) {
-  switch (spec) {
-  case 0:
-  case 'd':
-    handler.on_dec();
-    break;
-  case 'x':
-  case 'X':
-    handler.on_hex();
-    break;
-  case 'b':
-  case 'B':
-    handler.on_bin();
-    break;
-  case 'o':
-    handler.on_oct();
-    break;
-  case 'n':
-    handler.on_num();
-    break;
-  default:
-    handler.on_error();
-  }
-}
-
-template <typename Handler>
-FMT_CONSTEXPR void handle_float_type_spec(char spec, Handler&& handler) {
-  switch (spec) {
-  case 0:
-  case 'g':
-  case 'G':
-    handler.on_general();
-    break;
-  case 'e':
-  case 'E':
-    handler.on_exp();
-    break;
-  case 'f':
-  case 'F':
-    handler.on_fixed();
-    break;
-  case '%':
-    handler.on_percent();
-    break;
-  case 'a':
-  case 'A':
-    handler.on_hex();
-    break;
-  case 'n':
-    handler.on_num();
-    break;
-  default:
-    handler.on_error();
-    break;
-  }
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR void handle_char_specs(const basic_format_specs<Char>* specs,
-                                     Handler&& handler) {
-  if (!specs) return handler.on_char();
-  if (specs->type && specs->type != 'c') return handler.on_int();
-  if (specs->align == align::numeric || specs->sign != sign::none || specs->alt)
-    handler.on_error("invalid format specifier for char");
-  handler.on_char();
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR void handle_cstring_type_spec(Char spec, Handler&& handler) {
-  if (spec == 0 || spec == 's')
-    handler.on_string();
-  else if (spec == 'p')
-    handler.on_pointer();
-  else
-    handler.on_error("invalid type specifier");
-}
-
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR void check_string_type_spec(Char spec, ErrorHandler&& eh) {
-  if (spec != 0 && spec != 's') eh.on_error("invalid type specifier");
-}
-
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR void check_pointer_type_spec(Char spec, ErrorHandler&& eh) {
-  if (spec != 0 && spec != 'p') eh.on_error("invalid type specifier");
-}
-
-template <typename ErrorHandler> class int_type_checker : private ErrorHandler {
- public:
-  FMT_CONSTEXPR explicit int_type_checker(ErrorHandler eh) : ErrorHandler(eh) {}
-
-  FMT_CONSTEXPR void on_dec() {}
-  FMT_CONSTEXPR void on_hex() {}
-  FMT_CONSTEXPR void on_bin() {}
-  FMT_CONSTEXPR void on_oct() {}
-  FMT_CONSTEXPR void on_num() {}
-
-  FMT_CONSTEXPR void on_error() {
-    ErrorHandler::on_error("invalid type specifier");
-  }
-};
-
-template <typename ErrorHandler>
-class float_type_checker : private ErrorHandler {
- public:
-  FMT_CONSTEXPR explicit float_type_checker(ErrorHandler eh)
-      : ErrorHandler(eh) {}
-
-  FMT_CONSTEXPR void on_general() {}
-  FMT_CONSTEXPR void on_exp() {}
-  FMT_CONSTEXPR void on_fixed() {}
-  FMT_CONSTEXPR void on_percent() {}
-  FMT_CONSTEXPR void on_hex() {}
-  FMT_CONSTEXPR void on_num() {}
-
-  FMT_CONSTEXPR void on_error() {
-    ErrorHandler::on_error("invalid type specifier");
-  }
-};
-
-template <typename ErrorHandler>
-class char_specs_checker : public ErrorHandler {
- private:
-  char type_;
-
- public:
-  FMT_CONSTEXPR char_specs_checker(char type, ErrorHandler eh)
-      : ErrorHandler(eh), type_(type) {}
-
-  FMT_CONSTEXPR void on_int() {
-    handle_int_type_spec(type_, int_type_checker<ErrorHandler>(*this));
-  }
-  FMT_CONSTEXPR void on_char() {}
-};
-
-template <typename ErrorHandler>
-class cstring_type_checker : public ErrorHandler {
- public:
-  FMT_CONSTEXPR explicit cstring_type_checker(ErrorHandler eh)
-      : ErrorHandler(eh) {}
-
-  FMT_CONSTEXPR void on_string() {}
-  FMT_CONSTEXPR void on_pointer() {}
-};
-
-template <typename Context>
-void arg_map<Context>::init(const basic_format_args<Context>& args) {
-  if (map_) return;
-  map_ = new entry[internal::to_unsigned(args.max_size())];
-  if (args.is_packed()) {
-    for (int i = 0;; ++i) {
-      internal::type arg_type = args.type(i);
-      if (arg_type == internal::none_type) return;
-      if (arg_type == internal::named_arg_type) push_back(args.values_[i]);
-    }
-  }
-  for (int i = 0, n = args.max_size(); i < n; ++i) {
-    auto type = args.args_[i].type_;
-    if (type == internal::named_arg_type) push_back(args.args_[i].value_);
-  }
-}
-
-// This template provides operations for formatting and writing data into a
-// character range.
-template <typename Range> class basic_writer {
- public:
-  using char_type = typename Range::value_type;
-  using iterator = typename Range::iterator;
-  using format_specs = basic_format_specs<char_type>;
-
- private:
-  iterator out_;  // Output iterator.
-  internal::locale_ref locale_;
-
-  // Attempts to reserve space for n extra characters in the output range.
-  // Returns a pointer to the reserved range or a reference to out_.
-  auto reserve(std::size_t n) -> decltype(internal::reserve(out_, n)) {
-    return internal::reserve(out_, n);
-  }
-
-  template <typename F> struct padded_int_writer {
-    size_t size_;
-    string_view prefix;
-    char_type fill;
-    std::size_t padding;
-    F f;
-
-    size_t size() const { return size_; }
-    size_t width() const { return size_; }
-
-    template <typename It> void operator()(It&& it) const {
-      if (prefix.size() != 0)
-        it = internal::copy_str<char_type>(prefix.begin(), prefix.end(), it);
-      it = std::fill_n(it, padding, fill);
-      f(it);
-    }
-  };
-
-  // Writes an integer in the format
-  //   <left-padding><prefix><numeric-padding><digits><right-padding>
-  // where <digits> are written by f(it).
-  template <typename F>
-  void write_int(int num_digits, string_view prefix, format_specs specs, F f) {
-    std::size_t size = prefix.size() + internal::to_unsigned(num_digits);
-    char_type fill = specs.fill[0];
-    std::size_t padding = 0;
-    if (specs.align == align::numeric) {
-      auto unsiged_width = internal::to_unsigned(specs.width);
-      if (unsiged_width > size) {
-        padding = unsiged_width - size;
-        size = unsiged_width;
-      }
-    } else if (specs.precision > num_digits) {
-      size = prefix.size() + internal::to_unsigned(specs.precision);
-      padding = internal::to_unsigned(specs.precision - num_digits);
-      fill = static_cast<char_type>('0');
-    }
-    if (specs.align == align::none) specs.align = align::right;
-    write_padded(specs, padded_int_writer<F>{size, prefix, fill, padding, f});
-  }
-
-  // Writes a decimal integer.
-  template <typename Int> void write_decimal(Int value) {
-    auto abs_value = static_cast<uint32_or_64_t<Int>>(value);
-    bool is_negative = internal::is_negative(value);
-    if (is_negative) abs_value = 0 - abs_value;
-    int num_digits = internal::count_digits(abs_value);
-    auto&& it =
-        reserve((is_negative ? 1 : 0) + static_cast<size_t>(num_digits));
-    if (is_negative) *it++ = static_cast<char_type>('-');
-    it = internal::format_decimal<char_type>(it, abs_value, num_digits);
-  }
-
-  // The handle_int_type_spec handler that writes an integer.
-  template <typename Int, typename Specs> struct int_writer {
-    using unsigned_type = uint32_or_64_t<Int>;
-
-    basic_writer<Range>& writer;
-    const Specs& specs;
-    unsigned_type abs_value;
-    char prefix[4];
-    unsigned prefix_size;
-
-    string_view get_prefix() const { return string_view(prefix, prefix_size); }
-
-    int_writer(basic_writer<Range>& w, Int value, const Specs& s)
-        : writer(w),
-          specs(s),
-          abs_value(static_cast<unsigned_type>(value)),
-          prefix_size(0) {
-      if (internal::is_negative(value)) {
-        prefix[0] = '-';
-        ++prefix_size;
-        abs_value = 0 - abs_value;
-      } else if (specs.sign != sign::none && specs.sign != sign::minus) {
-        prefix[0] = specs.sign == sign::plus ? '+' : ' ';
-        ++prefix_size;
-      }
-    }
-
-    struct dec_writer {
-      unsigned_type abs_value;
-      int num_digits;
-
-      template <typename It> void operator()(It&& it) const {
-        it = internal::format_decimal<char_type>(it, abs_value, num_digits);
-      }
-    };
-
-    void on_dec() {
-      int num_digits = internal::count_digits(abs_value);
-      writer.write_int(num_digits, get_prefix(), specs,
-                       dec_writer{abs_value, num_digits});
-    }
-
-    struct hex_writer {
-      int_writer& self;
-      int num_digits;
-
-      template <typename It> void operator()(It&& it) const {
-        it = internal::format_uint<4, char_type>(it, self.abs_value, num_digits,
-                                                 self.specs.type != 'x');
-      }
-    };
-
-    void on_hex() {
-      if (specs.alt) {
-        prefix[prefix_size++] = '0';
-        prefix[prefix_size++] = specs.type;
-      }
-      int num_digits = internal::count_digits<4>(abs_value);
-      writer.write_int(num_digits, get_prefix(), specs,
-                       hex_writer{*this, num_digits});
-    }
-
-    template <int BITS> struct bin_writer {
-      unsigned_type abs_value;
-      int num_digits;
-
-      template <typename It> void operator()(It&& it) const {
-        it = internal::format_uint<BITS, char_type>(it, abs_value, num_digits);
-      }
-    };
-
-    void on_bin() {
-      if (specs.alt) {
-        prefix[prefix_size++] = '0';
-        prefix[prefix_size++] = static_cast<char>(specs.type);
-      }
-      int num_digits = internal::count_digits<1>(abs_value);
-      writer.write_int(num_digits, get_prefix(), specs,
-                       bin_writer<1>{abs_value, num_digits});
-    }
-
-    void on_oct() {
-      int num_digits = internal::count_digits<3>(abs_value);
-      if (specs.alt && specs.precision <= num_digits) {
-        // Octal prefix '0' is counted as a digit, so only add it if precision
-        // is not greater than the number of digits.
-        prefix[prefix_size++] = '0';
-      }
-      writer.write_int(num_digits, get_prefix(), specs,
-                       bin_writer<3>{abs_value, num_digits});
-    }
-
-    enum { sep_size = 1 };
-
-    struct num_writer {
-      unsigned_type abs_value;
-      int size;
-      char_type sep;
-
-      template <typename It> void operator()(It&& it) const {
-        basic_string_view<char_type> s(&sep, sep_size);
-        // Index of a decimal digit with the least significant digit having
-        // index 0.
-        unsigned digit_index = 0;
-        it = internal::format_decimal<char_type>(
-            it, abs_value, size, [s, &digit_index](char_type*& buffer) {
-              if (++digit_index % 3 != 0) return;
-              buffer -= s.size();
-              std::uninitialized_copy(s.data(), s.data() + s.size(),
-                                      internal::make_checked(buffer, s.size()));
-            });
-      }
-    };
-
-    void on_num() {
-      char_type sep = internal::thousands_sep<char_type>(writer.locale_);
-      if (!sep) return on_dec();
-      int num_digits = internal::count_digits(abs_value);
-      int size = num_digits + sep_size * ((num_digits - 1) / 3);
-      writer.write_int(size, get_prefix(), specs,
-                       num_writer{abs_value, size, sep});
-    }
-
-    FMT_NORETURN void on_error() {
-      FMT_THROW(format_error("invalid type specifier"));
-    }
-  };
-
-  enum { inf_size = 3 };  // This is an enum to workaround a bug in MSVC.
-
-  struct inf_or_nan_writer {
-    char sign;
-    bool as_percentage;
-    const char* str;
-
-    size_t size() const {
-      return static_cast<std::size_t>(inf_size + (sign ? 1 : 0) +
-                                      (as_percentage ? 1 : 0));
-    }
-    size_t width() const { return size(); }
-
-    template <typename It> void operator()(It&& it) const {
-      if (sign) *it++ = static_cast<char_type>(sign);
-      it = internal::copy_str<char_type>(
-          str, str + static_cast<std::size_t>(inf_size), it);
-      if (as_percentage) *it++ = static_cast<char_type>('%');
-    }
-  };
-
-  struct double_writer {
-    char sign;
-    internal::buffer<char>& buffer;
-    char* decimal_point_pos;
-    char_type decimal_point;
-
-    size_t size() const { return buffer.size() + (sign ? 1 : 0); }
-    size_t width() const { return size(); }
-
-    template <typename It> void operator()(It&& it) {
-      if (sign) *it++ = static_cast<char_type>(sign);
-      auto begin = buffer.begin();
-      if (decimal_point_pos) {
-        it = internal::copy_str<char_type>(begin, decimal_point_pos, it);
-        *it++ = decimal_point;
-        begin = decimal_point_pos + 1;
-      }
-      it = internal::copy_str<char_type>(begin, buffer.end(), it);
-    }
-  };
-
-  class grisu_writer {
-   private:
-    internal::buffer<char>& digits_;
-    size_t size_;
-    char sign_;
-    int exp_;
-    internal::gen_digits_params params_;
-    char_type decimal_point_;
-
-   public:
-    grisu_writer(char sign, internal::buffer<char>& digits, int exp,
-                 const internal::gen_digits_params& params,
-                 char_type decimal_point)
-        : digits_(digits),
-          sign_(sign),
-          exp_(exp),
-          params_(params),
-          decimal_point_(decimal_point) {
-      int num_digits = static_cast<int>(digits.size());
-      int full_exp = num_digits + exp - 1;
-      int precision = params.num_digits > 0 ? params.num_digits : 11;
-      params_.fixed |= full_exp >= -4 && full_exp < precision;
-      auto it = internal::grisu_prettify<char>(
-          digits.data(), num_digits, exp, internal::counting_iterator<char>(),
-          params_, '.');
-      size_ = it.count();
-    }
-
-    size_t size() const { return size_ + (sign_ ? 1 : 0); }
-    size_t width() const { return size(); }
-
-    template <typename It> void operator()(It&& it) {
-      if (sign_) *it++ = static_cast<char_type>(sign_);
-      int num_digits = static_cast<int>(digits_.size());
-      it = internal::grisu_prettify<char_type>(digits_.data(), num_digits, exp_,
-                                               it, params_, decimal_point_);
-    }
-  };
-
-  template <typename Char> struct str_writer {
-    const Char* s;
-    size_t size_;
-
-    size_t size() const { return size_; }
-    size_t width() const {
-      return internal::count_code_points(basic_string_view<Char>(s, size_));
-    }
-
-    template <typename It> void operator()(It&& it) const {
-      it = internal::copy_str<char_type>(s, s + size_, it);
-    }
-  };
-
-  template <typename UIntPtr> struct pointer_writer {
-    UIntPtr value;
-    int num_digits;
-
-    size_t size() const { return to_unsigned(num_digits) + 2; }
-    size_t width() const { return size(); }
-
-    template <typename It> void operator()(It&& it) const {
-      *it++ = static_cast<char_type>('0');
-      *it++ = static_cast<char_type>('x');
-      it = internal::format_uint<4, char_type>(it, value, num_digits);
-    }
-  };
-
- public:
-  /** Constructs a ``basic_writer`` object. */
-  explicit basic_writer(Range out,
-                        internal::locale_ref loc = internal::locale_ref())
-      : out_(out.begin()), locale_(loc) {}
-
-  iterator out() const { return out_; }
-
-  // Writes a value in the format
-  //   <left-padding><value><right-padding>
-  // where <value> is written by f(it).
-  template <typename F> void write_padded(const format_specs& specs, F&& f) {
-    // User-perceived width (in code points).
-    unsigned width = to_unsigned(specs.width);
-    size_t size = f.size();  // The number of code units.
-    size_t num_code_points = width != 0 ? f.width() : size;
-    if (width <= num_code_points) return f(reserve(size));
-    auto&& it = reserve(width + (size - num_code_points));
-    char_type fill = specs.fill[0];
-    std::size_t padding = width - num_code_points;
-    if (specs.align == align::right) {
-      it = std::fill_n(it, padding, fill);
-      f(it);
-    } else if (specs.align == align::center) {
-      std::size_t left_padding = padding / 2;
-      it = std::fill_n(it, left_padding, fill);
-      f(it);
-      it = std::fill_n(it, padding - left_padding, fill);
-    } else {
-      f(it);
-      it = std::fill_n(it, padding, fill);
-    }
-  }
-
-  void write(int value) { write_decimal(value); }
-  void write(long value) { write_decimal(value); }
-  void write(long long value) { write_decimal(value); }
-
-  void write(unsigned value) { write_decimal(value); }
-  void write(unsigned long value) { write_decimal(value); }
-  void write(unsigned long long value) { write_decimal(value); }
-
-  // Writes a formatted integer.
-  template <typename T, typename Spec>
-  void write_int(T value, const Spec& spec) {
-    internal::handle_int_type_spec(spec.type,
-                                   int_writer<T, Spec>(*this, value, spec));
-  }
-
-  void write(double value, const format_specs& specs = format_specs()) {
-    write_double(value, specs);
-  }
-
-  /**
-    \rst
-    Formats *value* using the general format for floating-point numbers
-    (``'g'``) and writes it to the buffer.
-    \endrst
-   */
-  void write(long double value, const format_specs& specs = format_specs()) {
-    write_double(value, specs);
-  }
-
-  // Formats a floating-point number (double or long double).
-  template <typename T, bool USE_GRISU = fmt::internal::use_grisu<T>()>
-  void write_double(T value, const format_specs& specs);
-
-  /** Writes a character to the buffer. */
-  void write(char value) {
-    auto&& it = reserve(1);
-    *it++ = value;
-  }
-
-  template <typename Char, FMT_ENABLE_IF(std::is_same<Char, char_type>::value)>
-  void write(Char value) {
-    auto&& it = reserve(1);
-    *it++ = value;
-  }
-
-  /**
-    \rst
-    Writes *value* to the buffer.
-    \endrst
-   */
-  void write(string_view value) {
-    auto&& it = reserve(value.size());
-    it = internal::copy_str<char_type>(value.begin(), value.end(), it);
-  }
-  void write(wstring_view value) {
-    static_assert(std::is_same<char_type, wchar_t>::value, "");
-    auto&& it = reserve(value.size());
-    it = std::copy(value.begin(), value.end(), it);
-  }
-
-  // Writes a formatted string.
-  template <typename Char>
-  void write(const Char* s, std::size_t size, const format_specs& specs) {
-    write_padded(specs, str_writer<Char>{s, size});
-  }
-
-  template <typename Char>
-  void write(basic_string_view<Char> s,
-             const format_specs& specs = format_specs()) {
-    const Char* data = s.data();
-    std::size_t size = s.size();
-    if (specs.precision >= 0 && internal::to_unsigned(specs.precision) < size)
-      size = internal::to_unsigned(specs.precision);
-    write(data, size, specs);
-  }
-
-  template <typename UIntPtr>
-  void write_pointer(UIntPtr value, const format_specs* specs) {
-    int num_digits = internal::count_digits<4>(value);
-    auto pw = pointer_writer<UIntPtr>{value, num_digits};
-    if (!specs) return pw(reserve(to_unsigned(num_digits) + 2));
-    format_specs specs_copy = *specs;
-    if (specs_copy.align == align::none) specs_copy.align = align::right;
-    write_padded(specs_copy, pw);
-  }
-};
-
-using writer = basic_writer<buffer_range<char>>;
-
-template <typename Range, typename ErrorHandler = internal::error_handler>
-class arg_formatter_base {
- public:
-  using char_type = typename Range::value_type;
-  using iterator = typename Range::iterator;
-  using format_specs = basic_format_specs<char_type>;
-
- private:
-  using writer_type = basic_writer<Range>;
-  writer_type writer_;
-  format_specs* specs_;
-
-  struct char_writer {
-    char_type value;
-
-    size_t size() const { return 1; }
-    size_t width() const { return 1; }
-
-    template <typename It> void operator()(It&& it) const { *it++ = value; }
-  };
-
-  void write_char(char_type value) {
-    if (specs_)
-      writer_.write_padded(*specs_, char_writer{value});
-    else
-      writer_.write(value);
-  }
-
-  void write_pointer(const void* p) {
-    writer_.write_pointer(internal::bit_cast<internal::uintptr_t>(p), specs_);
-  }
-
- protected:
-  writer_type& writer() { return writer_; }
-  FMT_DEPRECATED format_specs* spec() { return specs_; }
-  format_specs* specs() { return specs_; }
-  iterator out() { return writer_.out(); }
-
-  void write(bool value) {
-    string_view sv(value ? "true" : "false");
-    specs_ ? writer_.write(sv, *specs_) : writer_.write(sv);
-  }
-
-  void write(const char_type* value) {
-    if (!value) {
-      FMT_THROW(format_error("string pointer is null"));
-    } else {
-      auto length = std::char_traits<char_type>::length(value);
-      basic_string_view<char_type> sv(value, length);
-      specs_ ? writer_.write(sv, *specs_) : writer_.write(sv);
-    }
-  }
-
- public:
-  arg_formatter_base(Range r, format_specs* s, locale_ref loc)
-      : writer_(r, loc), specs_(s) {}
-
-  iterator operator()(monostate) {
-    FMT_ASSERT(false, "invalid argument type");
-    return out();
-  }
-
-  template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-  iterator operator()(T value) {
-    if (specs_)
-      writer_.write_int(value, *specs_);
-    else
-      writer_.write(value);
-    return out();
-  }
-
-  iterator operator()(char_type value) {
-    internal::handle_char_specs(
-        specs_, char_spec_handler(*this, static_cast<char_type>(value)));
-    return out();
-  }
-
-  iterator operator()(bool value) {
-    if (specs_ && specs_->type) return (*this)(value ? 1 : 0);
-    write(value != 0);
-    return out();
-  }
-
-  template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
-  iterator operator()(T value) {
-    writer_.write_double(value, specs_ ? *specs_ : format_specs());
-    return out();
-  }
-
-  struct char_spec_handler : ErrorHandler {
-    arg_formatter_base& formatter;
-    char_type value;
-
-    char_spec_handler(arg_formatter_base& f, char_type val)
-        : formatter(f), value(val) {}
-
-    void on_int() {
-      if (formatter.specs_)
-        formatter.writer_.write_int(value, *formatter.specs_);
-      else
-        formatter.writer_.write(value);
-    }
-    void on_char() { formatter.write_char(value); }
-  };
-
-  struct cstring_spec_handler : internal::error_handler {
-    arg_formatter_base& formatter;
-    const char_type* value;
-
-    cstring_spec_handler(arg_formatter_base& f, const char_type* val)
-        : formatter(f), value(val) {}
-
-    void on_string() { formatter.write(value); }
-    void on_pointer() { formatter.write_pointer(value); }
-  };
-
-  iterator operator()(const char_type* value) {
-    if (!specs_) return write(value), out();
-    internal::handle_cstring_type_spec(specs_->type,
-                                       cstring_spec_handler(*this, value));
-    return out();
-  }
-
-  iterator operator()(basic_string_view<char_type> value) {
-    if (specs_) {
-      internal::check_string_type_spec(specs_->type, internal::error_handler());
-      writer_.write(value, *specs_);
-    } else {
-      writer_.write(value);
-    }
-    return out();
-  }
-
-  iterator operator()(const void* value) {
-    if (specs_)
-      check_pointer_type_spec(specs_->type, internal::error_handler());
-    write_pointer(value);
-    return out();
-  }
-};
-
-template <typename Char> FMT_CONSTEXPR bool is_name_start(Char c) {
-  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || '_' == c;
-}
-
-// Parses the range [begin, end) as an unsigned integer. This function assumes
-// that the range is non-empty and the first character is a digit.
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR int parse_nonnegative_int(const Char*& begin, const Char* end,
-                                        ErrorHandler&& eh) {
-  assert(begin != end && '0' <= *begin && *begin <= '9');
-  if (*begin == '0') {
-    ++begin;
-    return 0;
-  }
-  unsigned value = 0;
-  // Convert to unsigned to prevent a warning.
-  constexpr unsigned max_int = (std::numeric_limits<int>::max)();
-  unsigned big = max_int / 10;
-  do {
-    // Check for overflow.
-    if (value > big) {
-      value = max_int + 1;
-      break;
-    }
-    value = value * 10 + unsigned(*begin - '0');
-    ++begin;
-  } while (begin != end && '0' <= *begin && *begin <= '9');
-  if (value > max_int) eh.on_error("number is too big");
-  return static_cast<int>(value);
-}
-
-template <typename Context> class custom_formatter {
- private:
-  using char_type = typename Context::char_type;
-
-  basic_parse_context<char_type>& parse_ctx_;
-  Context& ctx_;
-
- public:
-  explicit custom_formatter(basic_parse_context<char_type>& parse_ctx,
-                            Context& ctx)
-      : parse_ctx_(parse_ctx), ctx_(ctx) {}
-
-  bool operator()(typename basic_format_arg<Context>::handle h) const {
-    h.format(parse_ctx_, ctx_);
-    return true;
-  }
-
-  template <typename T> bool operator()(T) const { return false; }
-};
-
-template <typename T>
-using is_integer =
-    bool_constant<std::is_integral<T>::value && !std::is_same<T, bool>::value &&
-                  !std::is_same<T, char>::value &&
-                  !std::is_same<T, wchar_t>::value>;
-
-template <typename ErrorHandler> class width_checker {
- public:
-  explicit FMT_CONSTEXPR width_checker(ErrorHandler& eh) : handler_(eh) {}
-
-  template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
-  FMT_CONSTEXPR unsigned long long operator()(T value) {
-    if (is_negative(value)) handler_.on_error("negative width");
-    return static_cast<unsigned long long>(value);
-  }
-
-  template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
-  FMT_CONSTEXPR unsigned long long operator()(T) {
-    handler_.on_error("width is not integer");
-    return 0;
-  }
-
- private:
-  ErrorHandler& handler_;
-};
-
-template <typename ErrorHandler> class precision_checker {
- public:
-  explicit FMT_CONSTEXPR precision_checker(ErrorHandler& eh) : handler_(eh) {}
-
-  template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
-  FMT_CONSTEXPR unsigned long long operator()(T value) {
-    if (is_negative(value)) handler_.on_error("negative precision");
-    return static_cast<unsigned long long>(value);
-  }
-
-  template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
-  FMT_CONSTEXPR unsigned long long operator()(T) {
-    handler_.on_error("precision is not integer");
-    return 0;
-  }
-
- private:
-  ErrorHandler& handler_;
-};
-
-// A format specifier handler that sets fields in basic_format_specs.
-template <typename Char> class specs_setter {
- public:
-  explicit FMT_CONSTEXPR specs_setter(basic_format_specs<Char>& specs)
-      : specs_(specs) {}
-
-  FMT_CONSTEXPR specs_setter(const specs_setter& other)
-      : specs_(other.specs_) {}
-
-  FMT_CONSTEXPR void on_align(align_t align) { specs_.align = align; }
-  FMT_CONSTEXPR void on_fill(Char fill) { specs_.fill[0] = fill; }
-  FMT_CONSTEXPR void on_plus() { specs_.sign = sign::plus; }
-  FMT_CONSTEXPR void on_minus() { specs_.sign = sign::minus; }
-  FMT_CONSTEXPR void on_space() { specs_.sign = sign::space; }
-  FMT_CONSTEXPR void on_hash() { specs_.alt = true; }
-
-  FMT_CONSTEXPR void on_zero() {
-    specs_.align = align::numeric;
-    specs_.fill[0] = Char('0');
-  }
-
-  FMT_CONSTEXPR void on_width(int width) { specs_.width = width; }
-  FMT_CONSTEXPR void on_precision(int precision) {
-    specs_.precision = precision;
-  }
-  FMT_CONSTEXPR void end_precision() {}
-
-  FMT_CONSTEXPR void on_type(Char type) {
-    specs_.type = static_cast<char>(type);
-  }
-
- protected:
-  basic_format_specs<Char>& specs_;
-};
-
-template <typename ErrorHandler> class numeric_specs_checker {
- public:
-  FMT_CONSTEXPR numeric_specs_checker(ErrorHandler& eh, internal::type arg_type)
-      : error_handler_(eh), arg_type_(arg_type) {}
-
-  FMT_CONSTEXPR void require_numeric_argument() {
-    if (!is_arithmetic(arg_type_))
-      error_handler_.on_error("format specifier requires numeric argument");
-  }
-
-  FMT_CONSTEXPR void check_sign() {
-    require_numeric_argument();
-    if (is_integral(arg_type_) && arg_type_ != int_type &&
-        arg_type_ != long_long_type && arg_type_ != internal::char_type) {
-      error_handler_.on_error("format specifier requires signed argument");
-    }
-  }
-
-  FMT_CONSTEXPR void check_precision() {
-    if (is_integral(arg_type_) || arg_type_ == internal::pointer_type)
-      error_handler_.on_error("precision not allowed for this argument type");
-  }
-
- private:
-  ErrorHandler& error_handler_;
-  internal::type arg_type_;
-};
-
-// A format specifier handler that checks if specifiers are consistent with the
-// argument type.
-template <typename Handler> class specs_checker : public Handler {
- public:
-  FMT_CONSTEXPR specs_checker(const Handler& handler, internal::type arg_type)
-      : Handler(handler), checker_(*this, arg_type) {}
-
-  FMT_CONSTEXPR specs_checker(const specs_checker& other)
-      : Handler(other), checker_(*this, other.arg_type_) {}
-
-  FMT_CONSTEXPR void on_align(align_t align) {
-    if (align == align::numeric) checker_.require_numeric_argument();
-    Handler::on_align(align);
-  }
-
-  FMT_CONSTEXPR void on_plus() {
-    checker_.check_sign();
-    Handler::on_plus();
-  }
-
-  FMT_CONSTEXPR void on_minus() {
-    checker_.check_sign();
-    Handler::on_minus();
-  }
-
-  FMT_CONSTEXPR void on_space() {
-    checker_.check_sign();
-    Handler::on_space();
-  }
-
-  FMT_CONSTEXPR void on_hash() {
-    checker_.require_numeric_argument();
-    Handler::on_hash();
-  }
-
-  FMT_CONSTEXPR void on_zero() {
-    checker_.require_numeric_argument();
-    Handler::on_zero();
-  }
-
-  FMT_CONSTEXPR void end_precision() { checker_.check_precision(); }
-
- private:
-  numeric_specs_checker<Handler> checker_;
-};
-
-template <template <typename> class Handler, typename T, typename FormatArg,
-          typename ErrorHandler>
-FMT_CONSTEXPR void set_dynamic_spec(T& value, FormatArg arg, ErrorHandler eh) {
-  unsigned long long big_value =
-      visit_format_arg(Handler<ErrorHandler>(eh), arg);
-  if (big_value > to_unsigned((std::numeric_limits<int>::max)()))
-    eh.on_error("number is too big");
-  value = static_cast<T>(big_value);
-}
-
-struct auto_id {};
-
-template <typename Context>
-FMT_CONSTEXPR typename Context::format_arg get_arg(Context& ctx, int id) {
-  auto arg = ctx.arg(id);
-  if (!arg) ctx.on_error("argument index out of range");
-  return arg;
-}
-
-// The standard format specifier handler with checking.
-template <typename ParseContext, typename Context>
-class specs_handler : public specs_setter<typename Context::char_type> {
- public:
-  using char_type = typename Context::char_type;
-
-  FMT_CONSTEXPR specs_handler(basic_format_specs<char_type>& specs,
-                              ParseContext& parse_ctx, Context& ctx)
-      : specs_setter<char_type>(specs),
-        parse_context_(parse_ctx),
-        context_(ctx) {}
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
-    set_dynamic_spec<width_checker>(this->specs_.width, get_arg(arg_id),
-                                    context_.error_handler());
-  }
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
-    set_dynamic_spec<precision_checker>(this->specs_.precision, get_arg(arg_id),
-                                        context_.error_handler());
-  }
-
-  void on_error(const char* message) { context_.on_error(message); }
-
- private:
-  // This is only needed for compatibility with gcc 4.4.
-  using format_arg = typename Context::format_arg;
-
-  FMT_CONSTEXPR format_arg get_arg(auto_id) {
-    return internal::get_arg(context_, parse_context_.next_arg_id());
-  }
-
-  FMT_CONSTEXPR format_arg get_arg(int arg_id) {
-    parse_context_.check_arg_id(arg_id);
-    return internal::get_arg(context_, arg_id);
-  }
-
-  FMT_CONSTEXPR format_arg get_arg(basic_string_view<char_type> arg_id) {
-    parse_context_.check_arg_id(arg_id);
-    return context_.arg(arg_id);
-  }
-
-  ParseContext& parse_context_;
-  Context& context_;
-};
-
-struct string_view_metadata {
-  FMT_CONSTEXPR string_view_metadata() : offset_(0u), size_(0u) {}
-  template <typename Char>
-  FMT_CONSTEXPR string_view_metadata(basic_string_view<Char> primary_string,
-                                     basic_string_view<Char> view)
-      : offset_(to_unsigned(view.data() - primary_string.data())),
-        size_(view.size()) {}
-  FMT_CONSTEXPR string_view_metadata(std::size_t offset, std::size_t size)
-      : offset_(offset), size_(size) {}
-  template <typename Char>
-  FMT_CONSTEXPR basic_string_view<Char> to_view(const Char* str) const {
-    return {str + offset_, size_};
-  }
-
-  std::size_t offset_;
-  std::size_t size_;
-};
-
-enum class arg_id_kind { none, index, name };
-
-// An argument reference.
-template <typename Char> struct arg_ref {
-  FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {}
-  FMT_CONSTEXPR explicit arg_ref(int index)
-      : kind(arg_id_kind::index), val(index) {}
-  FMT_CONSTEXPR explicit arg_ref(string_view_metadata name)
-      : kind(arg_id_kind::name), val(name) {}
-
-  FMT_CONSTEXPR arg_ref& operator=(int idx) {
-    kind = arg_id_kind::index;
-    val.index = idx;
-    return *this;
-  }
-
-  arg_id_kind kind;
-  union value {
-    FMT_CONSTEXPR value() : index(0u) {}
-    FMT_CONSTEXPR value(int id) : index(id) {}
-    FMT_CONSTEXPR value(string_view_metadata n) : name(n) {}
-
-    int index;
-    string_view_metadata name;
-  } val;
-};
-
-// Format specifiers with width and precision resolved at formatting rather
-// than parsing time to allow re-using the same parsed specifiers with
-// different sets of arguments (precompilation of format strings).
-template <typename Char>
-struct dynamic_format_specs : basic_format_specs<Char> {
-  arg_ref<Char> width_ref;
-  arg_ref<Char> precision_ref;
-};
-
-// Format spec handler that saves references to arguments representing dynamic
-// width and precision to be resolved at formatting time.
-template <typename ParseContext>
-class dynamic_specs_handler
-    : public specs_setter<typename ParseContext::char_type> {
- public:
-  using char_type = typename ParseContext::char_type;
-
-  FMT_CONSTEXPR dynamic_specs_handler(dynamic_format_specs<char_type>& specs,
-                                      ParseContext& ctx)
-      : specs_setter<char_type>(specs), specs_(specs), context_(ctx) {}
-
-  FMT_CONSTEXPR dynamic_specs_handler(const dynamic_specs_handler& other)
-      : specs_setter<char_type>(other),
-        specs_(other.specs_),
-        context_(other.context_) {}
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
-    specs_.width_ref = make_arg_ref(arg_id);
-  }
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
-    specs_.precision_ref = make_arg_ref(arg_id);
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    context_.on_error(message);
-  }
-
- private:
-  using arg_ref_type = arg_ref<char_type>;
-
-  FMT_CONSTEXPR arg_ref_type make_arg_ref(int arg_id) {
-    context_.check_arg_id(arg_id);
-    return arg_ref_type(arg_id);
-  }
-
-  FMT_CONSTEXPR arg_ref_type make_arg_ref(auto_id) {
-    return arg_ref_type(context_.next_arg_id());
-  }
-
-  FMT_CONSTEXPR arg_ref_type make_arg_ref(basic_string_view<char_type> arg_id) {
-    context_.check_arg_id(arg_id);
-    basic_string_view<char_type> format_str(
-        context_.begin(), to_unsigned(context_.end() - context_.begin()));
-    const auto id_metadata = string_view_metadata(format_str, arg_id);
-    return arg_ref_type(id_metadata);
-  }
-
-  dynamic_format_specs<char_type>& specs_;
-  ParseContext& context_;
-};
-
-template <typename Char, typename IDHandler>
-FMT_CONSTEXPR const Char* parse_arg_id(const Char* begin, const Char* end,
-                                       IDHandler&& handler) {
-  assert(begin != end);
-  Char c = *begin;
-  if (c == '}' || c == ':') return handler(), begin;
-  if (c >= '0' && c <= '9') {
-    int index = parse_nonnegative_int(begin, end, handler);
-    if (begin == end || (*begin != '}' && *begin != ':'))
-      return handler.on_error("invalid format string"), begin;
-    handler(index);
-    return begin;
-  }
-  if (!is_name_start(c))
-    return handler.on_error("invalid format string"), begin;
-  auto it = begin;
-  do {
-    ++it;
-  } while (it != end && (is_name_start(c = *it) || ('0' <= c && c <= '9')));
-  handler(basic_string_view<Char>(begin, to_unsigned(it - begin)));
-  return it;
-}
-
-// Adapts SpecHandler to IDHandler API for dynamic width.
-template <typename SpecHandler, typename Char> struct width_adapter {
-  explicit FMT_CONSTEXPR width_adapter(SpecHandler& h) : handler(h) {}
-
-  FMT_CONSTEXPR void operator()() { handler.on_dynamic_width(auto_id()); }
-  FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_width(id); }
-  FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
-    handler.on_dynamic_width(id);
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    handler.on_error(message);
-  }
-
-  SpecHandler& handler;
-};
-
-// Adapts SpecHandler to IDHandler API for dynamic precision.
-template <typename SpecHandler, typename Char> struct precision_adapter {
-  explicit FMT_CONSTEXPR precision_adapter(SpecHandler& h) : handler(h) {}
-
-  FMT_CONSTEXPR void operator()() { handler.on_dynamic_precision(auto_id()); }
-  FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_precision(id); }
-  FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
-    handler.on_dynamic_precision(id);
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    handler.on_error(message);
-  }
-
-  SpecHandler& handler;
-};
-
-// Parses fill and alignment.
-template <typename Char, typename Handler>
-FMT_CONSTEXPR const Char* parse_align(const Char* begin, const Char* end,
-                                      Handler&& handler) {
-  FMT_ASSERT(begin != end, "");
-  auto align = align::none;
-  int i = 0;
-  if (begin + 1 != end) ++i;
-  do {
-    switch (static_cast<char>(begin[i])) {
-    case '<':
-      align = align::left;
-      break;
-    case '>':
-      align = align::right;
-      break;
-    case '=':
-      align = align::numeric;
-      break;
-    case '^':
-      align = align::center;
-      break;
-    }
-    if (align != align::none) {
-      if (i > 0) {
-        auto c = *begin;
-        if (c == '{')
-          return handler.on_error("invalid fill character '{'"), begin;
-        begin += 2;
-        handler.on_fill(c);
-      } else
-        ++begin;
-      handler.on_align(align);
-      break;
-    }
-  } while (i-- > 0);
-  return begin;
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR const Char* parse_width(const Char* begin, const Char* end,
-                                      Handler&& handler) {
-  FMT_ASSERT(begin != end, "");
-  if ('0' <= *begin && *begin <= '9') {
-    handler.on_width(parse_nonnegative_int(begin, end, handler));
-  } else if (*begin == '{') {
-    ++begin;
-    if (begin != end)
-      begin = parse_arg_id(begin, end, width_adapter<Handler, Char>(handler));
-    if (begin == end || *begin != '}')
-      return handler.on_error("invalid format string"), begin;
-    ++begin;
-  }
-  return begin;
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR const Char* parse_precision(const Char* begin, const Char* end,
-                                          Handler&& handler) {
-  ++begin;
-  auto c = begin != end ? *begin : Char();
-  if ('0' <= c && c <= '9') {
-    handler.on_precision(parse_nonnegative_int(begin, end, handler));
-  } else if (c == '{') {
-    ++begin;
-    if (begin != end) {
-      begin =
-          parse_arg_id(begin, end, precision_adapter<Handler, Char>(handler));
-    }
-    if (begin == end || *begin++ != '}')
-      return handler.on_error("invalid format string"), begin;
-  } else {
-    return handler.on_error("missing precision specifier"), begin;
-  }
-  handler.end_precision();
-  return begin;
-}
-
-// Parses standard format specifiers and sends notifications about parsed
-// components to handler.
-template <typename Char, typename SpecHandler>
-FMT_CONSTEXPR const Char* parse_format_specs(const Char* begin, const Char* end,
-                                             SpecHandler&& handler) {
-  if (begin == end || *begin == '}') return begin;
-
-  begin = parse_align(begin, end, handler);
-  if (begin == end) return begin;
-
-  // Parse sign.
-  switch (static_cast<char>(*begin)) {
-  case '+':
-    handler.on_plus();
-    ++begin;
-    break;
-  case '-':
-    handler.on_minus();
-    ++begin;
-    break;
-  case ' ':
-    handler.on_space();
-    ++begin;
-    break;
-  }
-  if (begin == end) return begin;
-
-  if (*begin == '#') {
-    handler.on_hash();
-    if (++begin == end) return begin;
-  }
-
-  // Parse zero flag.
-  if (*begin == '0') {
-    handler.on_zero();
-    if (++begin == end) return begin;
-  }
-
-  begin = parse_width(begin, end, handler);
-  if (begin == end) return begin;
-
-  // Parse precision.
-  if (*begin == '.') {
-    begin = parse_precision(begin, end, handler);
-  }
-
-  // Parse type.
-  if (begin != end && *begin != '}') handler.on_type(*begin++);
-  return begin;
-}
-
-// Return the result via the out param to workaround gcc bug 77539.
-template <bool IS_CONSTEXPR, typename T, typename Ptr = const T*>
-FMT_CONSTEXPR bool find(Ptr first, Ptr last, T value, Ptr& out) {
-  for (out = first; out != last; ++out) {
-    if (*out == value) return true;
-  }
-  return false;
-}
-
-template <>
-inline bool find<false, char>(const char* first, const char* last, char value,
-                              const char*& out) {
-  out = static_cast<const char*>(
-      std::memchr(first, value, internal::to_unsigned(last - first)));
-  return out != nullptr;
-}
-
-template <typename Handler, typename Char> struct id_adapter {
-  FMT_CONSTEXPR void operator()() { handler.on_arg_id(); }
-  FMT_CONSTEXPR void operator()(int id) { handler.on_arg_id(id); }
-  FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
-    handler.on_arg_id(id);
-  }
-  FMT_CONSTEXPR void on_error(const char* message) {
-    handler.on_error(message);
-  }
-  Handler& handler;
-};
-
-template <bool IS_CONSTEXPR, typename Char, typename Handler>
-FMT_CONSTEXPR void parse_format_string(basic_string_view<Char> format_str,
-                                       Handler&& handler) {
-  struct pfs_writer {
-    FMT_CONSTEXPR void operator()(const Char* begin, const Char* end) {
-      if (begin == end) return;
-      for (;;) {
-        const Char* p = nullptr;
-        if (!find<IS_CONSTEXPR>(begin, end, '}', p))
-          return handler_.on_text(begin, end);
-        ++p;
-        if (p == end || *p != '}')
-          return handler_.on_error("unmatched '}' in format string");
-        handler_.on_text(begin, p);
-        begin = p + 1;
-      }
-    }
-    Handler& handler_;
-  } write{handler};
-  auto begin = format_str.data();
-  auto end = begin + format_str.size();
-  while (begin != end) {
-    // Doing two passes with memchr (one for '{' and another for '}') is up to
-    // 2.5x faster than the naive one-pass implementation on big format strings.
-    const Char* p = begin;
-    if (*begin != '{' && !find<IS_CONSTEXPR>(begin, end, '{', p))
-      return write(begin, end);
-    write(begin, p);
-    ++p;
-    if (p == end) return handler.on_error("invalid format string");
-    if (static_cast<char>(*p) == '}') {
-      handler.on_arg_id();
-      handler.on_replacement_field(p);
-    } else if (*p == '{') {
-      handler.on_text(p, p + 1);
-    } else {
-      p = parse_arg_id(p, end, id_adapter<Handler, Char>{handler});
-      Char c = p != end ? *p : Char();
-      if (c == '}') {
-        handler.on_replacement_field(p);
-      } else if (c == ':') {
-        p = handler.on_format_specs(p + 1, end);
-        if (p == end || *p != '}')
-          return handler.on_error("unknown format specifier");
-      } else {
-        return handler.on_error("missing '}' in format string");
-      }
-    }
-    begin = p + 1;
-  }
-}
-
-template <typename T, typename ParseContext>
-FMT_CONSTEXPR const typename ParseContext::char_type* parse_format_specs(
-    ParseContext& ctx) {
-  using char_type = typename ParseContext::char_type;
-  using context = buffer_context<char_type>;
-  using mapped_type =
-      conditional_t<internal::mapped_type_constant<T, context>::value !=
-                        internal::custom_type,
-                    decltype(arg_mapper<context>().map(std::declval<T>())), T>;
-  conditional_t<has_formatter<mapped_type, context>::value,
-                formatter<mapped_type, char_type>,
-                internal::fallback_formatter<T, char_type>>
-      f;
-  return f.parse(ctx);
-}
-
-template <typename Char, typename ErrorHandler, typename... Args>
-class format_string_checker {
- public:
-  explicit FMT_CONSTEXPR format_string_checker(
-      basic_string_view<Char> format_str, ErrorHandler eh)
-      : arg_id_((std::numeric_limits<unsigned>::max)()),
-        context_(format_str, eh),
-        parse_funcs_{&parse_format_specs<Args, parse_context_type>...} {}
-
-  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
-
-  FMT_CONSTEXPR void on_arg_id() {
-    arg_id_ = context_.next_arg_id();
-    check_arg_id();
-  }
-  FMT_CONSTEXPR void on_arg_id(int id) {
-    arg_id_ = id;
-    context_.check_arg_id(id);
-    check_arg_id();
-  }
-  FMT_CONSTEXPR void on_arg_id(basic_string_view<Char>) {
-    on_error("compile-time checks don't support named arguments");
-  }
-
-  FMT_CONSTEXPR void on_replacement_field(const Char*) {}
-
-  FMT_CONSTEXPR const Char* on_format_specs(const Char* begin, const Char*) {
-    advance_to(context_, begin);
-    return arg_id_ < num_args ? parse_funcs_[arg_id_](context_) : begin;
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    context_.on_error(message);
-  }
-
- private:
-  using parse_context_type = basic_parse_context<Char, ErrorHandler>;
-  enum { num_args = sizeof...(Args) };
-
-  FMT_CONSTEXPR void check_arg_id() {
-    if (arg_id_ >= num_args) context_.on_error("argument index out of range");
-  }
-
-  // Format specifier parsing function.
-  using parse_func = const Char* (*)(parse_context_type&);
-
-  unsigned arg_id_;
-  parse_context_type context_;
-  parse_func parse_funcs_[num_args > 0 ? num_args : 1];
-};
-
-template <typename Char, typename ErrorHandler, typename... Args>
-FMT_CONSTEXPR bool do_check_format_string(basic_string_view<Char> s,
-                                          ErrorHandler eh = ErrorHandler()) {
-  format_string_checker<Char, ErrorHandler, Args...> checker(s, eh);
-  parse_format_string<true>(s, checker);
-  return true;
-}
-
-template <typename... Args, typename S,
-          enable_if_t<(is_compile_string<S>::value), int>>
-void check_format_string(S format_str) {
-  FMT_CONSTEXPR_DECL bool invalid_format =
-      internal::do_check_format_string<typename S::char_type,
-                                       internal::error_handler, Args...>(
-          to_string_view(format_str));
-  (void)invalid_format;
-}
-
-template <template <typename> class Handler, typename Spec, typename Context>
-void handle_dynamic_spec(Spec& value, arg_ref<typename Context::char_type> ref,
-                         Context& ctx,
-                         const typename Context::char_type* format_str) {
-  switch (ref.kind) {
-  case arg_id_kind::none:
-    break;
-  case arg_id_kind::index:
-    internal::set_dynamic_spec<Handler>(value, ctx.arg(ref.val.index),
-                                        ctx.error_handler());
-    break;
-  case arg_id_kind::name: {
-    const auto arg_id = ref.val.name.to_view(format_str);
-    internal::set_dynamic_spec<Handler>(value, ctx.arg(arg_id),
-                                        ctx.error_handler());
-    break;
-  }
-  }
-}
-}  // namespace internal
-
-template <typename Range>
-using basic_writer FMT_DEPRECATED_ALIAS = internal::basic_writer<Range>;
-using writer FMT_DEPRECATED_ALIAS = internal::writer;
-using wwriter FMT_DEPRECATED_ALIAS =
-    internal::basic_writer<internal::buffer_range<wchar_t>>;
-
-/** The default argument formatter. */
-template <typename Range>
-class arg_formatter : public internal::arg_formatter_base<Range> {
- private:
-  using char_type = typename Range::value_type;
-  using base = internal::arg_formatter_base<Range>;
-  using context_type = basic_format_context<typename base::iterator, char_type>;
-
-  context_type& ctx_;
-  basic_parse_context<char_type>* parse_ctx_;
-
- public:
-  using range = Range;
-  using iterator = typename base::iterator;
-  using format_specs = typename base::format_specs;
-
-  /**
-    \rst
-    Constructs an argument formatter object.
-    *ctx* is a reference to the formatting context,
-    *specs* contains format specifier information for standard argument types.
-    \endrst
-   */
-  explicit arg_formatter(context_type& ctx,
-                         basic_parse_context<char_type>* parse_ctx = nullptr,
-                         format_specs* specs = nullptr)
-      : base(Range(ctx.out()), specs, ctx.locale()),
-        ctx_(ctx),
-        parse_ctx_(parse_ctx) {}
-
-  using base::operator();
-
-  /** Formats an argument of a user-defined type. */
-  iterator operator()(typename basic_format_arg<context_type>::handle handle) {
-    handle.format(*parse_ctx_, ctx_);
-    return this->out();
-  }
-};
-
-/**
- An error returned by an operating system or a language runtime,
- for example a file opening error.
-*/
-class FMT_API system_error : public std::runtime_error {
- private:
-  void init(int err_code, string_view format_str, format_args args);
-
- protected:
-  int error_code_;
-
-  system_error() : std::runtime_error(""), error_code_(0) {}
-
- public:
-  /**
-   \rst
-   Constructs a :class:`fmt::system_error` object with a description
-   formatted with `fmt::format_system_error`. *message* and additional
-   arguments passed into the constructor are formatted similarly to
-   `fmt::format`.
-
-   **Example**::
-
-     // This throws a system_error with the description
-     //   cannot open file 'madeup': No such file or directory
-     // or similar (system message may vary).
-     const char *filename = "madeup";
-     std::FILE *file = std::fopen(filename, "r");
-     if (!file)
-       throw fmt::system_error(errno, "cannot open file '{}'", filename);
-   \endrst
-  */
-  template <typename... Args>
-  system_error(int error_code, string_view message, const Args&... args)
-      : std::runtime_error("") {
-    init(error_code, message, make_format_args(args...));
-  }
-  ~system_error() FMT_NOEXCEPT;
-
-  int error_code() const { return error_code_; }
-};
-
-/**
-  \rst
-  Formats an error returned by an operating system or a language runtime,
-  for example a file opening error, and writes it to *out* in the following
-  form:
-
-  .. parsed-literal::
-     *<message>*: *<system-message>*
-
-  where *<message>* is the passed message and *<system-message>* is
-  the system message corresponding to the error code.
-  *error_code* is a system error code as given by ``errno``.
-  If *error_code* is not a valid error code such as -1, the system message
-  may look like "Unknown error -1" and is platform-dependent.
-  \endrst
- */
-FMT_API void format_system_error(internal::buffer<char>& out, int error_code,
-                                 fmt::string_view message) FMT_NOEXCEPT;
-
-struct float_spec_handler {
-  char type;
-  bool upper;
-  bool fixed;
-  bool as_percentage;
-  bool use_locale;
-
-  explicit float_spec_handler(char t)
-      : type(t),
-        upper(false),
-        fixed(false),
-        as_percentage(false),
-        use_locale(false) {}
-
-  void on_general() {
-    if (type == 'G') upper = true;
-  }
-
-  void on_exp() {
-    if (type == 'E') upper = true;
-  }
-
-  void on_fixed() {
-    fixed = true;
-    if (type == 'F') upper = true;
-  }
-
-  void on_percent() {
-    fixed = true;
-    as_percentage = true;
-  }
-
-  void on_hex() {
-    if (type == 'A') upper = true;
-  }
-
-  void on_num() { use_locale = true; }
-
-  FMT_NORETURN void on_error() {
-    FMT_THROW(format_error("invalid type specifier"));
-  }
-};
-
-template <typename Range>
-template <typename T, bool USE_GRISU>
-void internal::basic_writer<Range>::write_double(T value,
-                                                 const format_specs& specs) {
-  // Check type.
-  float_spec_handler handler(static_cast<char>(specs.type));
-  internal::handle_float_type_spec(handler.type, handler);
-
-  char sign = 0;
-  // Use signbit instead of value < 0 since the latter is always false for NaN.
-  if (std::signbit(value)) {
-    sign = '-';
-    value = -value;
-  } else if (specs.sign != sign::none) {
-    if (specs.sign == sign::plus)
-      sign = '+';
-    else if (specs.sign == sign::space)
-      sign = ' ';
-  }
-
-  if (!std::isfinite(value)) {
-    // Format infinity and NaN ourselves because sprintf's output is not
-    // consistent across platforms.
-    const char* str = std::isinf(value) ? (handler.upper ? "INF" : "inf")
-                                        : (handler.upper ? "NAN" : "nan");
-    return write_padded(specs,
-                        inf_or_nan_writer{sign, handler.as_percentage, str});
-  }
-
-  if (handler.as_percentage) value *= 100;
-
-  memory_buffer buffer;
-  int exp = 0;
-  int precision = specs.precision >= 0 || !specs.type ? specs.precision : 6;
-  unsigned options = handler.fixed ? internal::grisu_options::fixed : 0;
-  bool use_grisu = USE_GRISU &&
-                   (specs.type != 'a' && specs.type != 'A' &&
-                    specs.type != 'e' && specs.type != 'E') &&
-                   internal::grisu_format(static_cast<double>(value), buffer,
-                                          precision, options, exp);
-  char* decimal_point_pos = nullptr;
-  if (!use_grisu)
-    decimal_point_pos = internal::sprintf_format(value, buffer, specs);
-
-  if (handler.as_percentage) {
-    buffer.push_back('%');
-    --exp;  // Adjust decimal place position.
-  }
-  format_specs as = specs;
-  if (specs.align == align::numeric) {
-    if (sign) {
-      auto&& it = reserve(1);
-      *it++ = static_cast<char_type>(sign);
-      sign = 0;
-      if (as.width) --as.width;
-    }
-    as.align = align::right;
-  } else if (specs.align == align::none) {
-    as.align = align::right;
-  }
-  char_type decimal_point = handler.use_locale
-                                ? internal::decimal_point<char_type>(locale_)
-                                : static_cast<char_type>('.');
-  if (use_grisu) {
-    auto params = internal::gen_digits_params();
-    params.fixed = handler.fixed;
-    params.num_digits = precision;
-    params.trailing_zeros =
-        (precision != 0 && (handler.fixed || !specs.type)) || specs.alt;
-    write_padded(as, grisu_writer(sign, buffer, exp, params, decimal_point));
-  } else {
-    write_padded(as,
-                 double_writer{sign, buffer, decimal_point_pos, decimal_point});
-  }
-}
-
-// Reports a system error without throwing an exception.
-// Can be used to report errors from destructors.
-FMT_API void report_system_error(int error_code,
-                                 string_view message) FMT_NOEXCEPT;
-
-#if FMT_USE_WINDOWS_H
-
-/** A Windows error. */
-class windows_error : public system_error {
- private:
-  FMT_API void init(int error_code, string_view format_str, format_args args);
-
- public:
-  /**
-   \rst
-   Constructs a :class:`fmt::windows_error` object with the description
-   of the form
-
-   .. parsed-literal::
-     *<message>*: *<system-message>*
-
-   where *<message>* is the formatted message and *<system-message>* is the
-   system message corresponding to the error code.
-   *error_code* is a Windows error code as given by ``GetLastError``.
-   If *error_code* is not a valid error code such as -1, the system message
-   will look like "error -1".
-
-   **Example**::
-
-     // This throws a windows_error with the description
-     //   cannot open file 'madeup': The system cannot find the file specified.
-     // or similar (system message may vary).
-     const char *filename = "madeup";
-     LPOFSTRUCT of = LPOFSTRUCT();
-     HFILE file = OpenFile(filename, &of, OF_READ);
-     if (file == HFILE_ERROR) {
-       throw fmt::windows_error(GetLastError(),
-                                "cannot open file '{}'", filename);
-     }
-   \endrst
-  */
-  template <typename... Args>
-  windows_error(int error_code, string_view message, const Args&... args) {
-    init(error_code, message, make_format_args(args...));
-  }
-};
-
-// Reports a Windows error without throwing an exception.
-// Can be used to report errors from destructors.
-FMT_API void report_windows_error(int error_code,
-                                  string_view message) FMT_NOEXCEPT;
-
-#endif
-
-/** Fast integer formatter. */
-class format_int {
- private:
-  // Buffer should be large enough to hold all digits (digits10 + 1),
-  // a sign and a null character.
-  enum { buffer_size = std::numeric_limits<unsigned long long>::digits10 + 3 };
-  mutable char buffer_[buffer_size];
-  char* str_;
-
-  // Formats value in reverse and returns a pointer to the beginning.
-  char* format_decimal(unsigned long long value) {
-    char* ptr = buffer_ + (buffer_size - 1);  // Parens to workaround MSVC bug.
-    while (value >= 100) {
-      // Integer division is slow so do it for a group of two digits instead
-      // of for every digit. The idea comes from the talk by Alexandrescu
-      // "Three Optimization Tips for C++". See speed-test for a comparison.
-      unsigned index = static_cast<unsigned>((value % 100) * 2);
-      value /= 100;
-      *--ptr = internal::data::digits[index + 1];
-      *--ptr = internal::data::digits[index];
-    }
-    if (value < 10) {
-      *--ptr = static_cast<char>('0' + value);
-      return ptr;
-    }
-    unsigned index = static_cast<unsigned>(value * 2);
-    *--ptr = internal::data::digits[index + 1];
-    *--ptr = internal::data::digits[index];
-    return ptr;
-  }
-
-  void format_signed(long long value) {
-    unsigned long long abs_value = static_cast<unsigned long long>(value);
-    bool negative = value < 0;
-    if (negative) abs_value = 0 - abs_value;
-    str_ = format_decimal(abs_value);
-    if (negative) *--str_ = '-';
-  }
-
- public:
-  explicit format_int(int value) { format_signed(value); }
-  explicit format_int(long value) { format_signed(value); }
-  explicit format_int(long long value) { format_signed(value); }
-  explicit format_int(unsigned value) : str_(format_decimal(value)) {}
-  explicit format_int(unsigned long value) : str_(format_decimal(value)) {}
-  explicit format_int(unsigned long long value) : str_(format_decimal(value)) {}
-
-  /** Returns the number of characters written to the output buffer. */
-  std::size_t size() const {
-    return internal::to_unsigned(buffer_ - str_ + buffer_size - 1);
-  }
-
-  /**
-    Returns a pointer to the output buffer content. No terminating null
-    character is appended.
-   */
-  const char* data() const { return str_; }
-
-  /**
-    Returns a pointer to the output buffer content with terminating null
-    character appended.
-   */
-  const char* c_str() const {
-    buffer_[buffer_size - 1] = '\0';
-    return str_;
-  }
-
-  /**
-    \rst
-    Returns the content of the output buffer as an ``std::string``.
-    \endrst
-   */
-  std::string str() const { return std::string(str_, size()); }
-};
-
-// A formatter specialization for the core types corresponding to internal::type
-// constants.
-template <typename T, typename Char>
-struct formatter<T, Char,
-                 enable_if_t<internal::type_constant<T, Char>::value !=
-                             internal::custom_type>> {
-  FMT_CONSTEXPR formatter() : format_str_(nullptr) {}
-
-  // Parses format specifiers stopping either at the end of the range or at the
-  // terminating '}'.
-  template <typename ParseContext>
-  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    format_str_ = ctx.begin();
-    using handler_type = internal::dynamic_specs_handler<ParseContext>;
-    auto type = internal::type_constant<T, Char>::value;
-    internal::specs_checker<handler_type> handler(handler_type(specs_, ctx),
-                                                  type);
-    auto it = parse_format_specs(ctx.begin(), ctx.end(), handler);
-    auto eh = ctx.error_handler();
-    switch (type) {
-    case internal::none_type:
-    case internal::named_arg_type:
-      FMT_ASSERT(false, "invalid argument type");
-      break;
-    case internal::int_type:
-    case internal::uint_type:
-    case internal::long_long_type:
-    case internal::ulong_long_type:
-    case internal::bool_type:
-      handle_int_type_spec(specs_.type,
-                           internal::int_type_checker<decltype(eh)>(eh));
-      break;
-    case internal::char_type:
-      handle_char_specs(
-          &specs_, internal::char_specs_checker<decltype(eh)>(specs_.type, eh));
-      break;
-    case internal::double_type:
-    case internal::long_double_type:
-      handle_float_type_spec(specs_.type,
-                             internal::float_type_checker<decltype(eh)>(eh));
-      break;
-    case internal::cstring_type:
-      internal::handle_cstring_type_spec(
-          specs_.type, internal::cstring_type_checker<decltype(eh)>(eh));
-      break;
-    case internal::string_type:
-      internal::check_string_type_spec(specs_.type, eh);
-      break;
-    case internal::pointer_type:
-      internal::check_pointer_type_spec(specs_.type, eh);
-      break;
-    case internal::custom_type:
-      // Custom format specifiers should be checked in parse functions of
-      // formatter specializations.
-      break;
-    }
-    return it;
-  }
-
-  template <typename FormatContext>
-  auto format(const T& val, FormatContext& ctx) -> decltype(ctx.out()) {
-    internal::handle_dynamic_spec<internal::width_checker>(
-        specs_.width, specs_.width_ref, ctx, format_str_);
-    internal::handle_dynamic_spec<internal::precision_checker>(
-        specs_.precision, specs_.precision_ref, ctx, format_str_);
-    using range_type =
-        internal::output_range<typename FormatContext::iterator,
-                               typename FormatContext::char_type>;
-    return visit_format_arg(arg_formatter<range_type>(ctx, nullptr, &specs_),
-                            internal::make_arg<FormatContext>(val));
-  }
-
- private:
-  internal::dynamic_format_specs<Char> specs_;
-  const Char* format_str_;
-};
-
-#define FMT_FORMAT_AS(Type, Base)                                             \
-  template <typename Char>                                                    \
-  struct formatter<Type, Char> : formatter<Base, Char> {                      \
-    template <typename FormatContext>                                         \
-    auto format(const Type& val, FormatContext& ctx) -> decltype(ctx.out()) { \
-      return formatter<Base, Char>::format(val, ctx);                         \
-    }                                                                         \
-  }
-
-FMT_FORMAT_AS(signed char, int);
-FMT_FORMAT_AS(unsigned char, unsigned);
-FMT_FORMAT_AS(short, int);
-FMT_FORMAT_AS(unsigned short, unsigned);
-FMT_FORMAT_AS(long, long long);
-FMT_FORMAT_AS(unsigned long, unsigned long long);
-FMT_FORMAT_AS(float, double);
-FMT_FORMAT_AS(Char*, const Char*);
-FMT_FORMAT_AS(std::basic_string<Char>, basic_string_view<Char>);
-FMT_FORMAT_AS(std::nullptr_t, const void*);
-FMT_FORMAT_AS(internal::std_string_view<Char>, basic_string_view<Char>);
-
-template <typename Char>
-struct formatter<void*, Char> : formatter<const void*, Char> {
-  template <typename FormatContext>
-  auto format(void* val, FormatContext& ctx) -> decltype(ctx.out()) {
-    return formatter<const void*, Char>::format(val, ctx);
-  }
-};
-
-template <typename Char, size_t N>
-struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {
-  template <typename FormatContext>
-  auto format(const Char* val, FormatContext& ctx) -> decltype(ctx.out()) {
-    return formatter<basic_string_view<Char>, Char>::format(val, ctx);
-  }
-};
-
-// A formatter for types known only at run time such as variant alternatives.
-//
-// Usage:
-//   using variant = std::variant<int, std::string>;
-//   template <>
-//   struct formatter<variant>: dynamic_formatter<> {
-//     void format(buffer &buf, const variant &v, context &ctx) {
-//       visit([&](const auto &val) { format(buf, val, ctx); }, v);
-//     }
-//   };
-template <typename Char = char> class dynamic_formatter {
- private:
-  struct null_handler : internal::error_handler {
-    void on_align(align_t) {}
-    void on_plus() {}
-    void on_minus() {}
-    void on_space() {}
-    void on_hash() {}
-  };
-
- public:
-  template <typename ParseContext>
-  auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    format_str_ = ctx.begin();
-    // Checks are deferred to formatting time when the argument type is known.
-    internal::dynamic_specs_handler<ParseContext> handler(specs_, ctx);
-    return parse_format_specs(ctx.begin(), ctx.end(), handler);
-  }
-
-  template <typename T, typename FormatContext>
-  auto format(const T& val, FormatContext& ctx) -> decltype(ctx.out()) {
-    handle_specs(ctx);
-    internal::specs_checker<null_handler> checker(
-        null_handler(),
-        internal::mapped_type_constant<T, FormatContext>::value);
-    checker.on_align(specs_.align);
-    switch (specs_.sign) {
-    case sign::none:
-      break;
-    case sign::plus:
-      checker.on_plus();
-      break;
-    case sign::minus:
-      checker.on_minus();
-      break;
-    case sign::space:
-      checker.on_space();
-      break;
-    }
-    if (specs_.alt) checker.on_hash();
-    if (specs_.precision >= 0) checker.end_precision();
-    using range = internal::output_range<typename FormatContext::iterator,
-                                         typename FormatContext::char_type>;
-    visit_format_arg(arg_formatter<range>(ctx, nullptr, &specs_),
-                     internal::make_arg<FormatContext>(val));
-    return ctx.out();
-  }
-
- private:
-  template <typename Context> void handle_specs(Context& ctx) {
-    internal::handle_dynamic_spec<internal::width_checker>(
-        specs_.width, specs_.width_ref, ctx, format_str_);
-    internal::handle_dynamic_spec<internal::precision_checker>(
-        specs_.precision, specs_.precision_ref, ctx, format_str_);
-  }
-
-  internal::dynamic_format_specs<Char> specs_;
-  const Char* format_str_;
-};
-
-template <typename Range, typename Char>
-typename basic_format_context<Range, Char>::format_arg
-basic_format_context<Range, Char>::arg(basic_string_view<char_type> name) {
-  map_.init(args_);
-  format_arg arg = map_.find(name);
-  if (arg.type() == internal::none_type) this->on_error("argument not found");
-  return arg;
-}
-
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR void advance_to(basic_parse_context<Char, ErrorHandler>& ctx,
-                              const Char* p) {
-  ctx.advance_to(ctx.begin() + (p - &*ctx.begin()));
-}
-
-template <typename ArgFormatter, typename Char, typename Context>
-struct format_handler : internal::error_handler {
-  using range = typename ArgFormatter::range;
-
-  format_handler(range r, basic_string_view<Char> str,
-                 basic_format_args<Context> format_args,
-                 internal::locale_ref loc)
-      : parse_context(str), context(r.begin(), format_args, loc) {}
-
-  void on_text(const Char* begin, const Char* end) {
-    auto size = internal::to_unsigned(end - begin);
-    auto out = context.out();
-    auto&& it = internal::reserve(out, size);
-    it = std::copy_n(begin, size, it);
-    context.advance_to(out);
-  }
-
-  void get_arg(int id) { arg = internal::get_arg(context, id); }
-
-  void on_arg_id() { get_arg(parse_context.next_arg_id()); }
-  void on_arg_id(int id) {
-    parse_context.check_arg_id(id);
-    get_arg(id);
-  }
-  void on_arg_id(basic_string_view<Char> id) { arg = context.arg(id); }
-
-  void on_replacement_field(const Char* p) {
-    advance_to(parse_context, p);
-    internal::custom_formatter<Context> f(parse_context, context);
-    if (!visit_format_arg(f, arg))
-      context.advance_to(
-          visit_format_arg(ArgFormatter(context, &parse_context), arg));
-  }
-
-  const Char* on_format_specs(const Char* begin, const Char* end) {
-    advance_to(parse_context, begin);
-    internal::custom_formatter<Context> f(parse_context, context);
-    if (visit_format_arg(f, arg)) return parse_context.begin();
-    basic_format_specs<Char> specs;
-    using internal::specs_handler;
-    using parse_context_t = basic_parse_context<Char>;
-    internal::specs_checker<specs_handler<parse_context_t, Context>> handler(
-        specs_handler<parse_context_t, Context>(specs, parse_context, context),
-        arg.type());
-    begin = parse_format_specs(begin, end, handler);
-    if (begin == end || *begin != '}') on_error("missing '}' in format string");
-    advance_to(parse_context, begin);
-    context.advance_to(
-        visit_format_arg(ArgFormatter(context, &parse_context, &specs), arg));
-    return begin;
-  }
-
-  basic_parse_context<Char> parse_context;
-  Context context;
-  basic_format_arg<Context> arg;
-};
-
-/** Formats arguments and writes the output to the range. */
-template <typename ArgFormatter, typename Char, typename Context>
-typename Context::iterator vformat_to(
-    typename ArgFormatter::range out, basic_string_view<Char> format_str,
-    basic_format_args<Context> args,
-    internal::locale_ref loc = internal::locale_ref()) {
-  format_handler<ArgFormatter, Char, Context> h(out, format_str, args, loc);
-  internal::parse_format_string<false>(format_str, h);
-  return h.context.out();
-}
-
-// Casts ``p`` to ``const void*`` for pointer formatting.
-// Example:
-//   auto s = format("{}", ptr(p));
-template <typename T> inline const void* ptr(const T* p) { return p; }
-template <typename T> inline const void* ptr(const std::unique_ptr<T>& p) {
-  return p.get();
-}
-template <typename T> inline const void* ptr(const std::shared_ptr<T>& p) {
-  return p.get();
-}
-
-template <typename It, typename Char> struct arg_join : internal::view {
-  It begin;
-  It end;
-  basic_string_view<Char> sep;
-
-  arg_join(It b, It e, basic_string_view<Char> s) : begin(b), end(e), sep(s) {}
-};
-
-template <typename It, typename Char>
-struct formatter<arg_join<It, Char>, Char>
-    : formatter<typename std::iterator_traits<It>::value_type, Char> {
-  template <typename FormatContext>
-  auto format(const arg_join<It, Char>& value, FormatContext& ctx)
-      -> decltype(ctx.out()) {
-    using base = formatter<typename std::iterator_traits<It>::value_type, Char>;
-    auto it = value.begin;
-    auto out = ctx.out();
-    if (it != value.end) {
-      out = base::format(*it++, ctx);
-      while (it != value.end) {
-        out = std::copy(value.sep.begin(), value.sep.end(), out);
-        ctx.advance_to(out);
-        out = base::format(*it++, ctx);
-      }
-    }
-    return out;
-  }
-};
-
-/**
-  Returns an object that formats the iterator range `[begin, end)` with elements
-  separated by `sep`.
- */
-template <typename It>
-arg_join<It, char> join(It begin, It end, string_view sep) {
-  return {begin, end, sep};
-}
-
-template <typename It>
-arg_join<It, wchar_t> join(It begin, It end, wstring_view sep) {
-  return {begin, end, sep};
-}
-
-/**
-  \rst
-  Returns an object that formats `range` with elements separated by `sep`.
-
-  **Example**::
-
-    std::vector<int> v = {1, 2, 3};
-    fmt::print("{}", fmt::join(v, ", "));
-    // Output: "1, 2, 3"
-  \endrst
- */
-template <typename Range>
-arg_join<internal::iterator_t<const Range>, char> join(const Range& range,
-                                                       string_view sep) {
-  return join(std::begin(range), std::end(range), sep);
-}
-
-template <typename Range>
-arg_join<internal::iterator_t<const Range>, wchar_t> join(const Range& range,
-                                                          wstring_view sep) {
-  return join(std::begin(range), std::end(range), sep);
-}
-
-/**
-  \rst
-  Converts *value* to ``std::string`` using the default format for type *T*.
-  It doesn't support user-defined types with custom formatters.
-
-  **Example**::
-
-    #include <fmt/format.h>
-
-    std::string answer = fmt::to_string(42);
-  \endrst
- */
-template <typename T> inline std::string to_string(const T& value) {
-  return format("{}", value);
-}
-
-/**
-  Converts *value* to ``std::wstring`` using the default format for type *T*.
- */
-template <typename T> inline std::wstring to_wstring(const T& value) {
-  return format(L"{}", value);
-}
-
-template <typename Char, std::size_t SIZE>
-std::basic_string<Char> to_string(const basic_memory_buffer<Char, SIZE>& buf) {
-  return std::basic_string<Char>(buf.data(), buf.size());
-}
-
-template <typename Char>
-typename buffer_context<Char>::iterator internal::vformat_to(
-    internal::buffer<Char>& buf, basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<Char>> args) {
-  using range = buffer_range<Char>;
-  return vformat_to<arg_formatter<range>>(buf, to_string_view(format_str),
-                                          args);
-}
-
-template <typename S, typename Char = char_t<S>,
-          FMT_ENABLE_IF(internal::is_string<S>::value)>
-inline typename buffer_context<Char>::iterator vformat_to(
-    internal::buffer<Char>& buf, const S& format_str,
-    basic_format_args<buffer_context<Char>> args) {
-  return internal::vformat_to(buf, to_string_view(format_str), args);
-}
-
-template <typename S, typename... Args, std::size_t SIZE = inline_buffer_size,
-          typename Char = enable_if_t<internal::is_string<S>::value, char_t<S>>>
-inline typename buffer_context<Char>::iterator format_to(
-    basic_memory_buffer<Char, SIZE>& buf, const S& format_str, Args&&... args) {
-  internal::check_format_string<Args...>(format_str);
-  using context = buffer_context<Char>;
-  return internal::vformat_to(buf, to_string_view(format_str),
-                              {make_format_args<context>(args...)});
-}
-
-template <typename OutputIt, typename Char = char>
-using format_context_t = basic_format_context<OutputIt, Char>;
-
-template <typename OutputIt, typename Char = char>
-using format_args_t = basic_format_args<format_context_t<OutputIt, Char>>;
-
-template <typename S, typename OutputIt, typename... Args,
-          FMT_ENABLE_IF(
-              internal::is_output_iterator<OutputIt>::value &&
-              !internal::is_contiguous_back_insert_iterator<OutputIt>::value)>
-inline OutputIt vformat_to(OutputIt out, const S& format_str,
-                           format_args_t<OutputIt, char_t<S>> args) {
-  using range = internal::output_range<OutputIt, char_t<S>>;
-  return vformat_to<arg_formatter<range>>(range(out),
-                                          to_string_view(format_str), args);
-}
-
-/**
- \rst
- Formats arguments, writes the result to the output iterator ``out`` and returns
- the iterator past the end of the output range.
-
- **Example**::
-
-   std::vector<char> out;
-   fmt::format_to(std::back_inserter(out), "{}", 42);
- \endrst
- */
-template <typename OutputIt, typename S, typename... Args,
-          FMT_ENABLE_IF(
-              internal::is_output_iterator<OutputIt>::value &&
-              !internal::is_contiguous_back_insert_iterator<OutputIt>::value &&
-              internal::is_string<S>::value)>
-inline OutputIt format_to(OutputIt out, const S& format_str, Args&&... args) {
-  internal::check_format_string<Args...>(format_str);
-  using context = format_context_t<OutputIt, char_t<S>>;
-  return vformat_to(out, to_string_view(format_str),
-                    {make_format_args<context>(args...)});
-}
-
-template <typename OutputIt> struct format_to_n_result {
-  /** Iterator past the end of the output range. */
-  OutputIt out;
-  /** Total (not truncated) output size. */
-  std::size_t size;
-};
-
-template <typename OutputIt, typename Char = typename OutputIt::value_type>
-using format_to_n_context =
-    format_context_t<fmt::internal::truncating_iterator<OutputIt>, Char>;
-
-template <typename OutputIt, typename Char = typename OutputIt::value_type>
-using format_to_n_args = basic_format_args<format_to_n_context<OutputIt, Char>>;
-
-template <typename OutputIt, typename Char, typename... Args>
-inline format_arg_store<format_to_n_context<OutputIt, Char>, Args...>
-make_format_to_n_args(const Args&... args) {
-  return format_arg_store<format_to_n_context<OutputIt, Char>, Args...>(
-      args...);
-}
-
-template <typename OutputIt, typename Char, typename... Args,
-          FMT_ENABLE_IF(internal::is_output_iterator<OutputIt>::value)>
-inline format_to_n_result<OutputIt> vformat_to_n(
-    OutputIt out, std::size_t n, basic_string_view<Char> format_str,
-    format_to_n_args<OutputIt, Char> args) {
-  auto it = vformat_to(internal::truncating_iterator<OutputIt>(out, n),
-                       format_str, args);
-  return {it.base(), it.count()};
-}
-
-/**
- \rst
- Formats arguments, writes up to ``n`` characters of the result to the output
- iterator ``out`` and returns the total output size and the iterator past the
- end of the output range.
- \endrst
- */
-template <typename OutputIt, typename S, typename... Args,
-          FMT_ENABLE_IF(internal::is_string<S>::value&&
-                            internal::is_output_iterator<OutputIt>::value)>
-inline format_to_n_result<OutputIt> format_to_n(OutputIt out, std::size_t n,
-                                                const S& format_str,
-                                                const Args&... args) {
-  internal::check_format_string<Args...>(format_str);
-  using context = format_to_n_context<OutputIt, char_t<S>>;
-  return vformat_to_n(out, n, to_string_view(format_str),
-                      {make_format_args<context>(args...)});
-}
-
-template <typename Char>
-inline std::basic_string<Char> internal::vformat(
-    basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<Char>> args) {
-  basic_memory_buffer<Char> buffer;
-  internal::vformat_to(buffer, format_str, args);
-  return fmt::to_string(buffer);
-}
-
-/**
-  Returns the number of characters in the output of
-  ``format(format_str, args...)``.
- */
-template <typename... Args>
-inline std::size_t formatted_size(string_view format_str, const Args&... args) {
-  auto it = format_to(internal::counting_iterator<char>(), format_str, args...);
-  return it.count();
-}
-
-#if FMT_USE_USER_DEFINED_LITERALS
-namespace internal {
-
-#  if FMT_USE_UDL_TEMPLATE
-template <typename Char, Char... CHARS> class udl_formatter {
- public:
-  template <typename... Args>
-  std::basic_string<Char> operator()(Args&&... args) const {
-    FMT_CONSTEXPR_DECL Char s[] = {CHARS..., '\0'};
-    FMT_CONSTEXPR_DECL bool invalid_format =
-        do_check_format_string<Char, error_handler, Args...>(
-            basic_string_view<Char>(s, sizeof...(CHARS)));
-    (void)invalid_format;
-    return format(s, std::forward<Args>(args)...);
-  }
-};
-#  else
-template <typename Char> struct udl_formatter {
-  basic_string_view<Char> str;
-
-  template <typename... Args>
-  std::basic_string<Char> operator()(Args&&... args) const {
-    return format(str, std::forward<Args>(args)...);
-  }
-};
-#  endif  // FMT_USE_UDL_TEMPLATE
-
-template <typename Char> struct udl_arg {
-  basic_string_view<Char> str;
-
-  template <typename T> named_arg<T, Char> operator=(T&& value) const {
-    return {str, std::forward<T>(value)};
-  }
-};
-
-}  // namespace internal
-
-inline namespace literals {
-#  if FMT_USE_UDL_TEMPLATE
-#    pragma GCC diagnostic push
-#    if FMT_CLANG_VERSION
-#      pragma GCC diagnostic ignored "-Wgnu-string-literal-operator-template"
-#    endif
-template <typename Char, Char... CHARS>
-FMT_CONSTEXPR internal::udl_formatter<Char, CHARS...> operator""_format() {
-  return {};
-}
-#    pragma GCC diagnostic pop
-#  else
-/**
-  \rst
-  User-defined literal equivalent of :func:`fmt::format`.
-
-  **Example**::
-
-    using namespace fmt::literals;
-    std::string message = "The answer is {}"_format(42);
-  \endrst
- */
-FMT_CONSTEXPR internal::udl_formatter<char> operator"" _format(const char* s,
-                                                               std::size_t n) {
-  return {{s, n}};
-}
-FMT_CONSTEXPR internal::udl_formatter<wchar_t> operator"" _format(
-    const wchar_t* s, std::size_t n) {
-  return {{s, n}};
-}
-#  endif  // FMT_USE_UDL_TEMPLATE
-
-/**
-  \rst
-  User-defined literal equivalent of :func:`fmt::arg`.
-
-  **Example**::
-
-    using namespace fmt::literals;
-    fmt::print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
-  \endrst
- */
-FMT_CONSTEXPR internal::udl_arg<char> operator"" _a(const char* s,
-                                                    std::size_t n) {
-  return {{s, n}};
-}
-FMT_CONSTEXPR internal::udl_arg<wchar_t> operator"" _a(const wchar_t* s,
-                                                       std::size_t n) {
-  return {{s, n}};
-}
-}  // namespace literals
-#endif  // FMT_USE_USER_DEFINED_LITERALS
-FMT_END_NAMESPACE
-
-/**
-  \rst
-  Constructs a compile-time format string.
-
-  **Example**::
-
-    // A compile-time error because 'd' is an invalid specifier for strings.
-    std::string s = format(FMT_STRING("{:d}"), "foo");
-  \endrst
- */
-#define FMT_STRING(s)                                                    \
-  [] {                                                                   \
-    struct str : fmt::compile_string {                                   \
-      using char_type = typename std::remove_cv<std::remove_pointer<     \
-          typename std::decay<decltype(s)>::type>::type>::type;          \
-      FMT_CONSTEXPR operator fmt::basic_string_view<char_type>() const { \
-        return {s, sizeof(s) / sizeof(char_type) - 1};                   \
-      }                                                                  \
-    } result;                                                            \
-    /* Suppress Qt Creator warning about unused operator. */             \
-    (void)static_cast<fmt::basic_string_view<typename str::char_type>>(  \
-        result);                                                         \
-    return result;                                                       \
-  }()
-
-#if defined(FMT_STRING_ALIAS) && FMT_STRING_ALIAS
-/**
-  \rst
-  Constructs a compile-time format string. This macro is disabled by default to
-  prevent potential name collisions. To enable it define ``FMT_STRING_ALIAS`` to
-  1 before including ``fmt/format.h``.
-
-  **Example**::
-
-    #define FMT_STRING_ALIAS 1
-    #include <fmt/format.h>
-    // A compile-time error because 'd' is an invalid specifier for strings.
-    std::string s = format(fmt("{:d}"), "foo");
-  \endrst
- */
-#  define fmt(s) FMT_STRING(s)
-#endif
-
-#ifdef FMT_HEADER_ONLY
-#  define FMT_FUNC inline
-#  include "format-inl.h"
-#else
-#  define FMT_FUNC
-#endif
-
-#endif  // FMT_FORMAT_H_
diff --git a/include/vtkdiy2/fmt/locale.h b/include/vtkdiy2/fmt/locale.h
deleted file mode 100644
index 7c13656e4fa..00000000000
--- a/include/vtkdiy2/fmt/locale.h
+++ /dev/null
@@ -1,77 +0,0 @@
-// Formatting library for C++ - std::locale support
-//
-// Copyright (c) 2012 - present, Victor Zverovich
-// All rights reserved.
-//
-// For the license information refer to format.h.
-
-#ifndef FMT_LOCALE_H_
-#define FMT_LOCALE_H_
-
-#include <locale>
-#include "format.h"
-
-FMT_BEGIN_NAMESPACE
-
-namespace internal {
-template <typename Char>
-typename buffer_context<Char>::iterator vformat_to(
-    const std::locale& loc, buffer<Char>& buf,
-    basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<Char>> args) {
-  using range = buffer_range<Char>;
-  return vformat_to<arg_formatter<range>>(buf, to_string_view(format_str), args,
-                                          internal::locale_ref(loc));
-}
-
-template <typename Char>
-std::basic_string<Char> vformat(const std::locale& loc,
-                                basic_string_view<Char> format_str,
-                                basic_format_args<buffer_context<Char>> args) {
-  basic_memory_buffer<Char> buffer;
-  internal::vformat_to(loc, buffer, format_str, args);
-  return fmt::to_string(buffer);
-}
-}  // namespace internal
-
-template <typename S, typename Char = char_t<S>>
-inline std::basic_string<Char> vformat(
-    const std::locale& loc, const S& format_str,
-    basic_format_args<buffer_context<Char>> args) {
-  return internal::vformat(loc, to_string_view(format_str), args);
-}
-
-template <typename S, typename... Args, typename Char = char_t<S>>
-inline std::basic_string<Char> format(const std::locale& loc,
-                                      const S& format_str, Args&&... args) {
-  return internal::vformat(
-      loc, to_string_view(format_str),
-      {internal::make_args_checked<Args...>(format_str, args...)});
-}
-
-template <typename S, typename OutputIt, typename... Args,
-          typename Char = enable_if_t<
-              internal::is_output_iterator<OutputIt>::value, char_t<S>>>
-inline OutputIt vformat_to(OutputIt out, const std::locale& loc,
-                           const S& format_str,
-                           format_args_t<OutputIt, Char> args) {
-  using range = internal::output_range<OutputIt, Char>;
-  return vformat_to<arg_formatter<range>>(
-      range(out), to_string_view(format_str), args, internal::locale_ref(loc));
-}
-
-template <typename OutputIt, typename S, typename... Args,
-          FMT_ENABLE_IF(internal::is_output_iterator<OutputIt>::value&&
-                            internal::is_string<S>::value)>
-inline OutputIt format_to(OutputIt out, const std::locale& loc,
-                          const S& format_str, Args&&... args) {
-  internal::check_format_string<Args...>(format_str);
-  using context = format_context_t<OutputIt, char_t<S>>;
-  format_arg_store<context, Args...> as{args...};
-  return vformat_to(out, loc, to_string_view(format_str),
-                    basic_format_args<context>(as));
-}
-
-FMT_END_NAMESPACE
-
-#endif  // FMT_LOCALE_H_
diff --git a/include/vtkdiy2/grid.hpp b/include/vtkdiy2/grid.hpp
index 2913891015d..13915762639 100644
--- a/include/vtkdiy2/grid.hpp
+++ b/include/vtkdiy2/grid.hpp
@@ -55,7 +55,7 @@ struct GridRef
         inline
         Vertex      vertex(Index idx) const;
 
-        Index       index(const Vertex& v) const                { Index idx = 0; for (unsigned i = 0; i < D; ++i) { idx += ((Index) v[i]) * ((Index) stride_[i]); } return idx; }
+        Index       index(const Vertex& v) const                { Index idx = 0; for (unsigned i = 0; i < D; ++i) { idx += ((Index) v[i]) * stride_[i]; } return idx; }
 
         Index       size() const                                { return size(shape()); }
         void        swap(GridRef& other)                        { std::swap(data_, other.data_); std::swap(shape_, other.shape_); std::swap(stride_, other.stride_); std::swap(c_order_, other.c_order_); }
@@ -65,6 +65,8 @@ struct GridRef
         static constexpr
         unsigned    dimension()                                 { return D; }
 
+        bool        contains(const Vertex& v)                   { for (unsigned i = 0; i < D; ++i) { if (v[i] < 0 || v[i] >= shape_[i]) return false;} return true; }
+
     protected:
         static Index
                 size(const Vertex& v)                           { Index res = 1; for (unsigned i = 0; i < D; ++i) res *= v[i]; return res; }
@@ -73,10 +75,9 @@ struct GridRef
         {
             Index cur = 1;
             if (c_order_)
-                for (unsigned i = D; i > 0; --i) { stride_[i-1] = cur; cur *= shape_[i-1]; }
+                for (unsigned i = D; i > 0; --i) { stride_[i-1] = cur; cur *= static_cast<Index>(shape_[i-1]); }
             else
-                for (unsigned i = 0; i < D; ++i) { stride_[i] = cur; cur *= shape_[i]; }
-
+                for (unsigned i = 0; i < D; ++i) { stride_[i] = cur; cur *= static_cast<Index>(shape_[i]); }
         }
         void    set_shape(const Vertex& v)                      { shape_ = v; set_stride(); }
         void    set_data(C* data)                               { data_ = data; }
@@ -85,7 +86,7 @@ struct GridRef
     private:
         C*      data_;
         Vertex  shape_;
-        Vertex  stride_;
+        diy::Point<Index, D> stride_;
         bool    c_order_;
 };
 
@@ -107,8 +108,8 @@ struct Grid: public GridRef<C,D>
                 Grid():
                     Parent(new C[0], Vertex::zero())            {}
         template<class Int>
-                Grid(const Point<Int, D>& shape, bool c_order = true):
-                    Parent(new C[size(shape)], shape, c_order)
+                Grid(const Point<Int, D>& s, bool c_order = true):
+                    Parent(new C[size(s)], s, c_order)
                 {}
 
                 Grid(Grid&& g): Grid()                          { Parent::swap(g); }
@@ -147,11 +148,11 @@ struct Grid: public GridRef<C,D>
 
     private:
         template<class OC>
-        void    copy_data(const OC* data)
+        void    copy_data(const OC* data_)
         {
             Index s = size(shape());
             for (Index i = 0; i < s; ++i)
-                Parent::data()[i] = data[i];
+                Parent::data()[i] = data_[i];
         }
 };
 
@@ -181,13 +182,13 @@ vertex(typename GridRef<C, D>::Index idx) const
     if (c_order())
         for (unsigned i = 0; i < D; ++i)
         {
-            v[i] = idx / stride_[i];
+            v[i] = static_cast<int>(idx / stride_[i]);
             idx %= stride_[i];
         }
     else
         for (int i = D-1; i >= 0; --i)
         {
-            v[i] = idx / stride_[i];
+            v[i] = static_cast<int>(idx / stride_[i]);
             idx %= stride_[i];
         }
     return v;
diff --git a/include/vtkdiy2/io/block.hpp b/include/vtkdiy2/io/block.hpp
index 26635bac299..0cf2b97b503 100644
--- a/include/vtkdiy2/io/block.hpp
+++ b/include/vtkdiy2/io/block.hpp
@@ -205,12 +205,11 @@ namespace io
         extra.reset();
 
         // Get local gids from assigner
-        size_t size = all_offset_counts.size();
-        assigner.set_nblocks(size);
+        assigner.set_nblocks(static_cast<int>(all_offset_counts.size()));
         std::vector<int> gids;
         assigner.local_gids(comm.rank(), gids);
 
-        for (unsigned i = 0; i < gids.size(); ++i)
+        for (size_t i = 0; i < gids.size(); ++i)
         {
             if (gids[i] != all_offset_counts[gids[i]].gid)
                 get_logger()->warn("gids don't match in diy::io::read_blocks(), {} vs {}",
@@ -342,7 +341,7 @@ namespace split
     }
 
     // Get local gids from assigner
-    assigner.set_nblocks(size);
+    assigner.set_nblocks(static_cast<int>(size));
     std::vector<int> gids;
     assigner.local_gids(comm.rank(), gids);
 
diff --git a/include/vtkdiy2/io/bov.hpp b/include/vtkdiy2/io/bov.hpp
index 7a7737e8a8c..4f7a3e0ee2d 100644
--- a/include/vtkdiy2/io/bov.hpp
+++ b/include/vtkdiy2/io/bov.hpp
@@ -2,11 +2,8 @@
 #define DIY_IO_BOV_HPP
 
 #include <vector>
-#include <algorithm>
-#include <numeric>
 
-#include "../types.hpp"
-#include "../mpi.hpp"
+#include "../mpi/io.hpp"
 
 namespace diy
 {
@@ -39,8 +36,9 @@ namespace io
             shape_.push_back(shape[i]);
             stride_.push_back(1);
         }
-        for (int i = shape_.size() - 2; i >=  0; --i)
+        for (auto i = shape_.size() - 2; i ==  0; --i)
           stride_[i] = stride_[i+1] * shape_[i+1];
+        stride_[0] = stride_[1] * shape_[1];
       }
 
       const Shape&  shape() const                                       { return shape_; }
@@ -71,50 +69,7 @@ void
 diy::io::BOV::
 read(const DiscreteBounds& bounds, T* buffer, bool collective, int chunk) const
 {
-#ifndef DIY_NO_MPI
-  int dim   = shape_.size();
-  int total = 1;
-  std::vector<int> subsizes;
-  for (int i = 0; i < dim; ++i)
-  {
-    subsizes.push_back(bounds.max[i] - bounds.min[i] + 1);
-    total *= subsizes.back();
-  }
-
-  MPI_Datatype T_type;
-  if (chunk == 1)
-    T_type = mpi::detail::get_mpi_datatype<T>();
-  else
-  {
-    // create an MPI struct of size chunk to read the data in those chunks
-    // (this allows to work around MPI-IO weirdness where crucial quantities
-    // are ints, which are too narrow of a type)
-    int             array_of_blocklengths[]  = { chunk };
-    MPI_Aint        array_of_displacements[] = { 0 };
-    MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
-    MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
-    MPI_Type_commit(&T_type);
-  }
-
-  MPI_Datatype fileblk;
-  MPI_Type_create_subarray(dim, (int*) &shape_[0], &subsizes[0], (int*) &bounds.min[0], MPI_ORDER_C, T_type, &fileblk);
-  MPI_Type_commit(&fileblk);
-
-  MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, (char*)"native", MPI_INFO_NULL);
-
-  mpi::status s;
-  if (!collective)
-      MPI_File_read(f_.handle(), buffer, total, T_type, &s.s);
-  else
-      MPI_File_read_all(f_.handle(), buffer, total, T_type, &s.s);
-
-  if (chunk != 1)
-    MPI_Type_free(&T_type);
-  MPI_Type_free(&fileblk);
-#else
-  (void) bounds; (void) buffer; (void) collective; (void)chunk;
-  DIY_UNSUPPORTED_MPI_CALL(diy::io::BOV::read);
-#endif
+  f_.read_bov(bounds, static_cast<int>(shape_.size()), shape_.data(), reinterpret_cast<char*>(buffer), offset_, mpi::detail::get_mpi_datatype<T>(), collective, chunk);
 }
 
 template<class T>
@@ -130,52 +85,7 @@ void
 diy::io::BOV::
 write(const DiscreteBounds& bounds, const T* buffer, const DiscreteBounds& core, bool collective, int chunk)
 {
-#ifndef DIY_NO_MPI
-  int dim   = shape_.size();
-  std::vector<int> subsizes;
-  std::vector<int> buffer_shape, buffer_start;
-  for (int i = 0; i < dim; ++i)
-  {
-    buffer_shape.push_back(bounds.max[i] - bounds.min[i] + 1);
-    buffer_start.push_back(core.min[i] - bounds.min[i]);
-    subsizes.push_back(core.max[i] - core.min[i] + 1);
-  }
-
-  MPI_Datatype T_type;
-  if (chunk == 1)
-    T_type = mpi::detail::get_mpi_datatype<T>();
-  else
-  {
-    // assume T is a binary block and create an MPI struct of appropriate size
-    int             array_of_blocklengths[]  = { chunk };
-    MPI_Aint        array_of_displacements[] = { 0 };
-    MPI_Datatype    array_of_types[]         = { mpi::detail::get_mpi_datatype<T>() };
-    MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
-    MPI_Type_commit(&T_type);
-  }
-
-  MPI_Datatype fileblk, subbuffer;
-  MPI_Type_create_subarray(dim, (int*) &shape_[0],       &subsizes[0], (int*) &core.min[0],     MPI_ORDER_C, T_type, &fileblk);
-  MPI_Type_create_subarray(dim, (int*) &buffer_shape[0], &subsizes[0], (int*) &buffer_start[0], MPI_ORDER_C, T_type, &subbuffer);
-  MPI_Type_commit(&fileblk);
-  MPI_Type_commit(&subbuffer);
-
-  MPI_File_set_view(f_.handle(), offset_, T_type, fileblk, (char*)"native", MPI_INFO_NULL);
-
-  mpi::status s;
-  if (!collective)
-    MPI_File_write(f_.handle(), (void*)buffer, 1, subbuffer, &s.s);
-  else
-    MPI_File_write_all(f_.handle(), (void*)buffer, 1, subbuffer, &s.s);
-
-  if (chunk != 1)
-    MPI_Type_free(&T_type);
-  MPI_Type_free(&fileblk);
-  MPI_Type_free(&subbuffer);
-#else
-  (void) bounds; (void) buffer;(void) core; (void) collective; (void) chunk;
-  DIY_UNSUPPORTED_MPI_CALL(diy::io::bov::write);
-#endif
+  f_.write_bov(bounds, core, static_cast<int>(shape_.size()), shape_.data(), reinterpret_cast<const char*>(buffer), offset_, mpi::detail::get_mpi_datatype<T>(), collective, chunk);
 }
 
 #endif
diff --git a/include/vtkdiy2/io/numpy.hpp b/include/vtkdiy2/io/numpy.hpp
index 0199a0c38f4..242313ed216 100644
--- a/include/vtkdiy2/io/numpy.hpp
+++ b/include/vtkdiy2/io/numpy.hpp
@@ -79,21 +79,21 @@ parse_npy_header(BOV::Shape& shape, bool& fortran_order)
     header = header.substr(11, nl - 11 + 1);
     size_t header_size = nl + 1;
 
-    int loc1, loc2;
+    size_t loc1, loc2;
 
     //fortran order
     loc1 = header.find("fortran_order")+16;
     fortran_order = (header.substr(loc1,4) == "True" ? true : false);
 
     //shape
-    unsigned ndims;
+    size_t ndims;
     loc1 = header.find("(");
     loc2 = header.find(")");
     std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
     if(str_shape[str_shape.size()-1] == ',') ndims = 1;
     else ndims = std::count(str_shape.begin(),str_shape.end(),',')+1;
     shape.resize(ndims);
-    for(unsigned int i = 0;i < ndims;i++) {
+    for(size_t i = 0;i < ndims;i++) {
         loc1 = str_shape.find(",");
         shape[i] = atoi(str_shape.substr(0,loc1).c_str());
         str_shape = str_shape.substr(loc1+1);
diff --git a/include/vtkdiy2/io/shared.hpp b/include/vtkdiy2/io/shared.hpp
index 26d94d6c16d..00328612cac 100644
--- a/include/vtkdiy2/io/shared.hpp
+++ b/include/vtkdiy2/io/shared.hpp
@@ -22,19 +22,35 @@ class SharedOutFile: public std::ostringstream
 
         void    close()
         {
-            auto str = this->str();
-            std::vector<char> contents(str.begin(), str.end());
-            if (world_.rank() == root_)
+            if (root_ >= 0)
             {
-                std::vector<std::vector<char>> all_contents;
-                diy::mpi::gather(world_, contents, all_contents, root_);
+                auto str = this->str();
+                std::vector<char> contents(str.begin(), str.end());
+                if (world_.rank() == root_)
+                {
+                    std::vector<std::vector<char>> all_contents;
+                    diy::mpi::gather(world_, contents, all_contents, root_);
 
-                // write the file serially
-                std::ofstream out(filename_);
-                for (auto& contents : all_contents)
-                    out.write(contents.data(), contents.size());
+                    // write the file serially
+                    std::ofstream fout(filename_);
+                    for (auto& cntnts : all_contents)
+                        fout.write(cntnts.data(), cntnts.size());
+                } else
+                    diy::mpi::gather(world_, contents, root_);
             } else
-                diy::mpi::gather(world_, contents, root_);
+            {
+                int x = 0;
+                if (world_.rank() > 0)
+                    world_.recv(world_.rank() - 1, 0, x);
+
+                std::ofstream fout(filename_, std::ios_base::app);
+                fout << this->str();
+
+                if (world_.rank() < world_.size() - 1)
+                    world_.send(world_.rank() + 1, 0, x);
+
+                world_.barrier();
+            }
         }
 
     private:
diff --git a/include/vtkdiy2/io/utils.hpp b/include/vtkdiy2/io/utils.hpp
index 3e44a470c47..c857560effc 100644
--- a/include/vtkdiy2/io/utils.hpp
+++ b/include/vtkdiy2/io/utils.hpp
@@ -5,6 +5,8 @@
 #include <direct.h>
 #include <io.h>
 #include <share.h>
+#define NOMINMAX
+#include <windows.h>
 #else
 #include <unistd.h>     // mkstemp() on Mac
 #include <dirent.h>
diff --git a/include/vtkdiy2/link.hpp b/include/vtkdiy2/link.hpp
index 423e24807fa..c87e274de18 100644
--- a/include/vtkdiy2/link.hpp
+++ b/include/vtkdiy2/link.hpp
@@ -84,7 +84,7 @@ namespace diy
       // direction
       int       direction(Direction dir) const;         // convert direction to a neighbor (-1 if no neighbor)
       Direction direction(int i) const                  { return dir_vec_[i]; }
-      void      add_direction(Direction dir)            { int c = dir_map_.size(); dir_map_[dir] = c; dir_vec_.push_back(dir); }
+      void      add_direction(Direction dir)            { auto c = static_cast<int>(dir_map_.size()); dir_map_[dir] = c; dir_vec_.push_back(dir); }
 
       // wrap
       void       add_wrap(Direction dir)                { wrap_.push_back(dir); }
diff --git a/include/vtkdiy2/log.hpp b/include/vtkdiy2/log.hpp
index f47962d3b61..32f77cc0b1a 100644
--- a/include/vtkdiy2/log.hpp
+++ b/include/vtkdiy2/log.hpp
@@ -4,8 +4,8 @@
 #ifndef DIY_USE_SPDLOG
 
 #include <memory>
-#include "fmt/format.h"
-#include "fmt/ostream.h"
+#include "thirdparty/fmt/format.h"
+#include "thirdparty/fmt/ostream.h"
 
 namespace diy
 {
@@ -55,8 +55,8 @@ set_logger(Args...)
 #include <spdlog/sinks/null_sink.h>
 #include <spdlog/sinks/stdout_sinks.h>
 
-#include <spdlog/fmt/bundled/format.h>
-#include <spdlog/fmt/bundled/ostream.h>
+#include <spdlog/fmt/fmt.h>
+#include <spdlog/fmt/ostr.h>
 
 namespace diy
 {
diff --git a/include/vtkdiy2/master.hpp b/include/vtkdiy2/master.hpp
index 9761ce0f3cf..51a03ce0c1e 100644
--- a/include/vtkdiy2/master.hpp
+++ b/include/vtkdiy2/master.hpp
@@ -10,6 +10,8 @@
 #include <numeric>
 #include <memory>
 #include <chrono>
+#include <climits>
+#include <random>
 
 #include "link.hpp"
 #include "collection.hpp"
@@ -21,6 +23,9 @@
 
 #include "thread.hpp"
 
+#include "coroutine.hpp"
+#include "utils.hpp"
+
 #include "detail/block_traits.hpp"
 
 #include "log.hpp"
@@ -28,6 +33,23 @@
 
 namespace diy
 {
+
+  struct MemoryManagement
+  {
+      using Allocate = std::function<char* (int, size_t)>;
+      using Deallocate = BinaryBlob::Deleter;
+      using MemCopy = std::function<void(char*, const char*, size_t)>;
+
+      MemoryManagement()    = default;
+      MemoryManagement(Allocate allocate_, Deallocate deallocate_, MemCopy copy_):
+            allocate(allocate_), deallocate(deallocate_), copy(copy_)       {}
+
+      Allocate      allocate    = [](int /* gid */, size_t n) { return new char[n]; };
+      Deallocate    deallocate  = [](const char* p) { delete[] p; };
+      MemCopy       copy        = [](char* dest, const char* src, size_t count) { std::memcpy(dest, src, count); };
+  };
+
+
   // Stores and manages blocks; initiates serialization and communication when necessary.
   //
   // Provides a foreach function, which is meant as the main entry point.
@@ -68,6 +90,10 @@ namespace diy
       template<class Block>
       using Callback = std::function<void(Block*, const ProxyWithLink&)>;
 
+      // foreach_exchange callback
+      template<class Block>
+      using CoroutineCallback = std::function<void(Block* const&, const ProxyWithLink&)>;
+
       // iexchange callback
       template<class Block>
       using ICallback = std::function<bool(Block*, const ProxyWithLink&)>;
@@ -84,7 +110,7 @@ namespace diy
       {
                 QueueSizePolicy(size_t sz): size(sz)          {}
         bool    unload_incoming(const Master&, int, int, size_t sz) const         { return sz > size; }
-        bool    unload_outgoing(const Master& master, int from, size_t sz) const  { return sz > size*master.outgoing_count(from); }
+        bool    unload_outgoing(const Master&, int, size_t sz) const              { return sz > size; }
 
         size_t  size;
       };
@@ -107,32 +133,40 @@ namespace diy
       struct CollectivesList;       // std::list<Collective>
       struct CollectivesMap;        // std::map<int, CollectivesList>       // gid -> [collectives]
 
-
       struct QueueRecord
       {
-                        QueueRecord(size_t s = 0, int e = -1): size(s), external(e)     {}
-        size_t          size;
-        int             external;
-      };
+                        QueueRecord(MemoryBuffer&& b):
+                            buffer_(std::move(b))                                       { size_ = buffer_.size(); external_ = -1; }
+                        QueueRecord(size_t s = 0, int e = -1): size_(s), external_(e)   {}
+                        QueueRecord(const QueueRecord&) =delete;
+                        QueueRecord(QueueRecord&&)      =default;
+        QueueRecord&    operator=(const QueueRecord&)   =delete;
+        QueueRecord&    operator=(QueueRecord&&)        =default;
 
-      typedef           std::map<int,     QueueRecord>      InQueueRecords;     //  gid         -> (size, external)
-      typedef           std::map<int,     MemoryBuffer>     IncomingQueues;     //  gid         -> queue
-      typedef           std::map<BlockID, MemoryBuffer>     OutgoingQueues;     // (gid, proc)  -> queue
-      typedef           std::map<BlockID, QueueRecord>      OutQueueRecords;    // (gid, proc)  -> (size, external)
-      struct IncomingQueuesRecords
-      {
-        InQueueRecords  records;
-        IncomingQueues  queues;
-      };
-      struct OutgoingQueuesRecord
-      {
-                        OutgoingQueuesRecord(int e = -1): external(e)       {}
-        int             external;
-        OutQueueRecords external_local;
-        OutgoingQueues  queues;
+        bool            external() const                                                { return external_ != -1; }
+        MemoryBuffer&&  move()                                                          { return std::move(buffer_); }
+        size_t          size() const                                                    { if (external()) return size_; return buffer_.size(); }
+
+        void            reset()                                                         { buffer_.reset(); }
+
+        void            unload(ExternalStorage* storage)                                { size_ = buffer_.size(); external_ = storage->put(buffer_); }
+        void            load(ExternalStorage* storage)                                  { storage->get(external_, buffer_); external_ = -1; }
+
+        MemoryBuffer&   buffer()                                                        { return buffer_; }
+
+        private:
+            size_t          size_;
+            int             external_;
+            MemoryBuffer    buffer_;
       };
-      typedef           std::map<int,     IncomingQueuesRecords>    IncomingQueuesMap;  //  gid         -> {  gid       -> queue }
-      typedef           std::map<int,     OutgoingQueuesRecord>     OutgoingQueuesMap;  //  gid         -> { (gid,proc) -> queue }
+
+      using RecordQueue = critical_resource<std::deque<QueueRecord>>;
+
+      using IncomingQueues = concurrent_map<int,      RecordQueue>;       // gid  -> [(size, external, buffer), ...]
+      using OutgoingQueues = concurrent_map<BlockID,  RecordQueue>;       // bid  -> [(size, external, buffer), ...]
+
+      using IncomingQueuesMap = std::map<int, IncomingQueues>;      // gid  -> {  gid -> [(size, external, buffer), ...]}
+      using OutgoingQueuesMap = std::map<int, OutgoingQueues>;      // gid  -> {  bid -> [(size, external, buffer), ...]}
 
       struct IncomingRound
       {
@@ -141,7 +175,6 @@ namespace diy
       };
       typedef std::map<int, IncomingRound> IncomingRoundMap;
 
-
     public:
      /**
       * \ingroup Initialization
@@ -167,12 +200,17 @@ namespace diy
       inline void   destroy(int i)                      { if (blocks_.own()) blocks_.destroy(i); }
 
       inline int    add(int gid, void* b, Link* l);     //!< add a block
+      inline int    add(int gid, void* b, const Link& l){ return add(gid, b, l.clone()); }
       inline void*  release(int i);                     //!< release ownership of the block
 
       //!< return the `i`-th block
       inline void*  block(int i) const                  { return blocks_.find(i); }
       template<class Block>
       Block*        block(int i) const                  { return static_cast<Block*>(block(i)); }
+
+      const Collection&
+                    blocks() const                      { return blocks_; }
+
       //! return the `i`-th block, loading it if necessary
       void*         get(int i)                          { return blocks_.get(i); }
       template<class Block>
@@ -207,32 +245,23 @@ namespace diy
       bool          local(int gid__) const              { return lids_.find(gid__) != lids_.end(); }
 
       //! exchange the queues between all the blocks (collective operation)
-      inline void   exchange(bool remote = false);
+      inline void   exchange(bool remote = false, MemoryManagement mem = MemoryManagement());
 
       //! nonblocking exchange of the queues between all the blocks
       template<class Block>
-      void          iexchange_(const ICallback<Block>&  f,
-                               size_t                   min_queue_size,
-                               size_t                   max_hold_time,
-                               bool                     fine);
+      void          iexchange_(const ICallback<Block>&  f, MemoryManagement mem);
 
       template<class F>
-      void          iexchange(const     F&      f,
-                              size_t            min_queue_size = 0,     // in bytes, queues smaller than min_queue_size will be held for up to max_hold_time
-                              size_t            max_hold_time  = 0,     // in milliseconds
-                              bool              fine           = false)
+      void          iexchange(const F& f, MemoryManagement mem = MemoryManagement())
       {
           using Block = typename detail::block_traits<F>::type;
-          iexchange_<Block>(f, min_queue_size, max_hold_time, fine);
+          iexchange_<Block>(f, mem);
       }
 
       inline void   process_collectives();
 
       inline
-      ProxyWithLink proxy(int i) const;
-
-      inline
-      ProxyWithLink proxy(int i, IExchangeInfo* iexchange) const;
+      ProxyWithLink proxy(int i, IExchangeInfo* iex = 0) const;
 
       //! return the number of local blocks
       unsigned int  size() const                        { return static_cast<unsigned int>(blocks_.size()); }
@@ -243,7 +272,13 @@ namespace diy
       int           threads() const                     { return threads_; }
       int           in_memory() const                   { return *blocks_.in_memory().const_access(); }
 
-      void          set_threads(int threads__)          { threads_ = threads__; }
+      void          set_threads(int threads__)
+      {
+          threads_ = threads__;
+#if defined(DIY_NO_THREADS)
+          threads_ = 1;
+#endif
+      }
 
       CreateBlock   creator() const                     { return blocks_.creator(); }
       DestroyBlock  destroyer() const                   { return blocks_.destroyer(); }
@@ -266,11 +301,22 @@ namespace diy
       bool          immediate() const                   { return immediate_; }
       void          set_immediate(bool i)               { if (i && !immediate_) execute(); immediate_ = i; }
 
+      /** foreach_exchange **/
+      struct CoroutineArg;
+
+      inline static
+      void          launch_process_block_coroutine();
+
+      template<class Block>
+      void          foreach_exchange_(const CoroutineCallback<Block>& f, bool remote, unsigned int stack_size);
+
+      template<class F>
+      void          foreach_exchange(const F& f, bool remote = false, unsigned int stack_size = 16*1024*1024);
+
     public:
       // Communicator functionality
-      IncomingQueues&   incoming(int gid__)             { return incoming_[exchange_round_].map[gid__].queues; }
-      OutgoingQueues&   outgoing(int gid__)             { return outgoing_[gid__].queues; }
-      size_t            outgoing_count(int gid__) const { OutgoingQueuesMap::const_iterator it = outgoing_.find(gid__); if (it == outgoing_.end()) return 0; return it->second.queues.size(); }
+      IncomingQueues&   incoming(int gid__)             { return incoming_[exchange_round_].map[gid__]; }
+      OutgoingQueues&   outgoing(int gid__)             { return outgoing_[gid__]; }
       inline CollectivesList&  collectives(int gid__);
       inline CollectivesMap&   collectives();
 
@@ -281,33 +327,30 @@ namespace diy
 
     public:
       // Communicator functionality
-      inline void       flush(bool remote = false);     // makes sure all the serialized queues migrate to their target processors
+      inline void       flush(bool remote, MemoryManagement mem = MemoryManagement());            // makes sure all the serialized queues migrate to their target processors
 
     private:
       // Communicator functionality
-      inline void       comm_exchange(GidSendOrder& gid_order, IExchangeInfo*    iexchange = 0);
-      inline void       rcomm_exchange();    // possibly called in between block computations
-      inline bool       nudge(IExchangeInfo* iexchange = 0);
-      inline void       send_queue(int from_gid, int to_gid, int to_proc, MemoryBuffer& out_queue, bool remote, IExchangeInfo* iexchange);
+      inline void       comm_exchange(GidSendOrder& gid_order, MemoryManagement mem, IExchangeInfo*    iex = 0);
+      inline void       rcomm_exchange(MemoryManagement mem);    // possibly called in between block computations
+      inline bool       nudge(IExchangeInfo* iex = 0);
+      inline void       send_queue(int from_gid, int to_gid, int to_proc, QueueRecord& qr, bool remote, MemoryManagement mem, IExchangeInfo* iex);
       inline void       send_outgoing_queues(GidSendOrder&   gid_order,
                                              bool            remote,
-                                             IExchangeInfo*  iexchange = 0);
-      inline void       check_incoming_queues(IExchangeInfo* iexchange = 0);
+                                             MemoryManagement mem,
+                                             IExchangeInfo*  iex = 0);
+      inline void       check_incoming_queues(MemoryManagement mem, IExchangeInfo* iex = 0);
       inline GidSendOrder
                         order_gids();
       inline void       touch_queues();
-      inline void       move_external_local(int from);
-      inline void       send_same_rank(int from, int to, MemoryBuffer& bb, IExchangeInfo* iexchange);
-      inline void       send_different_rank(int from, int to, int proc, MemoryBuffer& bb, bool remote, IExchangeInfo* iexchange);
+      inline void       send_same_rank(int from, int to, QueueRecord& qr, MemoryManagement mem, IExchangeInfo* iex);
+      inline void       send_different_rank(int from, int to, int proc, QueueRecord& qr, bool remote, IExchangeInfo* iex);
 
       inline InFlightRecv&         inflight_recv(int proc);
       inline InFlightSendsList&    inflight_sends();
 
       // iexchange commmunication
-      inline void       icommunicate(IExchangeInfo* iexchange);     // async communication
-
-      // debug
-      inline void       show_incoming_records() const;
+      inline void       icommunicate(IExchangeInfo* iex, MemoryManagement mem);     // async communication
 
       struct tags       { enum {
                                     queue,
@@ -348,6 +391,7 @@ namespace diy
       std::shared_ptr<spd::logger>  log = get_logger();
       stats::Profiler               prof;
       stats::Annotation             exchange_round_annotation { "diy.exchange-round" };
+      std::mt19937 mt_gen;          // mersenne_twister random number generator
   };
 
   struct Master::SkipNoIncoming
@@ -360,6 +404,7 @@ namespace diy
 #include "detail/master/commands.hpp"
 #include "proxy.hpp"
 #include "detail/master/execution.hpp"
+#include "detail/master/foreach_exchange.hpp"
 
 diy::Master::
 Master(mpi::communicator    comm,
@@ -374,14 +419,28 @@ Master(mpi::communicator    comm,
   blocks_(create_, destroy_, storage, save, load_),
   queue_policy_(q_policy),
   limit_(limit__),
+#if !defined(DIY_NO_THREADS)
   threads_(threads__ == -1 ? static_cast<int>(thread::hardware_concurrency()) : threads__),
+#else
+  threads_(1),
+#endif
   storage_(storage),
   // Communicator functionality
   inflight_sends_(new InFlightSendsList),
   inflight_recvs_(new InFlightRecvsMap),
   collectives_(new CollectivesMap)
 {
+#ifdef DIY_NO_THREADS
+  (void) threads__;
+#endif
     comm_.duplicate(comm);
+
+    // seed random number generator, broadcast seed, offset by rank
+    std::random_device rd;                      // seed source for the random number engine
+    unsigned int s = rd();
+    diy::mpi::broadcast(communicator(), s, 0);
+    std::mt19937 gen(s + communicator().rank());          // mersenne_twister random number generator
+    mt_gen = gen;
 }
 
 diy::Master::
@@ -431,18 +490,20 @@ unload_incoming(int gid__)
   {
     IncomingQueuesMap::iterator qmap_itr = round_itr->second.map.find(gid__);
     if (qmap_itr == round_itr->second.map.end())
-    {
       continue;
-    }
-    IncomingQueuesRecords& in_qrs = qmap_itr->second;
-    for (InQueueRecords::iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
+
+    IncomingQueues& in_qs = qmap_itr->second;
+    for (auto& x : in_qs)
     {
-      QueueRecord& qr = it->second;
-      if (queue_policy_->unload_incoming(*this, it->first, gid__, qr.size))
-      {
-        log->debug("Unloading queue: {} <- {}", gid__, it->first);
-        qr.external = storage_->put(in_qrs.queues[it->first]);
-      }
+        int from = x.first;
+        for (QueueRecord& qr : *x.second.access())
+        {
+          if (queue_policy_->unload_incoming(*this, from, gid__, qr.size()))
+          {
+            log->debug("Unloading queue: {} <- {}", gid__, from);
+            qr.unload(storage_);
+          }
+        }
     }
   }
 }
@@ -451,54 +512,18 @@ void
 diy::Master::
 unload_outgoing(int gid__)
 {
-  OutgoingQueuesRecord& out_qr = outgoing_[gid__];
-
-  size_t out_queues_size = sizeof(size_t);   // map size
-  size_t count = 0;
-  // count the size of the queues we need to pack
-  for (auto& rec : out_qr.queues)
+  OutgoingQueues& out_qs = outgoing_[gid__];
+  for (auto& x : out_qs)
   {
-    if (rec.first.proc == comm_.rank()) continue;
-
-    out_queues_size += sizeof(BlockID);                                 // target
-    out_queues_size += Serialization<MemoryBuffer>::size(rec.second);   // buffer contents
-    ++count;
-  }
-  if (queue_policy_->unload_outgoing(*this, gid__, out_queues_size - sizeof(size_t)))
-  {
-      log->debug("Unloading outgoing queues: {} -> ...; size = {}\n", gid__, out_queues_size);
-      MemoryBuffer  bb;     bb.reserve(out_queues_size);
-      diy::save(bb, count);
-
-      // pack queues going to a remote proc into bb; queues going to a
-      // different block on our rank, stay separated, recorded in external_local
-      for (auto it = out_qr.queues.begin(); it != out_qr.queues.end();)
+      int to = x.first.gid;
+      for (QueueRecord& qr : *x.second.access())
       {
-        auto  bid    = it->first;
-        auto& buffer = it->second;
-        if (bid.proc == comm_.rank())
-        {
-          // treat as incoming
-          if (queue_policy_->unload_incoming(*this, gid__, bid.gid, buffer.size()))
-          {
-            QueueRecord& qr = out_qr.external_local[bid];
-            qr.size     = buffer.size();
-            qr.external = storage_->put(buffer);
-
-            out_qr.queues.erase(it++);
-          } ++it; // else keep in memory
-        } else
+        if (queue_policy_->unload_outgoing(*this, gid__, qr.size()))
         {
-          diy::save(bb, bid);
-          diy::save(bb, buffer);
-
-          out_qr.queues.erase(it++);
+          log->debug("Unloading outgoing queue: {} -> {}", gid__, to);
+          qr.unload(storage_);
         }
       }
-
-      // TODO: this mechanism could be adjusted for direct saving to disk
-      //       (without intermediate binary buffer serialization)
-      out_qr.external = storage_->put(bb);
   }
 }
 
@@ -524,16 +549,22 @@ void
 diy::Master::
 load_incoming(int gid__)
 {
-  IncomingQueuesRecords& in_qrs = incoming_[exchange_round_].map[gid__];
-  for (InQueueRecords::iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
+  IncomingQueues& in_qs = incoming_[exchange_round_].map[gid__];
+  for (auto& x : in_qs)
   {
-    QueueRecord& qr = it->second;
-    if (qr.external != -1)
-    {
-        log->debug("Loading queue: {} <- {}", gid__, it->first);
-        storage_->get(qr.external, in_qrs.queues[it->first]);
-        qr.external = -1;
-    }
+      int from = x.first;
+      auto access = x.second.access();
+      if (!access->empty())
+      {
+          // NB: we only load the front queue, if we want to use out-of-core
+          //     machinery with iexchange, this will require changes
+          auto& qr = access->front();
+          if (qr.external())
+          {
+            log->debug("Loading queue: {} <- {}", gid__, from);
+            qr.load(storage_);
+          }
+      }
   }
 }
 
@@ -543,33 +574,30 @@ load_outgoing(int gid__)
 {
   // TODO: we could adjust this mechanism to read directly from storage,
   //       bypassing an intermediate MemoryBuffer
-  OutgoingQueuesRecord& out_qr = outgoing_[gid__];
-  if (out_qr.external != -1)
+  OutgoingQueues& out_qs = outgoing_[gid__];
+  for (auto& x : out_qs)
   {
-    MemoryBuffer bb;
-    storage_->get(out_qr.external, bb);
-    out_qr.external = -1;
-
-    size_t count;
-    diy::load(bb, count);
-    for (size_t i = 0; i < count; ++i)
-    {
-      BlockID to;
-      diy::load(bb, to);
-      diy::load(bb, out_qr.queues[to]);
-    }
+      int to      = x.first.gid;
+      int to_rank = x.first.proc;
+      auto access = x.second.access();
+      if (!access->empty())
+      {
+          // NB: we only load the front queue, if we want to use out-of-core
+          //     machinery with iexchange, this will require changes
+          auto& qr = access->front();
+          if (qr.external() && comm_.rank() != to_rank)     // skip queues to the same rank
+          {
+            log->debug("Loading queue: {} -> {}", gid__, to);
+            qr.load(storage_);
+          }
+      }
   }
 }
 
 diy::Master::ProxyWithLink
 diy::Master::
-proxy(int i) const
-{ return ProxyWithLink(Proxy(const_cast<Master*>(this), gid(i)), block(i), link(i)); }
-
-diy::Master::ProxyWithLink
-diy::Master::
-proxy(int i, IExchangeInfo* iexchange) const
-{ return ProxyWithLink(Proxy(const_cast<Master*>(this), gid(i), iexchange), block(i), link(i)); }
+proxy(int i, IExchangeInfo* iex) const
+{ return ProxyWithLink(Proxy(const_cast<Master*>(this), gid(i), iex), block(i), link(i)); }
 
 int
 diy::Master::
@@ -596,8 +624,18 @@ diy::Master::
 release(int i)
 {
   void* b = blocks_.release(i);
+
+  expected_ -= links_[i]->size_unique();
   delete link(i);   links_[i] = 0;
+  std::swap(links_[i], links_.back());
+  links_.pop_back();
+
   lids_.erase(gid(i));
+
+  std::swap(gids_[i], gids_.back());
+  gids_.pop_back();
+  lids_[gid(i)] = i;
+
   return b;
 }
 
@@ -605,12 +643,12 @@ bool
 diy::Master::
 has_incoming(int i) const
 {
-  const IncomingQueuesRecords& in_qrs = const_cast<Master&>(*this).incoming_[exchange_round_].map[gid(i)];
-  for (InQueueRecords::const_iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
+  const IncomingQueues& in_qs = const_cast<Master&>(*this).incoming_[exchange_round_].map[gid(i)];
+  for (auto& x : in_qs)
   {
-    const QueueRecord& qr = it->second;
-    if (qr.size != 0)
-        return true;
+      auto access = x.second.const_access();
+      if (!access->empty() && access->front().size() != 0)
+          return true;
   }
   return false;
 }
@@ -633,7 +671,7 @@ foreach_(const Callback<Block>& f, const Skip& skip)
 
 void
 diy::Master::
-exchange(bool remote)
+exchange(bool remote, MemoryManagement mem)
 {
   auto scoped = prof.scoped("exchange");
   DIY_UNUSED(scoped);
@@ -642,17 +680,16 @@ exchange(bool remote)
 
   log->debug("Starting exchange");
 
-#ifdef DIY_NO_MPI
-  // remote doesn't need to do anything special if there is no mpi, but we also
-  // can't just use it because of the ibarrier
-  remote = false;
-#endif
+  if (comm_.size() == 1)
+  {
+    remote = false;
+  }
 
   // make sure there is a queue for each neighbor
   if (!remote)
       touch_queues();
 
-  flush(remote);
+  flush(remote, mem);
   log->debug("Finished exchange");
 }
 
@@ -662,14 +699,13 @@ touch_queues()
 {
   for (int i = 0; i < (int)size(); ++i)
   {
-      OutgoingQueues&  outgoing_queues  = outgoing_[gid(i)].queues;
-      OutQueueRecords& external_local   = outgoing_[gid(i)].external_local;
-      if (outgoing_queues.size() < (size_t)link(i)->size())
-          for (unsigned j = 0; j < (unsigned)link(i)->size(); ++j)
-          {
-              if (external_local.find(link(i)->target(j)) == external_local.end())
-                  outgoing_queues[link(i)->target(j)];        // touch the outgoing queue, creating it if necessary
-          }
+      OutgoingQueues&  outgoing_queues  = outgoing_[gid(i)];
+      for (BlockID target : link(i)->neighbors())
+      {
+          auto access = outgoing_queues[target].access();
+          if (access->empty())
+              access->emplace_back();
+      }
   }
 }
 
@@ -686,57 +722,101 @@ touch_queues()
 template<class Block>
 void
 diy::Master::
-iexchange_(const    ICallback<Block>&   f,
-           size_t                       min_queue_size,
-           size_t                       max_hold_time,
-           bool                         fine)
+iexchange_(const ICallback<Block>& f, MemoryManagement mem)
 {
     auto scoped = prof.scoped("iexchange");
     DIY_UNUSED(scoped);
 
+#if !defined(DIY_NO_THREADS) && (!defined(DIY_USE_CALIPER) && defined(DIY_PROFILE))
+    static_assert(false, "Cannot use DIY's internal profiler; it's not thread safe. Use caliper.");
+#endif
+
     // prepare for next round
     incoming_.erase(exchange_round_);
     ++exchange_round_;
     exchange_round_annotation.set(exchange_round_);
 
+    // touch the outgoing and incoming queues to make sure they exist
+    for (unsigned i = 0; i < size(); ++i)
+    {
+      outgoing(gid(i));
+      incoming(gid(i));
+    }
+
     //IExchangeInfoDUD iexchange(comm_, min_queue_size, max_hold_time, fine, prof);
-    IExchangeInfoCollective iexchange(comm_, min_queue_size, max_hold_time, fine, prof);
-    iexchange.add_work(size());                 // start with one work unit for each block
+    IExchangeInfoCollective iex(comm_, prof);
+    iex.add_work(size());                 // start with one work unit for each block
+
+    thread comm_thread;
+    if (threads() > 1)
+        comm_thread = thread([this,&iex,mem]()
+        {
+            while(!iex.all_done())
+            {
+                icommunicate(&iex, mem);
+                iex.control();
+                //std::this_thread::sleep_for(std::chrono::microseconds(1));
+            }
+        });
+
+    auto empty_incoming = [this](int gid)
+    {
+        for (auto& x : incoming(gid))
+            if (!x.second.access()->empty())
+                return false;
+        return true;
+    };
 
     std::map<int, bool> done_result;
     do
     {
-        for (size_t i = 0; i < size(); i++)     // for all blocks
+        size_t work_done = 0;
+        DIY_UNUSED(work_done);
+        for (int i = 0; i < static_cast<int>(size()); i++)     // for all blocks
         {
-            iexchange.from_gid = gid(i);       // for shortcut sending only from current block during icommunicate
-            stats::Annotation::Guard g( stats::Annotation("diy.block").set(iexchange.from_gid) );
+            int gid = this->gid(i);
+            stats::Annotation::Guard g( stats::Annotation("diy.block").set(gid) );
 
-            icommunicate(&iexchange);               // TODO: separate comm thread std::thread t(icommunicate);
-            ProxyWithLink cp = proxy(i, &iexchange);
-
-            bool done = done_result[cp.gid()];
-            if (!done || !cp.empty_incoming_queues())
+            if (threads() == 1)
+                icommunicate(&iex, mem);
+            bool done = done_result[gid];
+            if (!done || !empty_incoming(gid))
             {
                 prof << "callback";
-                done = f(block<Block>(i), cp);
+                iex.inc_work();       // even if we remove the queues, when constructing the proxy, we still have work to do
+                {
+                    ProxyWithLink cp = proxy(i, &iex);
+                    done = f(block<Block>(i), cp);
+                    if (done_result[gid] ^ done)        // status changed
+                    {
+                        if (done)
+                            iex.dec_work();
+                        else
+                            iex.inc_work();
+                    }
+                }   // NB: we need cp to go out of scope and copy out its queues before we can decrement the work
+                iex.dec_work();
                 prof >> "callback";
+                ++work_done;
             }
-            done_result[cp.gid()] = done;
-
-            done &= cp.empty_queues();
-
+            done_result[gid] = done;
             log->debug("Done: {}", done);
+        }
 
-            prof << "work-counting";
-            iexchange.update_done(cp.gid(), done);
-            prof >> "work-counting";
+        if (threads() == 1)
+        {
+            prof << "iexchange-control";
+            iex.control();
+            prof >> "iexchange-control";
         }
+        //else
+        //if (work_done == 0)
+        //    std::this_thread::sleep_for(std::chrono::microseconds(1));
+    } while (!iex.all_done());
+    log->info("[{}] ==== Leaving iexchange ====\n", iex.comm.rank());
 
-        prof << "iexchange-control";
-        iexchange.control();
-        prof >> "iexchange-control";
-    } while (!iexchange.all_done());
-    log->info("[{}] ==== Leaving iexchange ====\n", iexchange.comm.rank());
+    if (threads() > 1)
+        comm_thread.join();
 
     //comm_.barrier();        // TODO: this is only necessary for DUD
     prof >> "consensus-time";
@@ -747,17 +827,17 @@ iexchange_(const    ICallback<Block>&   f,
 /* Communicator */
 void
 diy::Master::
-comm_exchange(GidSendOrder& gid_order, IExchangeInfo* iexchange)
+comm_exchange(GidSendOrder& gid_order, MemoryManagement mem, IExchangeInfo* iex)
 {
     auto scoped = prof.scoped("comm-exchange");
     DIY_UNUSED(scoped);
 
-    send_outgoing_queues(gid_order, false, iexchange);
+    send_outgoing_queues(gid_order, false, mem, iex);
 
-    while(nudge(iexchange))                         // kick requests
+    while(nudge(iex))                         // kick requests
         ;
 
-    check_incoming_queues(iexchange);
+    check_incoming_queues(mem, iex);
 }
 
 /* Remote communicator */
@@ -788,7 +868,7 @@ comm_exchange(GidSendOrder& gid_order, IExchangeInfo* iexchange)
 //
 void
 diy::Master::
-rcomm_exchange()
+rcomm_exchange(MemoryManagement mem)
 {
     bool            done                = false;
     bool            ibarr_act           = false;
@@ -799,12 +879,12 @@ rcomm_exchange()
 
     while (!done)
     {
-        send_outgoing_queues(gid_order, true, 0);
+        send_outgoing_queues(gid_order, true, mem, 0);
 
         // kick requests
         nudge();
 
-        check_incoming_queues();
+        check_incoming_queues(mem);
         if (ibarr_act)
         {
             if (ibarr_req.test())
@@ -831,13 +911,19 @@ order_gids()
 
     GidSendOrder order;
 
-    for (OutgoingQueuesMap::iterator it = outgoing_.begin(); it != outgoing_.end(); ++it)
+    for (auto& x : outgoing_)
     {
-        OutgoingQueuesRecord& out = it->second;
-        if (out.external == -1)
-            order.list.push_front(it->first);
-        else
-            order.list.push_back(it->first);
+        OutgoingQueues& out = x.second;
+        if (!out.empty())
+        {
+            auto access = out.begin()->second.access();
+            if (!access->empty() && !access->front().external())
+            {
+                order.list.push_front(x.first);
+                continue;
+            }
+        }
+        order.list.push_back(x.first);
     }
     log->debug("order.size(): {}", order.size());
 
@@ -856,27 +942,17 @@ order_gids()
 // iexchange communicator
 void
 diy::Master::
-icommunicate(IExchangeInfo* iexchange)
+icommunicate(IExchangeInfo* iex, MemoryManagement mem)
 {
     auto scoped = prof.scoped("icommunicate");
     DIY_UNUSED(scoped);
 
     log->debug("Entering icommunicate()");
 
-    // lock out other threads
-    // TODO: not threaded yet
-    // if (!CAS(comm_flag, 0, 1))
-    //     return;
-
-    // debug
-//     log->info("out_queues_limit: {}", out_queues_limit);
-
-    // order gids
-
     auto gid_order = order_gids();
 
     // exchange
-    comm_exchange(gid_order, iexchange);
+    comm_exchange(gid_order, mem, iex);
 
     // cleanup
 
@@ -893,48 +969,58 @@ diy::Master::
 send_queue(int              from_gid,
            int              to_gid,
            int              to_proc,
-           MemoryBuffer&    out_queue,
+           QueueRecord&     qr,
            bool             remote,
-           IExchangeInfo*   iexchange)
+           MemoryManagement mem,
+           IExchangeInfo*   iex)
 {
     stats::Annotation::Guard gb( stats::Annotation("diy.block").set(from_gid) );
     stats::Annotation::Guard gt( stats::Annotation("diy.to").set(to_gid) );
-    stats::Annotation::Guard gq( stats::Annotation("diy.q-size").set(stats::Variant(static_cast<uint64_t>(out_queue.size()))) );
+    stats::Annotation::Guard gq( stats::Annotation("diy.q-size").set(stats::Variant(static_cast<uint64_t>(qr.size()))) );
 
     // skip empty queues and hold queues shorter than some limit for some time
-    if ( iexchange && (out_queue.size() == 0 || iexchange->hold(out_queue.size())) )
-        return;
-    log->debug("[{}] Sending queue: {} <- {} of size {}, iexchange = {}", comm_.rank(), to_gid, from_gid, out_queue.size(), iexchange ? 1 : 0);
-
-    if (iexchange)
-        iexchange->time_stamp_send();       // hold time begins counting from now
+    assert(!iex || qr.size() != 0);
+    log->debug("[{}] Sending queue: {} <- {} of size {}, iexchange = {}", comm_.rank(), to_gid, from_gid, qr.size(), iex ? 1 : 0);
 
     if (to_proc == comm_.rank())            // sending to same rank, simply swap buffers
-        send_same_rank(from_gid, to_gid, out_queue, iexchange);
+        send_same_rank(from_gid, to_gid, qr, mem, iex);
     else                                    // sending to an actual message to a different rank
-        send_different_rank(from_gid, to_gid, to_proc, out_queue, remote, iexchange);
+        send_different_rank(from_gid, to_gid, to_proc, qr, remote, iex);
 }
 
 void
 diy::Master::
 send_outgoing_queues(GidSendOrder&   gid_order,
                      bool            remote,            // TODO: are remote and iexchange mutually exclusive? If so, use single enum?
-                     IExchangeInfo*  iexchange)
+                     MemoryManagement mem,
+                     IExchangeInfo*  iex)
 {
     auto scoped = prof.scoped("send-outgoing-queues");
     DIY_UNUSED(scoped);
 
-    if (iexchange)                                      // for iexchange, send queues from a single block
+    if (iex)                                      // for iex, send queues from a single block
     {
-        OutgoingQueues& outgoing = outgoing_[iexchange->from_gid].queues;
-        for (OutgoingQueues::iterator it = outgoing.begin(); it != outgoing.end(); ++it)
+        for (int from : gid_order.list)
         {
-            BlockID to_block    = it->first;
-            int     to_gid      = to_block.gid;
-            int     to_proc     = to_block.proc;
+            OutgoingQueues& outgoing = this->outgoing(from);
+            for (auto& x : outgoing)
+            {
+                BlockID to_block    = x.first;
+                int     to_gid      = to_block.gid;
+                int     to_proc     = to_block.proc;
 
-            log->debug("Processing queue:      {} <- {} of size {}", to_gid, iexchange->from_gid, outgoing_[iexchange->from_gid].queues[to_block].size());
-            send_queue(iexchange->from_gid, to_gid, to_proc, it->second, remote, iexchange);
+                auto access = x.second.access();
+                while (!access->empty())
+                {
+                    auto qr = std::move(access->front());
+                    access->pop_front();
+                    access.unlock();            // others can push on this queue, while we are working
+                    assert(!qr.external());
+                    log->debug("Processing queue:      {} <- {} of size {}", to_gid, from, qr.size());
+                    send_queue(from, to_gid, to_proc, qr, remote, mem, iex);
+                    access.lock();
+                }
+            }
         }
     }
     else                                                // normal mode: send all outgoing queues
@@ -943,21 +1029,24 @@ send_outgoing_queues(GidSendOrder&   gid_order,
         {
             int from_gid = gid_order.pop();
 
-            // move external queues going to our rank
-            move_external_local(from_gid);
-
-            if (outgoing_[from_gid].external != -1)
-                load_outgoing(from_gid);
+            load_outgoing(from_gid);
 
-            OutgoingQueues& outgoing = outgoing_[from_gid].queues;
-            for (OutgoingQueues::iterator it = outgoing.begin(); it != outgoing.end(); ++it)
+            OutgoingQueues& outgoing = outgoing_[from_gid];
+            for (auto& x : outgoing)
             {
-                BlockID to_block    = it->first;
+                BlockID to_block    = x.first;
                 int     to_gid      = to_block.gid;
                 int     to_proc     = to_block.proc;
 
-                log->debug("Processing queue:      {} <- {} of size {}", to_gid, from_gid, outgoing_[from_gid].queues[to_block].size());
-                send_queue(from_gid, to_gid, to_proc, it->second, remote, iexchange);
+                auto access = x.second.access();
+                if (access->empty())
+                    continue;
+
+                // NB: send only front
+                auto& qr = access->front();
+                log->debug("Processing queue:      {} <- {} of size {}", to_gid, from_gid, qr.size());
+                send_queue(from_gid, to_gid, to_proc, qr, remote, mem, iex);
+                access->pop_front();
             }
         }
     }
@@ -965,89 +1054,45 @@ send_outgoing_queues(GidSendOrder&   gid_order,
 
 void
 diy::Master::
-move_external_local(int from)
+send_same_rank(int from, int to, QueueRecord& qr, MemoryManagement mem, IExchangeInfo*)
 {
-    IncomingRound& current_incoming = incoming_[exchange_round_];
+    auto scoped = prof.scoped("send-same-rank");
 
-    // deal with external_local queues
-    for (auto& x : outgoing_[from].external_local)
-    {
-        int to = x.first.gid;
+    log->debug("Moving queue in-place: {} <- {}", to, from);
 
-        log->debug("Processing local queue: {} <- {} of size {}", to, from, x.second.size);
+    IncomingRound& current_incoming = incoming_[exchange_round_];
 
-        QueueRecord& in_qr        = current_incoming.map[to].records[from];
-        bool         to_external  = block(lid(to)) == 0;
+    auto access_incoming = current_incoming.map[to][from].access();
 
-        if (to_external)
-            in_qr = x.second;
-        else
-        {
-            // load the queue
-            in_qr.size     = x.second.size;
-            in_qr.external = -1;
+    // save blobs to copy them explicitly
+    std::vector<BinaryBlob> blobs;
+    qr.buffer().blobs.swap(blobs);
+    qr.buffer().blob_position = 0;
 
-            MemoryBuffer bb;
-            storage_->get(x.second.external, bb);
+    access_incoming->emplace_back(std::move(qr));
+    QueueRecord& in_qr = access_incoming->back();
 
-            current_incoming.map[to].queues[from].swap(bb);
-        }
-        current_incoming.received++;
+    // copy blobs explicitly; we cannot just move them in place, since we don't
+    // own their memory and must guarantee that it's safe to free, once
+    // exchange() is done
+    for (BinaryBlob& blob : blobs)
+    {
+        char* p = mem.allocate(to, blob.size);
+        mem.copy(p, blob.pointer.get(), blob.size);
+        in_qr.buffer().save_binary_blob(p, blob.size, mem.deallocate);
     }
-    outgoing_[from].external_local.clear();
-}
-
-void
-diy::Master::
-send_same_rank(int from, int to, MemoryBuffer& bb, IExchangeInfo* iexchange)
-{
-    auto scoped = prof.scoped("send-same-rank");
-
-    log->debug("Moving queue in-place: {} <- {}", to, from);
-
-    IncomingRound& current_incoming = incoming_[exchange_round_];
 
-    QueueRecord& in_qr       = current_incoming.map[to].records[from];
-    bool         to_external = block(lid(to)) == 0;
-    if (to_external)
-    {
-        log->debug("Unloading outgoing directly as incoming: {} <- {}", to, from);
-        in_qr.size = bb.size();
-        if (queue_policy_->unload_incoming(*this, from, to, in_qr.size))
-            in_qr.external = storage_->put(bb);
-        else
-        {
-            MemoryBuffer& in_bb = current_incoming.map[to].queues[from];
-            if (!iexchange)
-            {
-                in_bb.swap(bb);
-                in_bb.reset();
-            }
-            else
-            {
-                iexchange->not_done(to);
-                in_bb.append_binary(&bb.buffer[0], bb.size());
-                bb.clear();
-            }
-            in_qr.external = -1;
-        }
-    } else        // !to_external
+    if (!in_qr.external())
     {
-        log->debug("Swapping in memory:    {} <- {}", to, from);
-        MemoryBuffer& in_bb = current_incoming.map[to].queues[from];
-        if (!iexchange)
-        {
-            in_bb.swap(bb);
-            in_bb.reset();
-        }
-        else
+        in_qr.reset();
+
+        bool to_external = block(lid(to)) == 0;
+        if (to_external)
         {
-            iexchange->not_done(to);
-            in_bb.append_binary(&bb.buffer[0], bb.size());
-            bb.wipe();
+            log->debug("Unloading outgoing directly as incoming: {} <- {}", to, from);
+            if (queue_policy_->unload_incoming(*this, from, to, in_qr.size()))
+                in_qr.unload(storage_);
         }
-        in_qr.size = in_bb.size();
-        in_qr.external = -1;
     }
 
     ++current_incoming.received;
@@ -1055,17 +1100,18 @@ send_same_rank(int from, int to, MemoryBuffer& bb, IExchangeInfo* iexchange)
 
 void
 diy::Master::
-send_different_rank(int from, int to, int proc, MemoryBuffer& bb, bool remote, IExchangeInfo* iexchange)
+send_different_rank(int from, int to, int proc, QueueRecord& qr, bool remote, IExchangeInfo* iex)
 {
     auto scoped = prof.scoped("send-different-rank");
 
+    assert(!qr.external());
+
     static const size_t MAX_MPI_MESSAGE_COUNT = INT_MAX;
 
     // sending to a different rank
-    std::shared_ptr<MemoryBuffer> buffer = std::make_shared<MemoryBuffer>();
-    buffer->swap(bb);
+    std::shared_ptr<MemoryBuffer> buffer = std::make_shared<MemoryBuffer>(qr.move());
 
-    MessageInfo info{from, to, 1, exchange_round_};
+    MessageInfo info{from, to, 1, exchange_round_, static_cast<int>(buffer->nblobs())};
     // size fits in one message
     if (Serialization<MemoryBuffer>::size(*buffer) + Serialization<MessageInfo>::size(info) <= MAX_MPI_MESSAGE_COUNT)
     {
@@ -1075,15 +1121,8 @@ send_different_rank(int from, int to, int proc, MemoryBuffer& bb, bool remote, I
         auto& inflight_send = inflight_sends().back();
 
         inflight_send.info = info;
-        if (remote || iexchange)
-        {
-            if (iexchange)
-            {
-                iexchange->inc_work();
-                log->debug("[{}] Incrementing work when sending queue\n", comm_.rank());
-            }
+        if (remote || iex)
             inflight_send.request = comm_.issend(proc, tags::queue, buffer->buffer);
-        }
         else
             inflight_send.request = comm_.isend(proc, tags::queue, buffer->buffer);
         inflight_send.message = buffer;
@@ -1098,23 +1137,27 @@ send_different_rank(int from, int to, int proc, MemoryBuffer& bb, bool remote, I
         diy::save(*hb, buffer->size());
         diy::save(*hb, info);
 
-        inflight_sends().emplace_back();
-        auto& inflight_send = inflight_sends().back();
-
-        inflight_send.info = info;
-        if (remote || iexchange)
         {
-            // add one unit of work for the entire large message (upon sending the head, not the individual pieces below)
-            if (iexchange)
-            {
-                iexchange->inc_work();
-                log->debug("[{}] Incrementing work when sending the leading piece\n", comm_.rank());
-            }
-            inflight_send.request = comm_.issend(proc, tags::queue, hb->buffer);
+          inflight_sends().emplace_back();
+          auto& inflight_send = inflight_sends().back();
+
+          inflight_send.info = info;
+          if (remote || iex)
+          {
+              // add one unit of work for the entire large message (upon sending the head, not the individual pieces below)
+              if (iex)
+              {
+                  iex->inc_work();
+                  log->debug("[{}] Incrementing work when sending the leading piece\n", comm_.rank());
+              }
+              inflight_send.request = comm_.issend(proc, tags::queue, hb->buffer);
+          }
+          else
+          {
+              inflight_send.request = comm_.isend(proc, tags::queue, hb->buffer);
+          }
+          inflight_send.message = hb;
         }
-        else
-            inflight_send.request = comm_.isend(proc, tags::queue, hb->buffer);
-        inflight_send.message = hb;
 
         // send the message pieces
         size_t msg_buff_idx = 0;
@@ -1128,11 +1171,11 @@ send_different_rank(int from, int to, int proc, MemoryBuffer& bb, bool remote, I
             auto& inflight_send = inflight_sends().back();
 
             inflight_send.info = info;
-            if (remote || iexchange)
+            if (remote || iex)
             {
-                if (iexchange)
+                if (iex)
                 {
-                    iexchange->inc_work();
+                    iex->inc_work();
                     log->debug("[{}] Incrementing work when sending non-leading piece\n", comm_.rank());
                 }
                 inflight_send.request = comm_.issend(proc, tags::queue, window);
@@ -1142,11 +1185,33 @@ send_different_rank(int from, int to, int proc, MemoryBuffer& bb, bool remote, I
             inflight_send.message = buffer;
         }
     }   // large message broken into pieces
+
+    // send binary blobs
+    for (size_t i = 0; i < buffer->nblobs(); ++i)
+    {
+        auto blob = buffer->load_binary_blob();
+        assert(blob.size < MAX_MPI_MESSAGE_COUNT);      // for now assume blobs are small enough that we don't need to break them into multiple parts
+
+        inflight_sends().emplace_back();
+        auto& inflight_send = inflight_sends().back();
+
+        inflight_send.info = info;
+
+        detail::VectorWindow<char> window;
+        window.begin = const_cast<char*>(blob.pointer.get());
+        window.count = blob.size;
+
+        if (remote || iex)
+            inflight_send.request = comm_.issend(proc, tags::queue, window);
+        else
+            inflight_send.request = comm_.isend(proc, tags::queue, window);
+        inflight_send.blob = std::move(blob);
+    }
 }
 
 void
 diy::Master::
-check_incoming_queues(IExchangeInfo* iexchange)
+check_incoming_queues(MemoryManagement mem, IExchangeInfo* iex)
 {
     auto scoped = prof.scoped("check-incoming-queues");
     DIY_UNUSED(scoped);
@@ -1155,12 +1220,13 @@ check_incoming_queues(IExchangeInfo* iexchange)
     while (ostatus)
     {
         InFlightRecv& ir = inflight_recv(ostatus->source());
+        ir.mem = mem;
 
-        if (iexchange)
-            iexchange->inc_work();                      // increment work before sender's issend request can complete (so we are now responsible for the queue)
+        if (iex)
+            iex->inc_work();                      // increment work before sender's issend request can complete (so we are now responsible for the queue)
         bool first_message = ir.recv(comm_, *ostatus);  // possibly partial recv, in case of a multi-piece message
-        if (!first_message && iexchange)
-            iexchange->dec_work();
+        if (!first_message && iex)
+            iex->dec_work();
 
         if (ir.done)                // all pieces assembled
         {
@@ -1170,7 +1236,7 @@ check_incoming_queues(IExchangeInfo* iexchange)
             bool unload = ((ir.info.round == exchange_round_) ? (block(lid(ir.info.to)) == 0) : (limit_ != -1))
                           && queue_policy_->unload_incoming(*this, ir.info.from, ir.info.to, ir.message.size());
 
-            ir.place(in, unload, storage_, iexchange);
+            ir.place(in, unload, storage_, iex);
             ir.reset();
         }
 
@@ -1180,7 +1246,7 @@ check_incoming_queues(IExchangeInfo* iexchange)
 
 void
 diy::Master::
-flush(bool remote)
+flush(bool remote, MemoryManagement mem)
 {
 #ifdef DEBUG
   time_type start = get_time();
@@ -1194,13 +1260,13 @@ flush(bool remote)
 
 
   if (remote)
-      rcomm_exchange();
+      rcomm_exchange(mem);
   else
   {
       auto gid_order = order_gids();
       do
       {
-          comm_exchange(gid_order);
+          comm_exchange(gid_order, mem);
 
 #ifdef DEBUG
           time_type cur = get_time();
@@ -1217,14 +1283,13 @@ flush(bool remote)
   outgoing_.clear();
 
   log->debug("Done in flush");
-  //show_incoming_records();
 
   process_collectives();
 }
 
 bool
 diy::Master::
-nudge(IExchangeInfo* iexchange)
+nudge(IExchangeInfo* iex)
 {
   bool success = false;
   for (InFlightSendsList::iterator it = inflight_sends().begin(); it != inflight_sends().end();)
@@ -1234,10 +1299,10 @@ nudge(IExchangeInfo* iexchange)
     {
       success = true;
       it = inflight_sends().erase(it);
-      if (iexchange)
+      if (iex)
       {
-          log->debug("[{}] message left, decrementing work", iexchange->comm.rank());
-          iexchange->dec_work();                // this message is receiver's responsibility now
+          log->debug("[{}] message left, decrementing work", iex->comm.rank());
+          iex->dec_work();                // this message is receiver's responsibility now
       }
     }
     else
@@ -1248,33 +1313,4 @@ nudge(IExchangeInfo* iexchange)
   return success;
 }
 
-void
-diy::Master::
-show_incoming_records() const
-{
-  for (IncomingRoundMap::const_iterator rounds_itr = incoming_.begin(); rounds_itr != incoming_.end(); ++rounds_itr)
-  {
-    for (IncomingQueuesMap::const_iterator it = rounds_itr->second.map.begin(); it != rounds_itr->second.map.end(); ++it)
-    {
-      const IncomingQueuesRecords& in_qrs = it->second;
-      for (InQueueRecords::const_iterator cur = in_qrs.records.begin(); cur != in_qrs.records.end(); ++cur)
-      {
-        const QueueRecord& qr = cur->second;
-        log->info("round: {}, {} <- {}: (size,external) = ({},{})",
-                  rounds_itr->first,
-                  it->first, cur->first,
-                  qr.size,
-                  qr.external);
-      }
-      for (IncomingQueues::const_iterator cur = in_qrs.queues.begin(); cur != in_qrs.queues.end(); ++cur)
-      {
-        log->info("round: {}, {} <- {}: queue.size() = {}",
-                  rounds_itr->first,
-                  it->first, cur->first,
-                  const_cast<IncomingQueuesRecords&>(in_qrs).queues[cur->first].size());
-      }
-    }
-  }
-}
-
 #endif
diff --git a/include/vtkdiy2/mpi.hpp b/include/vtkdiy2/mpi.hpp
index a1f19908f98..cac3d47d527 100644
--- a/include/vtkdiy2/mpi.hpp
+++ b/include/vtkdiy2/mpi.hpp
@@ -1,14 +1,9 @@
 #ifndef DIY_MPI_HPP
 #define DIY_MPI_HPP
 
-#ifndef DIY_NO_MPI
-#include <vtk_mpi.h>
-#else
-#include "mpi/no-mpi.hpp"
-#endif
-
-#include "mpi/constants.hpp"
+#include "mpi/config.hpp"
 #include "mpi/datatypes.hpp"
+#include "mpi/environment.hpp"
 #include "mpi/optional.hpp"
 #include "mpi/status.hpp"
 #include "mpi/request.hpp"
@@ -18,54 +13,4 @@
 #include "mpi/io.hpp"
 #include "mpi/window.hpp"
 
-namespace diy
-{
-namespace mpi
-{
-
-//! \ingroup MPI
-struct environment
-{
-  inline environment(int threading = MPI_THREAD_FUNNELED);
-  inline environment(int argc, char* argv[], int threading = MPI_THREAD_FUNNELED);
-  inline ~environment();
-
-  int   threading() const           { return provided_threading; }
-
-  int   provided_threading;
-};
-
-}
-}
-
-diy::mpi::environment::
-environment(int threading)
-{
-#ifndef DIY_NO_MPI
-  int argc = 0; char** argv;
-  MPI_Init_thread(&argc, &argv, threading, &provided_threading);
-#else
-  provided_threading = threading;
-#endif
-}
-
-diy::mpi::environment::
-environment(int argc, char* argv[], int threading)
-{
-#ifndef DIY_NO_MPI
-  MPI_Init_thread(&argc, &argv, threading, &provided_threading);
-#else
-  (void) argc; (void) argv;
-  provided_threading = threading;
-#endif
-}
-
-diy::mpi::environment::
-~environment()
-{
-#ifndef DIY_NO_MPI
-  MPI_Finalize();
-#endif
-}
-
-#endif
+#endif // DIY_MPI_HPP
diff --git a/include/vtkdiy2/mpi/collectives.cpp b/include/vtkdiy2/mpi/collectives.cpp
new file mode 100644
index 00000000000..1de4be4d197
--- /dev/null
+++ b/include/vtkdiy2/mpi/collectives.cpp
@@ -0,0 +1,161 @@
+#ifdef DIY_MPI_AS_LIB
+#include "collectives.hpp"
+#endif
+
+namespace diy
+{
+namespace mpi
+{
+namespace detail
+{
+
+inline void copy_buffer(const void* src, void* dst, size_t size, int count)
+{
+  if (src != dst)
+  {
+    std::copy_n(static_cast<const int8_t*>(src),
+                size * static_cast<size_t>(count),
+                static_cast<int8_t*>(dst));
+  }
+}
+
+void broadcast(const communicator& comm, void* data, int count, const datatype& type, int root)
+{
+#if DIY_HAS_MPI
+  MPI_Bcast(data, count, mpi_cast(type.handle), root, mpi_cast(comm.handle()));
+#else
+  (void) comm; (void) data; (void) count; (void) type; (void) root;
+#endif
+}
+
+request ibroadcast(const communicator& comm, void* data, int count, const datatype& type, int root)
+{
+  request r;
+#if DIY_HAS_MPI
+  MPI_Ibcast(data, count, mpi_cast(type.handle), root, mpi_cast(comm.handle()), &mpi_cast(r.handle));
+#else
+  (void) comm; (void) data; (void) count; (void) type; (void) root;
+#endif
+  return r;
+}
+
+void gather(const communicator& comm,
+            const void* dataIn, int count, const datatype& type, void* dataOut,
+            int root)
+{
+#if DIY_HAS_MPI
+  MPI_Gather(dataIn, count, mpi_cast(type.handle),
+             dataOut, count, mpi_cast(type.handle),
+             root, mpi_cast(comm.handle()));
+#else
+  copy_buffer(dataIn, dataOut, mpi_cast(type.handle), count);
+  (void)comm; (void)root;
+#endif
+}
+
+void gather_v(const communicator& comm,
+              const void* dataIn, int countIn, const datatype& type,
+              void* dataOut, const int counts[], const int offsets[],
+              int root)
+{
+#if DIY_HAS_MPI
+  MPI_Gatherv(dataIn, countIn, mpi_cast(type.handle),
+              dataOut, counts, offsets, mpi_cast(type.handle),
+              root, mpi_cast(comm.handle()));
+#else
+  copy_buffer(dataIn, dataOut, mpi_cast(type.handle), countIn);
+  (void)comm; (void)counts, (void)offsets, (void)root;
+#endif
+}
+
+void all_gather(const communicator& comm,
+                const void* dataIn, int count, const datatype& type, void* dataOut)
+{
+#if DIY_HAS_MPI
+  MPI_Allgather(dataIn, count, mpi_cast(type.handle),
+                dataOut, count, mpi_cast(type.handle),
+                mpi_cast(comm.handle()));
+#else
+  copy_buffer(dataIn, dataOut, mpi_cast(type.handle), count);
+  (void)comm;
+#endif
+}
+
+void all_gather_v(const communicator& comm,
+                  const void* dataIn, int countIn, const datatype& type,
+                  void* dataOut, const int counts[], const int offsets[])
+{
+#if DIY_HAS_MPI
+  MPI_Allgatherv(dataIn, countIn, mpi_cast(type.handle),
+                 dataOut, counts, offsets, mpi_cast(type.handle),
+                 mpi_cast(comm.handle()));
+#else
+  copy_buffer(dataIn, dataOut, mpi_cast(type.handle), countIn);
+  (void)comm; (void)counts; (void)offsets;
+#endif
+}
+
+void reduce(const communicator& comm,
+            const void* dataIn, int count, const datatype& type, void* dataOut,
+            const operation& op, int root)
+{
+#if DIY_HAS_MPI
+  MPI_Reduce(dataIn, dataOut, count, mpi_cast(type.handle), mpi_cast(op.handle), root, mpi_cast(comm.handle()));
+#else
+  copy_buffer(dataIn, dataOut, mpi_cast(type.handle), count);
+  (void)comm; (void)op; (void)root;
+#endif
+}
+
+void all_reduce(const communicator& comm,
+                const void* dataIn, void* dataOut, int count, const datatype& type,
+                const operation& op)
+{
+#if DIY_HAS_MPI
+  MPI_Allreduce(dataIn, dataOut, count, mpi_cast(type.handle), mpi_cast(op.handle), mpi_cast(comm.handle()));
+#else
+  copy_buffer(dataIn, dataOut, mpi_cast(type.handle), count);
+  (void)comm; (void)op;
+#endif
+}
+
+request iall_reduce(const communicator& comm,
+                    const void* dataIn, void* dataOut, int count, const datatype& type,
+                    const operation& op)
+{
+  request r;
+#if DIY_HAS_MPI
+  MPI_Iallreduce(dataIn, dataOut, count, mpi_cast(type.handle), mpi_cast(op.handle), mpi_cast(comm.handle()), &mpi_cast(r.handle));
+#else
+  copy_buffer(dataIn, dataOut, mpi_cast(type.handle), count);
+  (void)comm; (void)op;
+#endif
+  return r;
+}
+
+void scan(const communicator& comm,
+          const void* dataIn, void* dataOut, int count, const datatype& type,
+          const operation& op)
+{
+#if DIY_HAS_MPI
+  MPI_Scan(dataIn, dataOut, count, mpi_cast(type.handle), mpi_cast(op.handle), mpi_cast(comm.handle()));
+#else
+  copy_buffer(dataIn, dataOut, mpi_cast(type.handle), count);
+  (void)comm; (void)op;
+#endif
+}
+
+void all_to_all(const communicator& comm,
+                const void* dataIn, int count, const datatype& type, void* dataOut)
+{
+#if DIY_HAS_MPI
+  MPI_Alltoall(dataIn, count, mpi_cast(type.handle), dataOut, count, mpi_cast(type.handle), mpi_cast(comm.handle()));
+#else
+  copy_buffer(dataIn, dataOut, mpi_cast(type.handle), count);
+  (void)comm;
+#endif
+}
+
+}
+}
+} // diy::mpi::detail
diff --git a/include/vtkdiy2/mpi/collectives.hpp b/include/vtkdiy2/mpi/collectives.hpp
index 226a65f894c..5f1490c80c0 100644
--- a/include/vtkdiy2/mpi/collectives.hpp
+++ b/include/vtkdiy2/mpi/collectives.hpp
@@ -1,12 +1,80 @@
-#include <vector>
+#ifndef DIY_MPI_COLLECTIVES_HPP
+#define DIY_MPI_COLLECTIVES_HPP
 
-#include "../constants.h" // for DIY_UNUSED.
+#include "config.hpp"
+#include "communicator.hpp"
+#include "datatypes.hpp"
 #include "operations.hpp"
+#include "request.hpp"
+
+#include <algorithm>
+#include <vector>
+#include <numeric>
 
 namespace diy
 {
 namespace mpi
 {
+
+namespace detail
+{
+
+DIY_MPI_EXPORT_FUNCTION
+void broadcast(const communicator& comm,
+               void* data, int count, const datatype& type,
+               int root);
+
+DIY_MPI_EXPORT_FUNCTION
+request ibroadcast(const communicator& comm,
+                   void* data, int count, const datatype& type,
+                   int root);
+
+DIY_MPI_EXPORT_FUNCTION
+void gather(const communicator& comm,
+            const void* dataIn, int count, const datatype& type, void* dataOut,
+            int root);
+
+DIY_MPI_EXPORT_FUNCTION
+void gather_v(const communicator& comm,
+              const void* dataIn, int countIn, const datatype& type,
+              void* dataOut, const int counts[], const int offsets[],
+              int root);
+
+DIY_MPI_EXPORT_FUNCTION
+void all_gather(const communicator& comm,
+                const void* dataIn, int count, const datatype& type, void* dataOut);
+
+DIY_MPI_EXPORT_FUNCTION
+void all_gather_v(const communicator& comm,
+                  const void* dataIn, int countIn, const datatype& type,
+                  void* dataOut, const int counts[], const int offsets[]);
+
+DIY_MPI_EXPORT_FUNCTION
+void reduce(const communicator& comm,
+            const void* dataIn, int count, const datatype& type, void* dataOut,
+            const operation& op, int root);
+
+DIY_MPI_EXPORT_FUNCTION
+void all_reduce(const communicator& comm,
+                const void* dataIn, void* dataOut, int count, const datatype& type,
+                const operation& op);
+
+DIY_MPI_EXPORT_FUNCTION
+request iall_reduce(const communicator& comm,
+                    const void* dataIn, void* dataOut, int count, const datatype& type,
+                    const operation& op);
+
+DIY_MPI_EXPORT_FUNCTION
+void scan(const communicator& comm,
+          const void* dataIn, void* dataOut, int count, const datatype& type,
+          const operation& op);
+
+DIY_MPI_EXPORT_FUNCTION
+void all_to_all(const communicator& comm,
+                const void* dataIn, int count, const datatype& type, void* dataOut);
+
+} // detail
+
   //!\addtogroup MPI
   //!@{
 
@@ -15,288 +83,169 @@ namespace mpi
   {
     static void broadcast(const communicator& comm, T& x, int root)
     {
-#ifndef DIY_NO_MPI
-      MPI_Bcast(address(x), count(x), datatype(x), root, comm);
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(x);
-      DIY_UNUSED(root);
-#endif
+      detail::broadcast(comm, address(x), count(x), datatype_of(x), root);
     }
 
     static void broadcast(const communicator& comm, std::vector<T>& x, int root)
     {
-#ifndef DIY_NO_MPI
       size_t sz = x.size();
-      Collectives<size_t, void*>::broadcast(comm, sz, root);
+      detail::broadcast(comm, &sz, 1, datatype_of(sz), root);
 
       if (comm.rank() != root)
           x.resize(sz);
 
-      MPI_Bcast(address(x), count(x), datatype(x), root, comm);
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(x);
-      DIY_UNUSED(root);
-#endif
+      detail::broadcast(comm, address(x), count(x), datatype_of(x), root);
     }
 
     static request ibroadcast(const communicator& comm, T& x, int root)
     {
-#ifndef DIY_NO_MPI
-      request r;
-      MPI_Ibcast(address(x), count(x), datatype(x), root, comm, &r.r);
-      return r;
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(x);
-      DIY_UNUSED(root);
-      DIY_UNSUPPORTED_MPI_CALL(MPI_Ibcast);
-#endif
+      return detail::ibroadcast(comm, address(x), count(x), datatype_of(x), root);
     }
 
     static void gather(const communicator& comm, const T& in, std::vector<T>& out, int root)
     {
       out.resize(comm.size());
-#ifndef DIY_NO_MPI
-      MPI_Gather(address(in), count(in), datatype(in), address(out), count(in), datatype(out), root, comm);
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(root);
-      out[0] = in;
-#endif
+      detail::gather(comm, address(in), count(in), datatype_of(in), address(out), root);
     }
 
     static void gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out, int root)
     {
-#ifndef DIY_NO_MPI
-      std::vector<int>  counts(comm.size());
+      std::vector<int> counts;
+      if (comm.rank() == root)
+      {
+        counts.resize(static_cast<size_t>(comm.size()));
+      }
+
       Collectives<int,void*>::gather(comm, count(in), counts, root);
 
-      std::vector<int>  offsets(comm.size(), 0);
-      for (unsigned i = 1; i < offsets.size(); ++i)
-        offsets[i] = offsets[i-1] + counts[i-1];
+      std::vector<int> offsets;
+      if (comm.rank() == root)
+      {
+        offsets.resize(counts.size());
+        offsets[0] = 0;
+        std::partial_sum(counts.begin(), counts.end() - 1, offsets.begin() + 1);
+      }
 
       int elem_size = count(in[0]);     // size of 1 vector element in units of mpi datatype
-      std::vector<T> buffer((offsets.back() + counts.back()) / elem_size);
-      MPI_Gatherv(address(in), count(in), datatype(in),
-                  address(buffer),
-                  &counts[0],
-                  &offsets[0],
-                  datatype(buffer),
-                  root, comm);
+      std::vector<T> buffer;
+      if (comm.rank() == root)
+      {
+        buffer.resize((offsets.back() + counts.back()) / elem_size);
+      }
 
-      out.resize(comm.size());
-      size_t cur = 0;
-      for (unsigned i = 0; i < (unsigned)comm.size(); ++i)
+      detail::gather_v(comm, address(in), count(in), datatype_of(in),
+                       address(buffer), counts.data(), offsets.data(),
+                       root);
+
+      if (comm.rank() == root)
       {
-          out[i].reserve(counts[i] / elem_size);
-          for (unsigned j = 0; j < (unsigned)(counts[i] / elem_size); ++j)
-              out[i].push_back(buffer[cur++]);
+          out.resize(static_cast<size_t>(comm.size()));
+          size_t offset = 0;
+          for (size_t i = 0; i < out.size(); ++i)
+          {
+            auto count = static_cast<size_t>(counts[i] / elem_size);
+            out[i].insert(out[i].end(), buffer.data() + offset, buffer.data() + offset + count);
+            offset += count;
+          }
       }
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(root);
-      out.resize(1);
-      out[0] = in;
-#endif
     }
 
     static void gather(const communicator& comm, const T& in, int root)
     {
-#ifndef DIY_NO_MPI
-      MPI_Gather(address(in), count(in), datatype(in), address(in), count(in), datatype(in), root, comm);
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(in);
-      DIY_UNUSED(root);
-      DIY_UNSUPPORTED_MPI_CALL("MPI_Gather");
-#endif
+      detail::gather(comm, address(in), count(in), datatype_of(in), address(in), root);
     }
 
     static void gather(const communicator& comm, const std::vector<T>& in, int root)
     {
-#ifndef DIY_NO_MPI
       Collectives<int,void*>::gather(comm, count(in), root);
-
-      MPI_Gatherv(address(in), count(in), datatype(in),
-                  0, 0, 0,
-                  datatype(in),
-                  root, comm);
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(in);
-      DIY_UNUSED(root);
-      DIY_UNSUPPORTED_MPI_CALL("MPI_Gatherv");
-#endif
+      detail::gather_v(comm, address(in), count(in), datatype_of(in), 0, 0, 0, root);
     }
 
     static void all_gather(const communicator& comm, const T& in, std::vector<T>& out)
     {
       out.resize(comm.size());
-#ifndef DIY_NO_MPI
-      MPI_Allgather(address(in), count(in), datatype(in),
-                    address(out), count(in), datatype(in),
-                    comm);
-#else
-      DIY_UNUSED(comm);
-      out[0] = in;
-#endif
+      detail::all_gather(comm, address(in), count(in), datatype_of(in), address(out));
     }
 
     static void all_gather(const communicator& comm, const std::vector<T>& in, std::vector< std::vector<T> >& out)
     {
-#ifndef DIY_NO_MPI
-      std::vector<int>  counts(comm.size());
+      std::vector<int>  counts(static_cast<size_t>(comm.size()));
       Collectives<int,void*>::all_gather(comm, count(in), counts);
 
-      std::vector<int>  offsets(comm.size(), 0);
-      for (unsigned i = 1; i < offsets.size(); ++i)
-        offsets[i] = offsets[i-1] + counts[i-1];
+      std::vector<int>  offsets(counts.size());
+      offsets[0] = 0;
+      std::partial_sum(counts.begin(), counts.end() - 1, offsets.begin() + 1);
 
       int elem_size = count(in[0]);     // size of 1 vector element in units of mpi datatype
       std::vector<T> buffer((offsets.back() + counts.back()) / elem_size);
-      MPI_Allgatherv(address(in), count(in), datatype(in),
-                     address(buffer),
-                     &counts[0],
-                     &offsets[0],
-                     datatype(buffer),
-                     comm);
-
-      out.resize(comm.size());
-      size_t cur = 0;
-      for (int i = 0; i < comm.size(); ++i)
+      detail::all_gather_v(comm,
+                           address(in), count(in), datatype_of(in),
+                           address(buffer),
+                           &counts[0],
+                           &offsets[0]);
+
+      out.resize(static_cast<size_t>(comm.size()));
+      size_t offset = 0;
+      for (size_t i = 0; i < out.size(); ++i)
       {
-          out[i].reserve(counts[i] / elem_size);
-          for (int j = 0; j < (int)(counts[i] / elem_size); ++j)
-              out[i].push_back(buffer[cur++]);
+          auto count = static_cast<size_t>(counts[i] / elem_size);
+          out[i].insert(out[i].end(), buffer.data() + offset, buffer.data() + offset + count);
+          offset += count;
       }
-#else
-      DIY_UNUSED(comm);
-      out.resize(1);
-      out[0] = in;
-#endif
     }
 
     static void reduce(const communicator& comm, const T& in, T& out, int root, const Op&)
     {
-#ifndef DIY_NO_MPI
-      MPI_Reduce(address(in), address(out), count(in), datatype(in),
-                 detail::mpi_op<Op>::get(),
-                 root, comm);
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(root);
-      out = in;
-#endif
+      auto op = detail::mpi_op<Op>::get();
+      detail::reduce(comm, address(in), count(in), datatype_of(in), address(out), op, root);
     }
 
     static void reduce(const communicator& comm, const T& in, int root, const Op&)
     {
-#ifndef DIY_NO_MPI
-      MPI_Reduce(address(in), address(in), count(in), datatype(in),
-                 detail::mpi_op<Op>::get(),
-                 root, comm);
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(in);
-      DIY_UNUSED(root);
-      DIY_UNSUPPORTED_MPI_CALL("MPI_Reduce");
-#endif
+      auto op = detail::mpi_op<Op>::get();
+      detail::reduce(comm, address(in), count(in), datatype_of(in), address(in), op, root);
     }
 
     static void all_reduce(const communicator& comm, const T& in, T& out, const Op&)
     {
-#ifndef DIY_NO_MPI
-      MPI_Allreduce(address(in), address(out), count(in), datatype(in),
-                    detail::mpi_op<Op>::get(),
-                    comm);
-#else
-      DIY_UNUSED(comm);
-      out = in;
-#endif
+      auto op = detail::mpi_op<Op>::get();
+      detail::all_reduce(comm, address(in), address(out), count(in), datatype_of(in), op);
     }
 
     static void all_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op&)
     {
-#ifndef DIY_NO_MPI
+      auto op = detail::mpi_op<Op>::get();
       out.resize(in.size());
-      MPI_Allreduce(address(in), address(out), count(in),
-                    datatype(in),
-                    detail::mpi_op<Op>::get(),
-                    comm);
-#else
-      DIY_UNUSED(comm);
-      out = in;
-#endif
+      detail::all_reduce(comm, address(in), address(out), count(in), datatype_of(in), op);
     }
 
     static request iall_reduce(const communicator& comm, const T& in, T& out, const Op&)
     {
-#ifndef DIY_NO_MPI
-      request r;
-      MPI_Iallreduce(address(in), address(out), count(in), datatype(in),
-                     detail::mpi_op<Op>::get(),
-                     comm, &r.r);
-      return r;
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(in);
-      DIY_UNUSED(out);
-      DIY_UNSUPPORTED_MPI_CALL(MPI_Iallreduce);
-#endif
+      auto op = detail::mpi_op<Op>::get();
+      return detail::iall_reduce(comm, address(in), address(out), count(in), datatype_of(in), op);
     }
 
     static request iall_reduce(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, const Op&)
     {
-#ifndef DIY_NO_MPI
-      request r;
+      auto op = detail::mpi_op<Op>::get();
       out.resize(in.size());
-      MPI_Iallreduce(address(in), address(out), count(in),
-                     datatype(in),
-                     detail::mpi_op<Op>::get(),
-                     comm, &r.r);
-      return r;
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(in);
-      DIY_UNUSED(out);
-      DIY_UNSUPPORTED_MPI_CALL(MPI_Iallreduce);
-#endif
+      return detail::iall_reduce(comm, address(in), address(out), count(in), datatype_of(in), op);
     }
 
     static void scan(const communicator& comm, const T& in, T& out, const Op&)
     {
-#ifndef DIY_NO_MPI
-      MPI_Scan(address(in), address(out), count(in), datatype(in),
-               detail::mpi_op<Op>::get(),
-               comm);
-#else
-      DIY_UNUSED(comm);
-      out = in;
-#endif
+      auto op = detail::mpi_op<Op>::get();
+      detail::scan(comm, address(in), address(out), count(in), datatype_of(in), op);
     }
 
     static void all_to_all(const communicator& comm, const std::vector<T>& in, std::vector<T>& out, int n = 1)
     {
-#ifndef DIY_NO_MPI
       // n specifies how many elements go to/from every process from every process;
       // the sizes of in and out are expected to be n * comm.size()
 
       int elem_size = count(in[0]);               // size of 1 vector element in units of mpi datatype
       // NB: this will fail if T is a vector
-      MPI_Alltoall(address(in),
-                   elem_size * n,
-                   datatype(in),
-                   address(out),
-                   elem_size * n,
-                   datatype(out),
-                   comm);
-#else
-      DIY_UNUSED(comm);
-      DIY_UNUSED(n);
-      out = in;
-#endif
+      detail::all_to_all(comm, address(in), elem_size * n, datatype_of(in), address(out));
     }
   };
 
@@ -449,3 +398,9 @@ namespace mpi
   //!@}
 }
 }
+
+#ifndef DIY_MPI_AS_LIB
+#include "collectives.cpp"
+#endif
+
+#endif // DIY_MPI_COLLECTIVES_HPP
diff --git a/include/vtkdiy2/mpi/communicator.cpp b/include/vtkdiy2/mpi/communicator.cpp
new file mode 100644
index 00000000000..a3df545fdda
--- /dev/null
+++ b/include/vtkdiy2/mpi/communicator.cpp
@@ -0,0 +1,130 @@
+#ifdef DIY_MPI_AS_LIB
+#include "communicator.hpp"
+#endif
+
+diy::mpi::communicator::communicator()
+  : comm_(make_DIY_MPI_Comm(MPI_COMM_WORLD)), rank_(0), size_(1), owner_(false)
+{
+#if DIY_HAS_MPI
+  MPI_Comm_rank(mpi_cast(comm_), &rank_);
+  MPI_Comm_size(mpi_cast(comm_), &size_);
+#endif
+}
+
+diy::mpi::communicator::
+communicator(DIY_MPI_Comm comm, bool owner):
+    comm_(comm), rank_(0), size_(1), owner_(owner)
+{
+#if DIY_HAS_MPI
+  if (mpi_cast(comm_) != MPI_COMM_NULL)
+  {
+    MPI_Comm_rank(mpi_cast(comm_), &rank_);
+    MPI_Comm_size(mpi_cast(comm_), &size_);
+  }
+#endif
+}
+
+#ifndef DIY_MPI_AS_LIB // only available in header-only mode
+diy::mpi::communicator::
+communicator(MPI_Comm comm, bool owner):
+    comm_(comm), rank_(0), size_(1), owner_(owner)
+{
+#if DIY_HAS_MPI
+  if (comm_ != MPI_COMM_NULL)
+  {
+    MPI_Comm_rank(comm_, &rank_);
+    MPI_Comm_size(comm_, &size_);
+  }
+#endif
+}
+#endif
+
+void
+diy::mpi::communicator::
+destroy()
+{
+#if DIY_HAS_MPI
+    if (owner_)
+        MPI_Comm_free(&mpi_cast(comm_));
+#endif
+}
+
+diy::mpi::status
+diy::mpi::communicator::
+probe(int source, int tag) const
+{
+#if DIY_HAS_MPI
+  status s;
+  MPI_Probe(source, tag, mpi_cast(comm_), &mpi_cast(s.handle));
+  return s;
+#else
+  (void) source; (void) tag;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_Probe);
+#endif
+}
+
+diy::mpi::optional<diy::mpi::status>
+diy::mpi::communicator::
+iprobe(int source, int tag) const
+{
+  (void) source; (void) tag;
+#if DIY_HAS_MPI
+  status s;
+  int flag;
+  MPI_Iprobe(source, tag, mpi_cast(comm_), &flag, &mpi_cast(s.handle));
+  if (flag)
+    return s;
+#endif
+  return optional<status>();
+}
+
+void
+diy::mpi::communicator::
+barrier() const
+{
+#if DIY_HAS_MPI
+  MPI_Barrier(mpi_cast(comm_));
+#endif
+}
+
+diy::mpi::communicator
+diy::mpi::communicator::
+split(int color, int key) const
+{
+#if DIY_HAS_MPI
+    DIY_MPI_Comm newcomm;
+    MPI_Comm_split(mpi_cast(comm_), color, key, &mpi_cast(newcomm));
+    return communicator(newcomm, true);
+#else
+    (void) color; (void) key;
+    return communicator();
+#endif
+}
+
+diy::mpi::request
+diy::mpi::communicator::
+ibarrier() const
+{
+#if DIY_HAS_MPI
+    request r;
+    MPI_Ibarrier(mpi_cast(comm_), &mpi_cast(r.handle));
+    return r;
+#else
+    // this is not the ideal fix; in principle we should just return a status
+    // that tests true, but this requires redesigning some parts of our no-mpi
+    // handling
+    DIY_UNSUPPORTED_MPI_CALL(MPI_Ibarrier);
+#endif
+}
+
+void
+diy::mpi::communicator::
+duplicate(const communicator& other)
+{
+#if DIY_HAS_MPI
+    DIY_MPI_Comm newcomm;
+    MPI_Comm_dup(mpi_cast(other.comm_), &mpi_cast(newcomm));
+    (*this) = communicator(newcomm,true);
+#endif
+    (void) other;
+}
diff --git a/include/vtkdiy2/mpi/communicator.hpp b/include/vtkdiy2/mpi/communicator.hpp
index 4b29c4aca19..a4facaa149f 100644
--- a/include/vtkdiy2/mpi/communicator.hpp
+++ b/include/vtkdiy2/mpi/communicator.hpp
@@ -1,3 +1,12 @@
+#ifndef DIY_MPI_COMMUNICATOR_HPP
+#define DIY_MPI_COMMUNICATOR_HPP
+
+#include "config.hpp"
+#include "optional.hpp"
+#include "point-to-point.hpp"
+#include "request.hpp"
+#include "status.hpp"
+
 namespace diy
 {
 namespace mpi
@@ -8,8 +17,14 @@ namespace mpi
   class communicator
   {
     public:
-                inline
-                communicator(MPI_Comm comm = MPI_COMM_WORLD, bool owner = false);
+                DIY_MPI_EXPORT_FUNCTION
+                communicator();
+
+                communicator(DIY_MPI_Comm comm):
+                  communicator(comm, false) {}
+
+                DIY_MPI_EXPORT_FUNCTION
+                communicator(DIY_MPI_Comm comm, bool owner);
 
                 ~communicator()                     { destroy(); }
 
@@ -25,9 +40,19 @@ namespace mpi
                     size_(other.size_),
                     owner_(other.owner_)                    { other.owner_ = false; }
 
-    communicator&
+#ifndef DIY_MPI_AS_LIB // only available in header-only mode
+                communicator(MPI_Comm comm):
+                  communicator(comm, false) {}
+
+                DIY_MPI_EXPORT_FUNCTION
+                communicator(MPI_Comm comm, bool owner);
+
+                operator MPI_Comm() { return comm_; }
+#endif
+
+      communicator&
                 operator=(const communicator& other)        { destroy(); comm_ = other.comm_; rank_ = other.rank_; size_ = other.size_; owner_ = false; return *this; }
-    communicator&
+      communicator&
                 operator=(communicator&& other)             { destroy(); comm_ = other.comm_; rank_ = other.rank_; size_ = other.size_; owner_ = other.owner_; other.owner_ = false; return *this; }
 
       int       rank() const                        { return rank_; }
@@ -35,169 +60,74 @@ namespace mpi
 
       //! Send `x` to processor `dest` using `tag` (blocking).
       template<class T>
-      void      send(int dest, int tag, const T& x) const   { detail::send<T>()(comm_, dest, tag, x); }
+      void      send(int dest, int tag, const T& x) const   { detail::send(comm_, dest, tag, x); }
+
+      template<class T>
+      void      ssend(int dest, int tag, const T& x) const   { detail::ssend(comm_, dest, tag, x); }
 
       //! Receive `x` from `dest` using `tag` (blocking).
       //! If `T` is an `std::vector<...>`, `recv` will resize it to fit exactly the sent number of values.
       template<class T>
-      status    recv(int source, int tag, T& x) const       { return detail::recv<T>()(comm_, source, tag, x); }
+      status    recv(int source, int tag, T& x) const       { return detail::recv(comm_, source, tag, x); }
 
       //! Non-blocking version of `send()`.
       template<class T>
-      request   isend(int dest, int tag, const T& x) const  { return detail::isend<T>()(comm_, dest, tag, x); }
+      request   isend(int dest, int tag, const T& x) const  { return detail::isend(comm_, dest, tag, x); }
 
       //! Non-blocking version of `ssend()`.
       template<class T>
-      request   issend(int dest, int tag, const T& x) const  { return detail::issend<T>()(comm_, dest, tag, x); }
+      request   issend(int dest, int tag, const T& x) const  { return detail::issend(comm_, dest, tag, x); }
 
       //! Non-blocking version of `recv()`.
       //! If `T` is an `std::vector<...>`, its size must be big enough to accommodate the sent values.
       template<class T>
-      request   irecv(int source, int tag, T& x) const      { return detail::irecv<T>()(comm_, source, tag, x); }
+      request   irecv(int source, int tag, T& x) const      { return detail::irecv(comm_, source, tag, x); }
 
       //! probe
-      inline
+      DIY_MPI_EXPORT_FUNCTION
       status    probe(int source, int tag) const;
 
       //! iprobe
-      inline
+      DIY_MPI_EXPORT_FUNCTION
       optional<status>
                 iprobe(int source, int tag) const;
 
       //! barrier
-      inline
+      DIY_MPI_EXPORT_FUNCTION
       void      barrier() const;
 
       //! Nonblocking version of barrier
-      inline
+      DIY_MPI_EXPORT_FUNCTION
       request   ibarrier() const;
 
-                operator MPI_Comm() const                   { return comm_; }
-
       //! split
       //! When keys are the same, the ties are broken by the rank in the original comm.
-      inline
+      DIY_MPI_EXPORT_FUNCTION
       communicator
                 split(int color, int key = 0) const;
 
       //! duplicate
-      inline
+      DIY_MPI_EXPORT_FUNCTION
       void      duplicate(const communicator& other);
 
+      DIY_MPI_Comm handle() const { return comm_; }
+
     private:
-      inline
+      DIY_MPI_EXPORT_FUNCTION
       void      destroy();
 
     private:
-      MPI_Comm  comm_;
-      int       rank_;
-      int       size_;
-      bool      owner_;
+      DIY_MPI_Comm  comm_;
+      int           rank_;
+      int           size_;
+      bool          owner_;
   };
-}
-}
 
-diy::mpi::communicator::
-communicator(MPI_Comm comm, bool owner):
-    comm_(comm), rank_(0), size_(1), owner_(owner)
-{
-#ifndef DIY_NO_MPI
-  if (comm != MPI_COMM_NULL)
-  {
-    MPI_Comm_rank(comm_, &rank_);
-    MPI_Comm_size(comm_, &size_);
-  }
-#endif
 }
+} // diy::mpi
 
-void
-diy::mpi::communicator::
-destroy()
-{
-#ifndef DIY_NO_MPI
-    if (owner_)
-        MPI_Comm_free(&comm_);
+#ifndef DIY_MPI_AS_LIB
+#include "communicator.cpp"
 #endif
-}
 
-diy::mpi::status
-diy::mpi::communicator::
-probe(int source, int tag) const
-{
-  (void) source;
-  (void) tag;
-
-#ifndef DIY_NO_MPI
-  status s;
-  MPI_Probe(source, tag, comm_, &s.s);
-  return s;
-#else
-  DIY_UNSUPPORTED_MPI_CALL(MPI_Probe);
-#endif
-}
-
-diy::mpi::optional<diy::mpi::status>
-diy::mpi::communicator::
-iprobe(int source, int tag) const
-{
-  (void) source;
-  (void) tag;
-#ifndef DIY_NO_MPI
-  status s;
-  int flag;
-  MPI_Iprobe(source, tag, comm_, &flag, &s.s);
-  if (flag)
-    return s;
-#endif
-  return optional<status>();
-}
-
-void
-diy::mpi::communicator::
-barrier() const
-{
-#ifndef DIY_NO_MPI
-  MPI_Barrier(comm_);
-#endif
-}
-
-diy::mpi::communicator
-diy::mpi::communicator::
-split(int color, int key) const
-{
-#ifndef DIY_NO_MPI
-    MPI_Comm newcomm;
-    MPI_Comm_split(comm_, color, key, &newcomm);
-    return communicator(newcomm, true);
-#else
-    return communicator();
-#endif
-}
-
-diy::mpi::request
-diy::mpi::communicator::
-ibarrier() const
-{
-#ifndef DIY_NO_MPI
-    request r_;
-    MPI_Ibarrier(comm_, &r_.r);
-    return r_;
-#else
-    // this is not the ideal fix; in principle we should just return a status
-    // that tests true, but this requires redesigning some parts of our no-mpi
-    // handling
-    DIY_UNSUPPORTED_MPI_CALL(MPI_Ibarrier);
-#endif
-}
-
-
-void
-diy::mpi::communicator::
-duplicate(const communicator& other)
-{
-#ifndef DIY_NO_MPI
-    MPI_Comm newcomm;
-    MPI_Comm_dup(other.comm_, &newcomm);
-    (*this) = communicator(newcomm,true);
-#endif
-}
+#endif // DIY_MPI_COMMUNICATOR_HPP
diff --git a/include/vtkdiy2/mpi/config.hpp b/include/vtkdiy2/mpi/config.hpp
new file mode 100644
index 00000000000..d9edb1ce326
--- /dev/null
+++ b/include/vtkdiy2/mpi/config.hpp
@@ -0,0 +1,85 @@
+#ifndef DIY_MPI_CONFIG_HPP
+#define DIY_MPI_CONFIG_HPP
+
+#include <utility>
+
+/// We want to allow the use of `diy::mpi` in either header-only or library mode.
+/// DIY_MPI_AS_LIB is defined when using library mode.
+/// This file contains some configuration macros. To maintain backwards compatibility
+/// suitable default values should be defined when using header-only mode.
+
+/// DIY_HAS_MPI should always be defined when DIY_MPI_AS_LIB is defined, but only for
+/// the compilation units that are part of the library.
+/// DIY_HAS_MPI=1 means MPI library is availalbe.
+/// For header-only, the default is to assume MPI is available
+#if !defined(DIY_MPI_AS_LIB) && !defined(DIY_HAS_MPI)
+#  define DIY_HAS_MPI 1
+#endif
+
+/// Include appropriate mpi header. Since DIY_HAS_MPI is only defined for
+/// the compilation units of the library, when in library mode, the header is
+/// only included for the library's compilation units.
+#ifdef DIY_HAS_MPI
+#  if DIY_HAS_MPI
+#    include <vtk_mpi.h>
+#  else
+#    include "no-mpi.hpp"
+#  endif
+#endif
+
+/// Classes and objects that need to be visible to clients of the library should be
+/// marked as DIY_MPI_EXPORT. Similarly API functions should be marked as
+/// DIY_MPI_EXPORT_FUNCTION.
+#include "diy-mpi-export.h" // defines DIY_MPI_EXPORT and DIY_MPI_EXPORT_FUNCTION
+
+/// Define alisases for MPI types
+#ifdef DIY_MPI_AS_LIB
+#  include "mpitypes.hpp" // only configured in library mode
+#else // ifdef DIY_MPI_AS_LIB
+
+namespace diy
+{
+namespace mpi
+{
+
+#define DEFINE_DIY_MPI_TYPE(mpitype)                                          \
+struct DIY_##mpitype {                                                        \
+  DIY_##mpitype() = default;                                                  \
+  DIY_##mpitype(const mpitype& obj) : data(obj) {}                            \
+  DIY_##mpitype& operator=(const mpitype& obj) { data = obj; return *this; }  \
+  operator mpitype() { return data; }                                         \
+  mpitype data;                                                               \
+};
+
+#define DEFINE_DIY_MPI_TYPE_MOVE(mpitype)                                           \
+struct DIY_##mpitype {                                                              \
+  DIY_##mpitype() = default;                                                        \
+  DIY_##mpitype(const mpitype&) = delete;                                           \
+  DIY_##mpitype(mpitype&& obj) : data(std::move(obj)) {}                            \
+  DIY_##mpitype& operator=(const mpitype&) = delete;                                \
+  DIY_##mpitype& operator=(mpitype&& obj) { data = std::move(obj); return *this; }  \
+  operator const mpitype&() const { return data; }                                  \
+  void reset() { data = mpitype(); }                                                \
+private:                                                                            \
+  mpitype data;                                                                     \
+};
+
+DEFINE_DIY_MPI_TYPE(MPI_Comm)
+DEFINE_DIY_MPI_TYPE(MPI_Datatype)
+DEFINE_DIY_MPI_TYPE(MPI_Status)
+DEFINE_DIY_MPI_TYPE(MPI_Request)
+DEFINE_DIY_MPI_TYPE(MPI_Op)
+DEFINE_DIY_MPI_TYPE(MPI_File)
+DEFINE_DIY_MPI_TYPE_MOVE(MPI_Win)
+
+#undef DEFINE_DIY_MPI_TYPE
+
+}
+} // diy::mpi
+#endif // ifdef DIY_MPI_AS_LIB
+
+#ifdef DIY_HAS_MPI
+#  include "mpi_cast.hpp"
+#endif
+
+#endif // DIY_MPI_CONFIG_HPP
diff --git a/include/vtkdiy2/mpi/constants.hpp b/include/vtkdiy2/mpi/constants.hpp
deleted file mode 100644
index 7668e418f19..00000000000
--- a/include/vtkdiy2/mpi/constants.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef DIY_MPI_CONSTANTS_HPP
-#define DIY_MPI_CONSTANTS_HPP
-
-namespace diy
-{
-namespace mpi
-{
-  const int any_source  = MPI_ANY_SOURCE;
-  const int any_tag     = MPI_ANY_TAG;
-}
-}
-
-#endif
diff --git a/include/vtkdiy2/mpi/datatypes.cpp b/include/vtkdiy2/mpi/datatypes.cpp
new file mode 100644
index 00000000000..434e13b66b8
--- /dev/null
+++ b/include/vtkdiy2/mpi/datatypes.cpp
@@ -0,0 +1,34 @@
+#ifdef DIY_MPI_AS_LIB
+#include "datatypes.hpp"
+#endif
+
+namespace diy
+{
+namespace mpi
+{
+
+namespace detail
+{
+
+#define DIY_MPI_DATATYPE_MAP(cpp_type, mpi_type)                                                  \
+  template<>  datatype get_mpi_datatype<cpp_type>() {                                             \
+    return datatype(make_DIY_MPI_Datatype(mpi_type));                                             \
+  }
+
+  DIY_MPI_DATATYPE_MAP(char,                MPI_BYTE)
+  DIY_MPI_DATATYPE_MAP(unsigned char,       MPI_BYTE)
+  DIY_MPI_DATATYPE_MAP(bool,                MPI_BYTE)
+  DIY_MPI_DATATYPE_MAP(int,                 MPI_INT)
+  DIY_MPI_DATATYPE_MAP(unsigned,            MPI_UNSIGNED)
+  DIY_MPI_DATATYPE_MAP(long,                MPI_LONG)
+  DIY_MPI_DATATYPE_MAP(unsigned long,       MPI_UNSIGNED_LONG)
+  DIY_MPI_DATATYPE_MAP(long long,           MPI_LONG_LONG_INT)
+  DIY_MPI_DATATYPE_MAP(unsigned long long,  MPI_UNSIGNED_LONG_LONG)
+  DIY_MPI_DATATYPE_MAP(float,               MPI_FLOAT)
+  DIY_MPI_DATATYPE_MAP(double,              MPI_DOUBLE)
+
+#undef DIY_MPI_DATATYPE_MAP
+
+}
+}
+} // diy::mpi::detail
diff --git a/include/vtkdiy2/mpi/datatypes.hpp b/include/vtkdiy2/mpi/datatypes.hpp
index ed89c6b62b5..a8e2f807074 100644
--- a/include/vtkdiy2/mpi/datatypes.hpp
+++ b/include/vtkdiy2/mpi/datatypes.hpp
@@ -1,17 +1,32 @@
 #ifndef DIY_MPI_DATATYPES_HPP
 #define DIY_MPI_DATATYPES_HPP
 
+#include "config.hpp"
+
 #include <vector>
 #include <array>
+#include <cstddef>
 
 namespace diy
 {
 namespace mpi
 {
-namespace detail
+
+struct datatype
 {
-  template<class T> MPI_Datatype  get_mpi_datatype();
+  datatype() = default;
+  datatype(const DIY_MPI_Datatype& dt) : handle(dt) {}
+
+#ifndef DIY_MPI_AS_LIB // only available in header-only mode
+  datatype(const MPI_Datatype& dt) : handle(dt) {}
+  operator MPI_Datatype() { return handle; }
+#endif
+
+  DIY_MPI_Datatype handle;
+};
 
+namespace detail
+{
   struct true_type  {};
   struct false_type {};
 
@@ -19,30 +34,34 @@ namespace detail
   template<class T>
   struct is_mpi_datatype        { typedef false_type    type; };
 
-#define DIY_MPI_DATATYPE_MAP(cpp_type, mpi_type) \
-  template<>  inline MPI_Datatype  get_mpi_datatype<cpp_type>() { return mpi_type; }  \
-  template<>  struct is_mpi_datatype<cpp_type>                  { typedef true_type type; };    \
-  template<>  struct is_mpi_datatype< std::vector<cpp_type> >   { typedef true_type type; };    \
-  template<size_t D>  \
-              struct is_mpi_datatype< std::array<cpp_type,D> >  { typedef true_type type; };
-
-  DIY_MPI_DATATYPE_MAP(char,                  MPI_BYTE);
-  DIY_MPI_DATATYPE_MAP(unsigned char,         MPI_BYTE);
-  DIY_MPI_DATATYPE_MAP(bool,                  MPI_BYTE);
-  DIY_MPI_DATATYPE_MAP(int,                   MPI_INT);
-  DIY_MPI_DATATYPE_MAP(unsigned,              MPI_UNSIGNED);
-  DIY_MPI_DATATYPE_MAP(long,                  MPI_LONG);
-  DIY_MPI_DATATYPE_MAP(unsigned long,         MPI_UNSIGNED_LONG);
-  DIY_MPI_DATATYPE_MAP(long long,             MPI_LONG_LONG_INT);
-  DIY_MPI_DATATYPE_MAP(unsigned long long,    MPI_UNSIGNED_LONG_LONG);
-  DIY_MPI_DATATYPE_MAP(float,                 MPI_FLOAT);
-  DIY_MPI_DATATYPE_MAP(double,                MPI_DOUBLE);
+  template<class T> datatype  get_mpi_datatype();
+
+  #define DIY_MPI_DATATYPE_DEFAULT(cpp_type)                                                      \
+  template<> DIY_MPI_EXPORT_FUNCTION datatype get_mpi_datatype<cpp_type>();                       \
+  template<>  struct is_mpi_datatype< cpp_type >                { typedef true_type type; };      \
+  template<>  struct is_mpi_datatype< std::vector<cpp_type> >   { typedef true_type type; };      \
+  template<size_t N>                                                                              \
+              struct is_mpi_datatype< std::array<cpp_type, N> > { typedef true_type type; };
+
+  DIY_MPI_DATATYPE_DEFAULT(char)
+  DIY_MPI_DATATYPE_DEFAULT(unsigned char)
+  DIY_MPI_DATATYPE_DEFAULT(bool)
+  DIY_MPI_DATATYPE_DEFAULT(int)
+  DIY_MPI_DATATYPE_DEFAULT(unsigned)
+  DIY_MPI_DATATYPE_DEFAULT(long)
+  DIY_MPI_DATATYPE_DEFAULT(unsigned long)
+  DIY_MPI_DATATYPE_DEFAULT(long long)
+  DIY_MPI_DATATYPE_DEFAULT(unsigned long long)
+  DIY_MPI_DATATYPE_DEFAULT(float)
+  DIY_MPI_DATATYPE_DEFAULT(double)
+
+  #undef DIY_MPI_DATATYPE_DEFAULT
 
   /* mpi_datatype: helper routines, specialized for std::vector<...>, std::array<...> */
   template<class T>
   struct mpi_datatype
   {
-    static MPI_Datatype         datatype()              { return get_mpi_datatype<T>(); }
+    static diy::mpi::datatype   datatype()              { return get_mpi_datatype<T>(); }
     static const void*          address(const T& x)     { return &x; }
     static void*                address(T& x)           { return &x; }
     static int                  count(const T&)         { return 1; }
@@ -53,7 +72,7 @@ namespace detail
   {
     typedef     std::vector<U>      VecU;
 
-    static MPI_Datatype         datatype()              { return mpi_datatype<U>::datatype(); }
+    static diy::mpi::datatype   datatype()              { return mpi_datatype<U>::datatype(); }
     static const void*          address(const VecU& x)  { return x.data(); }
     static void*                address(VecU& x)        { return x.data(); }
     static int                  count(const VecU& x)    { return x.empty() ? 0 : (static_cast<int>(x.size()) * mpi_datatype<U>::count(x[0])); }
@@ -64,7 +83,7 @@ namespace detail
   {
     typedef     std::array<U,D> ArrayU;
 
-    static MPI_Datatype         datatype()                  { return mpi_datatype<U>::datatype(); }
+    static diy::mpi::datatype   datatype()                  { return mpi_datatype<U>::datatype(); }
     static const void*          address(const ArrayU& x)    { return x.data(); }
     static void*                address(ArrayU& x)          { return x.data(); }
     static int                  count(const ArrayU& x)      { return x.empty() ? 0 : (static_cast<int>(x.size()) * mpi_datatype<U>::count(x[0])); }
@@ -72,36 +91,34 @@ namespace detail
 } // detail
 
 template<class U>
-static MPI_Datatype datatype(const U&)
+static datatype datatype_of(const U&)
 {
-    using Datatype = detail::mpi_datatype<U>;
-    return Datatype::datatype();
+    return detail::mpi_datatype<U>::datatype();
 }
 
 template<class U>
 static void* address(const U& x)
 {
-    using Datatype = detail::mpi_datatype<U>;
-    return const_cast<void*>(Datatype::address(x));
+    return const_cast<void*>(detail::mpi_datatype<U>::address(x));
 }
 
 template<class U>
 static void* address(U& x)
 {
-    using Datatype = detail::mpi_datatype<U>;
-    return Datatype::address(x);
+    return detail::mpi_datatype<U>::address(x);
 }
 
 template<class U>
 static int count(const U& x)
 {
-    using Datatype = detail::mpi_datatype<U>;
-    return Datatype::count(x);
+    return detail::mpi_datatype<U>::count(x);
 }
 
-
-
 } // mpi
 } // diy
 
+#ifndef DIY_MPI_AS_LIB
+#include "datatypes.cpp"
 #endif
+
+#endif // DIY_MPI_DATATYPES_HPP
diff --git a/include/vtkdiy2/mpi/diy-mpi-export.h b/include/vtkdiy2/mpi/diy-mpi-export.h
new file mode 100644
index 00000000000..fb0e5611184
--- /dev/null
+++ b/include/vtkdiy2/mpi/diy-mpi-export.h
@@ -0,0 +1,49 @@
+#ifndef DIY_MPI_EXPORT_H
+#define DIY_MPI_EXPORT_H
+
+#if defined(_MSC_VER)
+#  ifdef DIY_MPI_STATIC_BUILD
+     /* This is a static component and has no need for exports
+        elf based static libraries are able to have hidden/default visibility
+        controls on symbols so we should propagate this information in that
+        use case
+     */
+#    define DIY_MPI_EXPORT_DEFINE
+#    define DIY_MPI_IMPORT_DEFINE
+#    define DIY_MPI_NO_EXPORT_DEFINE
+#  else
+#    define DIY_MPI_EXPORT_DEFINE __declspec(dllexport)
+#    define DIY_MPI_IMPORT_DEFINE __declspec(dllimport)
+#    define DIY_MPI_NO_EXPORT_DEFINE
+#  endif
+#else
+#  define DIY_MPI_EXPORT_DEFINE __attribute__((visibility("default")))
+#  define DIY_MPI_IMPORT_DEFINE __attribute__((visibility("default")))
+#  define DIY_MPI_NO_EXPORT_DEFINE __attribute__((visibility("hidden")))
+#endif
+
+#ifndef DIY_MPI_EXPORT
+#  if !defined(DIY_MPI_AS_LIB)
+#    define DIY_MPI_EXPORT
+#    define DIY_MPI_EXPORT_FUNCTION inline
+#  else
+#    if defined(DIY_HAS_MPI)
+       /* We are building this library */
+#      define DIY_MPI_EXPORT DIY_MPI_EXPORT_DEFINE
+#    else
+       /* We are using this library */
+#      define DIY_MPI_EXPORT DIY_MPI_IMPORT_DEFINE
+#    endif
+#    define DIY_MPI_EXPORT_FUNCTION DIY_MPI_EXPORT
+#  endif
+#endif
+
+#ifndef DIY_MPI_EXPORT_FUNCTION
+#error "DIY_MPI_EXPORT_FUNCTION not defined"
+#endif
+
+#ifndef DIY_MPI_NO_EXPORT
+#  define DIY_MPI_NO_EXPORT DIY_MPI_NO_EXPORT_DEFINE
+#endif
+
+#endif // DIY_MPI_EXPORT_H
diff --git a/include/vtkdiy2/mpi/environment.cpp b/include/vtkdiy2/mpi/environment.cpp
new file mode 100644
index 00000000000..e6ce48bdd51
--- /dev/null
+++ b/include/vtkdiy2/mpi/environment.cpp
@@ -0,0 +1,62 @@
+#ifdef DIY_MPI_AS_LIB
+#include "environment.hpp"
+#endif
+
+bool diy::mpi::environment::initialized()
+{
+#if DIY_HAS_MPI
+  int flag;
+  MPI_Initialized(&flag);
+  return flag != 0;
+#else
+  return true;
+#endif
+}
+
+diy::mpi::environment::environment()
+{
+#if DIY_HAS_MPI
+  int argc = 0; char** argv = nullptr;
+  MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided_threading);
+#else
+  provided_threading = MPI_THREAD_FUNNELED;
+#endif
+}
+
+diy::mpi::environment::environment(int requested_threading)
+{
+#if DIY_HAS_MPI
+  int argc = 0; char** argv = nullptr;
+  MPI_Init_thread(&argc, &argv, requested_threading, &provided_threading);
+#else
+  provided_threading = requested_threading;
+#endif
+}
+
+diy::mpi::environment::environment(int argc, char* argv[])
+{
+#if DIY_HAS_MPI
+  MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided_threading);
+#else
+  (void) argc; (void) argv;
+  provided_threading = MPI_THREAD_FUNNELED;
+#endif
+}
+
+diy::mpi::environment::environment(int argc, char* argv[], int requested_threading)
+{
+#if DIY_HAS_MPI
+  MPI_Init_thread(&argc, &argv, requested_threading, &provided_threading);
+#else
+  (void) argc; (void) argv;
+  provided_threading = requested_threading;
+#endif
+}
+
+diy::mpi::environment::
+~environment()
+{
+#if DIY_HAS_MPI
+  MPI_Finalize();
+#endif
+}
diff --git a/include/vtkdiy2/mpi/environment.hpp b/include/vtkdiy2/mpi/environment.hpp
new file mode 100644
index 00000000000..3ae52301a0b
--- /dev/null
+++ b/include/vtkdiy2/mpi/environment.hpp
@@ -0,0 +1,35 @@
+#ifndef DIY_MPI_ENVIRONMENT_HPP
+#define DIY_MPI_ENVIRONMENT_HPP
+
+#include "config.hpp"
+
+namespace diy
+{
+namespace mpi
+{
+
+//! \ingroup MPI
+struct environment
+{
+  DIY_MPI_EXPORT_FUNCTION static bool initialized();
+
+  DIY_MPI_EXPORT_FUNCTION environment();
+  DIY_MPI_EXPORT_FUNCTION environment(int requested_threading);
+  DIY_MPI_EXPORT_FUNCTION environment(int argc, char* argv[]);
+  DIY_MPI_EXPORT_FUNCTION environment(int argc, char* argv[], int requested_threading);
+
+  DIY_MPI_EXPORT_FUNCTION  ~environment();
+
+  int   threading() const           { return provided_threading; }
+
+  int   provided_threading;
+};
+
+}
+} // diy::mpi
+
+#ifndef DIY_MPI_AS_LIB
+#include "environment.cpp"
+#endif
+
+#endif // DIY_MPI_ENVIRONMENT_HPP
diff --git a/include/vtkdiy2/mpi/io.cpp b/include/vtkdiy2/mpi/io.cpp
new file mode 100644
index 00000000000..37eb216f1e6
--- /dev/null
+++ b/include/vtkdiy2/mpi/io.cpp
@@ -0,0 +1,222 @@
+#ifdef DIY_MPI_AS_LIB
+#include "io.hpp"
+#endif
+
+#include "status.hpp"
+
+#ifdef DIY_MPI_AS_LIB
+const int diy::mpi::io::file::rdonly          = MPI_MODE_RDONLY;
+const int diy::mpi::io::file::rdwr            = MPI_MODE_RDWR;
+const int diy::mpi::io::file::wronly          = MPI_MODE_WRONLY;
+const int diy::mpi::io::file::create          = MPI_MODE_CREATE;
+const int diy::mpi::io::file::exclusive       = MPI_MODE_EXCL;
+const int diy::mpi::io::file::delete_on_close = MPI_MODE_DELETE_ON_CLOSE;
+const int diy::mpi::io::file::unique_open     = MPI_MODE_UNIQUE_OPEN;
+const int diy::mpi::io::file::sequential      = MPI_MODE_SEQUENTIAL;
+const int diy::mpi::io::file::append          = MPI_MODE_APPEND;
+#endif
+
+diy::mpi::io::file::
+file(const communicator& comm__, const std::string& filename, int mode)
+: comm_(comm__)
+{
+#if DIY_HAS_MPI
+  int ret = MPI_File_open(diy::mpi::mpi_cast(comm__.handle()), const_cast<char*>(filename.c_str()), mode, MPI_INFO_NULL, &diy::mpi::mpi_cast(fh));
+  if (ret)
+      throw std::runtime_error("DIY cannot open file: " + filename);
+#else
+  (void)comm__; (void)filename; (void)mode;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_File_open);
+#endif
+}
+
+void
+diy::mpi::io::file::
+close()
+{
+#if DIY_HAS_MPI
+  if (diy::mpi::mpi_cast(fh) != MPI_FILE_NULL)
+    MPI_File_close(&diy::mpi::mpi_cast(fh));
+#endif
+}
+
+diy::mpi::io::offset
+diy::mpi::io::file::
+size() const
+{
+#if DIY_HAS_MPI
+  MPI_Offset sz;
+  MPI_File_get_size(diy::mpi::mpi_cast(fh), &sz);
+  return static_cast<offset>(sz);
+#else
+  DIY_UNSUPPORTED_MPI_CALL(MPI_File_get_size);
+#endif
+}
+
+void
+diy::mpi::io::file::
+resize(diy::mpi::io::offset size_)
+{
+#if DIY_HAS_MPI
+  MPI_File_set_size(diy::mpi::mpi_cast(fh), static_cast<MPI_Offset>(size_));
+#else
+  (void)size_;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_File_set_size);
+#endif
+}
+
+void
+diy::mpi::io::file::
+read_at(offset o, char* buffer, size_t size_)
+{
+#if DIY_HAS_MPI
+  status s;
+  MPI_File_read_at(diy::mpi::mpi_cast(fh), static_cast<MPI_Offset>(o), buffer, static_cast<int>(size_), MPI_BYTE, &diy::mpi::mpi_cast(s.handle));
+#else
+  (void)o; (void)buffer; (void)size_;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_File_read_at);
+#endif
+}
+
+void
+diy::mpi::io::file::
+read_at_all(offset o, char* buffer, size_t size_)
+{
+#if DIY_HAS_MPI
+  status s;
+  MPI_File_read_at_all(diy::mpi::mpi_cast(fh), static_cast<MPI_Offset>(o), buffer, static_cast<int>(size_), MPI_BYTE, &diy::mpi::mpi_cast(s.handle));
+#else
+  (void)o; (void)buffer; (void)size_;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_File_read_at_all);
+#endif
+}
+
+void
+diy::mpi::io::file::
+write_at(offset o, const char* buffer, size_t size_)
+{
+#if DIY_HAS_MPI
+  status s;
+  MPI_File_write_at(diy::mpi::mpi_cast(fh), static_cast<MPI_Offset>(o), (void *)buffer, static_cast<int>(size_), MPI_BYTE, &diy::mpi::mpi_cast(s.handle));
+#else
+  (void)o; (void)buffer; (void)size_;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_File_write_at);
+#endif
+}
+
+void
+diy::mpi::io::file::
+write_at_all(offset o, const char* buffer, size_t size_)
+{
+#if DIY_HAS_MPI
+  status s;
+  MPI_File_write_at_all(diy::mpi::mpi_cast(fh), static_cast<MPI_Offset>(o), (void *)buffer, static_cast<int>(size_), MPI_BYTE, &diy::mpi::mpi_cast(s.handle));
+#else
+  (void)o; (void)buffer; (void)size_;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_File_write_at_all);
+#endif
+}
+
+void
+diy::mpi::io::file::
+read_bov(const DiscreteBounds& bounds, int ndims, const int dims[], char* buffer, size_t offset, const datatype& dt, bool collective, int chunk)
+{
+#if DIY_HAS_MPI
+  int total = 1;
+  std::vector<int> subsizes;
+  for (unsigned i = 0; i < static_cast<unsigned>(ndims); ++i)
+  {
+    subsizes.push_back(bounds.max[i] - bounds.min[i] + 1);
+    total *= subsizes.back();
+  }
+
+  MPI_Datatype T_type;
+  if (chunk == 1)
+  {
+    T_type = diy::mpi::mpi_cast(dt.handle);
+  }
+  else
+  {
+    // create an MPI struct of size chunk to read the data in those chunks
+    // (this allows to work around MPI-IO weirdness where crucial quantities
+    // are ints, which are too narrow of a type)
+    int             array_of_blocklengths[]  = { chunk };
+    MPI_Aint        array_of_displacements[] = { 0 };
+    MPI_Datatype    array_of_types[]         = { diy::mpi::mpi_cast(dt.handle) };
+    MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
+    MPI_Type_commit(&T_type);
+  }
+
+  MPI_Datatype fileblk;
+  MPI_Type_create_subarray(ndims, dims, subsizes.data(), (int*) &bounds.min[0], MPI_ORDER_C, T_type, &fileblk);
+  MPI_Type_commit(&fileblk);
+
+  MPI_File_set_view(diy::mpi::mpi_cast(fh), static_cast<MPI_Offset>(offset), T_type, fileblk, (char*)"native", MPI_INFO_NULL);
+
+  mpi::status s;
+  if (!collective)
+    MPI_File_read(diy::mpi::mpi_cast(fh), buffer, total, T_type, &mpi_cast(s.handle));
+  else
+    MPI_File_read_all(diy::mpi::mpi_cast(fh), buffer, total, T_type, &mpi_cast(s.handle));
+
+  if (chunk != 1)
+    MPI_Type_free(&T_type);
+  MPI_Type_free(&fileblk);
+#else
+  (void) bounds; (void) ndims; (void) dims, (void) buffer; (void) offset, (void) dt, (void) collective; (void) chunk;
+  DIY_UNSUPPORTED_MPI_CALL(diy::mpi::io::file::read_bov);
+#endif
+}
+
+void
+diy::mpi::io::file::
+write_bov(const DiscreteBounds& bounds, const DiscreteBounds& core, int ndims, const int dims[], const char* buffer, size_t offset, const datatype& dt, bool collective, int chunk)
+{
+#if DIY_HAS_MPI
+  std::vector<int> subsizes;
+  std::vector<int> buffer_shape, buffer_start;
+  for (unsigned i = 0; i < static_cast<unsigned>(ndims); ++i)
+  {
+    buffer_shape.push_back(bounds.max[i] - bounds.min[i] + 1);
+    buffer_start.push_back(core.min[i] - bounds.min[i]);
+    subsizes.push_back(core.max[i] - core.min[i] + 1);
+  }
+
+  MPI_Datatype T_type;
+  if (chunk == 1)
+  {
+    T_type = diy::mpi::mpi_cast(dt.handle);
+  }
+  else
+  {
+    // assume T is a binary block and create an MPI struct of appropriate size
+    int             array_of_blocklengths[]  = { chunk };
+    MPI_Aint        array_of_displacements[] = { 0 };
+    MPI_Datatype    array_of_types[]         = { diy::mpi::mpi_cast(dt.handle) };
+    MPI_Type_create_struct(1, array_of_blocklengths, array_of_displacements, array_of_types, &T_type);
+    MPI_Type_commit(&T_type);
+  }
+
+  MPI_Datatype fileblk, subbuffer;
+  MPI_Type_create_subarray(ndims, dims, subsizes.data(), (int*) &core.min[0], MPI_ORDER_C, T_type, &fileblk);
+  MPI_Type_create_subarray(ndims, buffer_shape.data(), subsizes.data(), buffer_start.data(), MPI_ORDER_C, T_type, &subbuffer);
+  MPI_Type_commit(&fileblk);
+  MPI_Type_commit(&subbuffer);
+
+  MPI_File_set_view(diy::mpi::mpi_cast(fh), static_cast<MPI_Offset>(offset), T_type, fileblk, (char*)"native", MPI_INFO_NULL);
+
+  mpi::status s;
+  if (!collective)
+    MPI_File_write(diy::mpi::mpi_cast(fh), (void*)buffer, 1, subbuffer, &mpi_cast(s.handle));
+  else
+    MPI_File_write_all(diy::mpi::mpi_cast(fh), (void*)buffer, 1, subbuffer, &mpi_cast(s.handle));
+
+  if (chunk != 1)
+    MPI_Type_free(&T_type);
+  MPI_Type_free(&fileblk);
+  MPI_Type_free(&subbuffer);
+#else
+  (void) bounds; (void) core, (void) ndims; (void) dims, (void) buffer; (void) offset, (void) dt, (void) collective; (void) chunk;
+  DIY_UNSUPPORTED_MPI_CALL(diy::mpi::io::file::write_bov);
+#endif
+}
diff --git a/include/vtkdiy2/mpi/io.hpp b/include/vtkdiy2/mpi/io.hpp
index 1ed9792219a..42885906ba6 100644
--- a/include/vtkdiy2/mpi/io.hpp
+++ b/include/vtkdiy2/mpi/io.hpp
@@ -1,139 +1,82 @@
 #ifndef DIY_MPI_IO_HPP
 #define DIY_MPI_IO_HPP
 
-#include "../constants.h"
+#include "config.hpp"
+#include "communicator.hpp"
+
+#include "../types.hpp"
 
 #include <vector>
 #include <string>
+#include <stdexcept>
 
 namespace diy
 {
 namespace mpi
 {
+
 namespace io
 {
-  typedef               MPI_Offset              offset;
+#if !defined(DIY_MPI_AS_LIB) && DIY_HAS_MPI
+  using offset = MPI_Offset;
+#else
+  using offset = long long;
+#endif
 
   //! Wraps MPI file IO. \ingroup MPI
   class file
   {
     public:
-      enum
-      {
-        rdonly          = MPI_MODE_RDONLY,
-        rdwr            = MPI_MODE_RDWR,
-        wronly          = MPI_MODE_WRONLY,
-        create          = MPI_MODE_CREATE,
-        exclusive       = MPI_MODE_EXCL,
-        delete_on_close = MPI_MODE_DELETE_ON_CLOSE,
-        unique_open     = MPI_MODE_UNIQUE_OPEN,
-        sequential      = MPI_MODE_SEQUENTIAL,
-        append          = MPI_MODE_APPEND
-      };
+#ifndef DIY_MPI_AS_LIB
+      static constexpr int rdonly          = MPI_MODE_RDONLY;
+      static constexpr int rdwr            = MPI_MODE_RDWR;
+      static constexpr int wronly          = MPI_MODE_WRONLY;
+      static constexpr int create          = MPI_MODE_CREATE;
+      static constexpr int exclusive       = MPI_MODE_EXCL;
+      static constexpr int delete_on_close = MPI_MODE_DELETE_ON_CLOSE;
+      static constexpr int unique_open     = MPI_MODE_UNIQUE_OPEN;
+      static constexpr int sequential      = MPI_MODE_SEQUENTIAL;
+      static constexpr int append          = MPI_MODE_APPEND;
+#else
+      static const int rdonly, rdwr, wronly, create, exclusive, delete_on_close, unique_open, sequential, append;
+#endif
 
     public:
-      inline        file(const communicator& comm, const std::string& filename, int mode);
-                    ~file()                                 { close(); }
-      inline void   close();
+      DIY_MPI_EXPORT_FUNCTION        file(const communicator& comm, const std::string& filename, int mode);
+                                     ~file()                            { close(); }
+      DIY_MPI_EXPORT_FUNCTION void   close();
 
-      inline offset size() const;
-      inline void   resize(offset size);
+      DIY_MPI_EXPORT_FUNCTION offset size() const;
+      DIY_MPI_EXPORT_FUNCTION void   resize(offset size);
 
-      inline void   read_at(offset o, char* buffer, size_t size);
-      inline void   read_at_all(offset o, char* buffer, size_t size);
-      inline void   write_at(offset o, const char* buffer, size_t size);
-      inline void   write_at_all(offset o, const char* buffer, size_t size);
+      DIY_MPI_EXPORT_FUNCTION void   read_at(offset o, char* buffer, size_t size);
+      DIY_MPI_EXPORT_FUNCTION void   read_at_all(offset o, char* buffer, size_t size);
+      DIY_MPI_EXPORT_FUNCTION void   write_at(offset o, const char* buffer, size_t size);
+      DIY_MPI_EXPORT_FUNCTION void   write_at_all(offset o, const char* buffer, size_t size);
 
       template<class T>
-      inline void   read_at(offset o, std::vector<T>& data);
+      inline void           read_at(offset o, std::vector<T>& data);
 
       template<class T>
-      inline void   read_at_all(offset o, std::vector<T>& data);
+      inline void           read_at_all(offset o, std::vector<T>& data);
 
       template<class T>
-      inline void   write_at(offset o, const std::vector<T>& data);
+      inline void           write_at(offset o, const std::vector<T>& data);
 
       template<class T>
-      inline void   write_at_all(offset o, const std::vector<T>& data);
+      inline void           write_at_all(offset o, const std::vector<T>& data);
 
-      const communicator&
-                    comm() const                            { return comm_; }
+      DIY_MPI_EXPORT_FUNCTION void   read_bov(const DiscreteBounds& bounds, int ndims, const int dims[], char* buffer, size_t offset, const datatype& dt, bool collective, int chunk);
+      DIY_MPI_EXPORT_FUNCTION void   write_bov(const DiscreteBounds& bounds, const DiscreteBounds& core, int ndims, const int dims[], const char* buffer, size_t offset, const datatype& dt, bool collective, int chunk);
 
-      MPI_File&     handle()                                { return fh; }
+      const communicator&   comm() const   { return comm_; }
 
     private:
-      const communicator&   comm_;
-      MPI_File              fh;
+      communicator   comm_;
+    protected: // mark protected to avoid the "unused private field" warning
+      DIY_MPI_File   fh;
   };
 }
-}
-}
-
-diy::mpi::io::file::
-file(const communicator& comm__, const std::string& filename, int mode)
-: comm_(comm__)
-{
-#ifndef DIY_NO_MPI
-  int ret = MPI_File_open(comm__, const_cast<char*>(filename.c_str()), mode, MPI_INFO_NULL, &fh);
-  if (ret)
-      throw std::runtime_error("DIY cannot open file: " + filename);
-#else
-  DIY_UNUSED(comm__);
-  DIY_UNUSED(filename);
-  DIY_UNUSED(mode);
-  DIY_UNSUPPORTED_MPI_CALL(MPI_File_open);
-#endif
-}
-
-void
-diy::mpi::io::file::
-close()
-{
-#ifndef DIY_NO_MPI
-  if (fh != MPI_FILE_NULL)
-    MPI_File_close(&fh);
-#endif
-}
-
-diy::mpi::io::offset
-diy::mpi::io::file::
-size() const
-{
-#ifndef DIY_NO_MPI
-  offset sz;
-  MPI_File_get_size(fh, &sz);
-  return sz;
-#else
-  DIY_UNSUPPORTED_MPI_CALL(MPI_File_get_size);
-#endif
-}
-
-void
-diy::mpi::io::file::
-resize(diy::mpi::io::offset size_)
-{
-#ifndef DIY_NO_MPI
-  MPI_File_set_size(fh, size_);
-#else
-  DIY_UNUSED(size_);
-  DIY_UNSUPPORTED_MPI_CALL(MPI_File_set_size);
-#endif
-}
-
-void
-diy::mpi::io::file::
-read_at(offset o, char* buffer, size_t size_)
-{
-#ifndef DIY_NO_MPI
-  status s;
-  MPI_File_read_at(fh, o, buffer, static_cast<int>(size_), detail::get_mpi_datatype<char>(), &s.s);
-#else
-  DIY_UNUSED(o);
-  DIY_UNUSED(buffer);
-  DIY_UNUSED(size_);
-  DIY_UNSUPPORTED_MPI_CALL(MPI_File_read_at);
-#endif
-}
 
 template<class T>
 void
@@ -143,21 +86,6 @@ read_at(offset o, std::vector<T>& data)
   read_at(o, &data[0], data.size()*sizeof(T));
 }
 
-void
-diy::mpi::io::file::
-read_at_all(offset o, char* buffer, size_t size_)
-{
-#ifndef DIY_NO_MPI
-  status s;
-  MPI_File_read_at_all(fh, o, buffer, static_cast<int>(size_), detail::get_mpi_datatype<char>(), &s.s);
-#else
-  DIY_UNUSED(o);
-  DIY_UNUSED(buffer);
-  DIY_UNUSED(size_);
-  DIY_UNSUPPORTED_MPI_CALL(MPI_File_read_at_all);
-#endif
-}
-
 template<class T>
 void
 diy::mpi::io::file::
@@ -166,21 +94,6 @@ read_at_all(offset o, std::vector<T>& data)
   read_at_all(o, (char*) &data[0], data.size()*sizeof(T));
 }
 
-void
-diy::mpi::io::file::
-write_at(offset o, const char* buffer, size_t size_)
-{
-#ifndef DIY_NO_MPI
-  status s;
-  MPI_File_write_at(fh, o, (void *)buffer, static_cast<int>(size_), detail::get_mpi_datatype<char>(), &s.s);
-#else
-  DIY_UNUSED(o);
-  DIY_UNUSED(buffer);
-  DIY_UNUSED(size_);
-  DIY_UNSUPPORTED_MPI_CALL(MPI_File_write_at);
-#endif
-}
-
 template<class T>
 void
 diy::mpi::io::file::
@@ -189,21 +102,6 @@ write_at(offset o, const std::vector<T>& data)
   write_at(o, (const char*) &data[0], data.size()*sizeof(T));
 }
 
-void
-diy::mpi::io::file::
-write_at_all(offset o, const char* buffer, size_t size_)
-{
-#ifndef DIY_NO_MPI
-  status s;
-  MPI_File_write_at_all(fh, o, (void *)buffer, static_cast<int>(size_), detail::get_mpi_datatype<char>(), &s.s);
-#else
-  DIY_UNUSED(o);
-  DIY_UNUSED(buffer);
-  DIY_UNUSED(size_);
-  DIY_UNSUPPORTED_MPI_CALL(MPI_File_write_at_all);
-#endif
-}
-
 template<class T>
 void
 diy::mpi::io::file::
@@ -212,4 +110,11 @@ write_at_all(offset o, const std::vector<T>& data)
   write_at_all(o, &data[0], data.size()*sizeof(T));
 }
 
+}
+} // diy::mpi::io
+
+#ifndef DIY_MPI_AS_LIB
+#include "io.cpp"
 #endif
+
+#endif // DIY_MPI_IO_HPP
diff --git a/include/vtkdiy2/mpi/mpi_cast.hpp b/include/vtkdiy2/mpi/mpi_cast.hpp
new file mode 100644
index 00000000000..a2a0d91d20f
--- /dev/null
+++ b/include/vtkdiy2/mpi/mpi_cast.hpp
@@ -0,0 +1,39 @@
+#ifndef DIY_MPI_MPICAST_HPP
+#define DIY_MPI_MPICAST_HPP
+
+/// This header provides convinience functions to cast from diy's type erased MPI objects
+/// to thier correct types.
+
+#ifndef DIY_HAS_MPI
+#  include <mpi.h>
+#endif
+
+namespace diy
+{
+namespace mpi
+{
+
+#define DEFINE_MPI_CAST(mpitype)                                                                              \
+inline mpitype& mpi_cast(DIY_##mpitype& obj) { return *reinterpret_cast<mpitype*>(&obj); }                    \
+inline const mpitype& mpi_cast(const DIY_##mpitype& obj) { return *reinterpret_cast<const mpitype*>(&obj); }  \
+inline DIY_##mpitype make_DIY_##mpitype(const mpitype& obj) { DIY_##mpitype ret; mpi_cast(ret) = obj; return ret; }
+
+#define DEFINE_MPI_CAST_MOVE(mpitype)                                                                         \
+inline mpitype& mpi_cast(DIY_##mpitype& obj) { return *reinterpret_cast<mpitype*>(&obj); }                    \
+inline const mpitype& mpi_cast(const DIY_##mpitype& obj) { return *reinterpret_cast<const mpitype*>(&obj); }  \
+inline DIY_##mpitype make_DIY_##mpitype(mpitype&& obj) { DIY_##mpitype ret = std::move(obj); return ret; }
+
+DEFINE_MPI_CAST(MPI_Comm)
+DEFINE_MPI_CAST(MPI_Datatype)
+DEFINE_MPI_CAST(MPI_Status)
+DEFINE_MPI_CAST(MPI_Request)
+DEFINE_MPI_CAST(MPI_Op)
+DEFINE_MPI_CAST(MPI_File)
+DEFINE_MPI_CAST_MOVE(MPI_Win)
+
+#undef DEFINE_MPI_CAST
+
+}
+} // diy::mpi
+
+#endif // DIY_MPI_MPICAST_HPP
diff --git a/include/vtkdiy2/mpi/mpitypes.hpp.in b/include/vtkdiy2/mpi/mpitypes.hpp.in
new file mode 100644
index 00000000000..4fee00b9364
--- /dev/null
+++ b/include/vtkdiy2/mpi/mpitypes.hpp.in
@@ -0,0 +1,80 @@
+#ifndef DIY_MPI_MPITYPES_H
+#define DIY_MPI_MPITYPES_H
+
+#include <cstring>
+
+#cmakedefine TYPESIZE_MPI_Comm     @TYPESIZE_MPI_Comm@
+#cmakedefine TYPESIZE_MPI_Datatype @TYPESIZE_MPI_Datatype@
+#cmakedefine TYPESIZE_MPI_Status   @TYPESIZE_MPI_Status@
+#cmakedefine TYPESIZE_MPI_Request  @TYPESIZE_MPI_Request@
+#cmakedefine TYPESIZE_MPI_Op       @TYPESIZE_MPI_Op@
+#cmakedefine TYPESIZE_MPI_File     @TYPESIZE_MPI_File@
+#cmakedefine TYPESIZE_MPI_Win      @TYPESIZE_MPI_Win@
+
+namespace diy
+{
+namespace mpi
+{
+
+#if defined(DIY_HAS_MPI)
+#  define ASSERT_MPI_TYPE_SIZE(mpitype) static_assert(sizeof(mpitype) <= sizeof(DIY_##mpitype), "");
+#else
+# define ASSERT_MPI_TYPE_SIZE(mpitype)
+struct MPI_Win;
+#endif
+
+#define DEFINE_DIY_MPI_TYPE(mpitype)                                         \
+struct DIY_##mpitype {                                                       \
+  void* data[((TYPESIZE_##mpitype) + sizeof(void*) - 1)/sizeof(void*)];      \
+};                                                                           \
+ASSERT_MPI_TYPE_SIZE(mpitype)
+
+#define DEFINE_DIY_MPI_TYPE_MOVE(mpitype)                                               \
+  struct DIY_##mpitype                                                                  \
+  {                                                                                     \
+    DIY_##mpitype() = default;                                                          \
+    DIY_##mpitype(const mpitype&) = delete;                                             \
+    DIY_##mpitype& operator=(const mpitype&) = delete;                                  \
+    DIY_##mpitype(mpitype&& obj)                                                        \
+    {                                                                                   \
+      std::memcpy(data, &obj, TYPESIZE_##mpitype);                                      \
+      std::memset(&obj, 0, TYPESIZE_##mpitype);                                         \
+    }                                                                                   \
+    DIY_##mpitype& operator=(mpitype&& obj)                                             \
+    {                                                                                   \
+      std::memcpy(data, &obj, TYPESIZE_##mpitype);                                      \
+      std::memset(&obj, 0, TYPESIZE_##mpitype);                                         \
+      return *this;                                                                     \
+    }                                                                                   \
+    operator const mpitype&() const { return *reinterpret_cast<const mpitype*>(data); } \
+    void reset() { std::memset(data, 0, TYPESIZE_##mpitype); }                          \
+                                                                                        \
+  private:                                                                              \
+    char* data[TYPESIZE_##mpitype];                                                     \
+  };                                                                                    \
+  ASSERT_MPI_TYPE_SIZE(mpitype);
+
+DEFINE_DIY_MPI_TYPE(MPI_Comm)
+DEFINE_DIY_MPI_TYPE(MPI_Datatype)
+DEFINE_DIY_MPI_TYPE(MPI_Status)
+DEFINE_DIY_MPI_TYPE(MPI_Request)
+DEFINE_DIY_MPI_TYPE(MPI_Op)
+DEFINE_DIY_MPI_TYPE(MPI_File)
+DEFINE_DIY_MPI_TYPE_MOVE(MPI_Win)
+
+#undef DEFINE_DIY_MPI_TYPE
+#undef DEFINE_DIY_MPI_TYPE_MOVE
+#undef ASSERT_MPI_TYPE_SIZE
+
+}
+} // diy::mpi
+
+#undef TYPESIZE_MPI_Comm
+#undef TYPESIZE_MPI_Datatype
+#undef TYPESIZE_MPI_Status
+#undef TYPESIZE_MPI_Request
+#undef TYPESIZE_MPI_Op
+#undef TYPESIZE_MPI_File
+#undef TYPESIZE_MPI_Win
+
+#endif // DIY_MPI_MPITYPES_H
diff --git a/include/vtkdiy2/mpi/no-mpi.hpp b/include/vtkdiy2/mpi/no-mpi.hpp
index 7356b098e80..68ffff74bc7 100644
--- a/include/vtkdiy2/mpi/no-mpi.hpp
+++ b/include/vtkdiy2/mpi/no-mpi.hpp
@@ -1,9 +1,10 @@
 #ifndef DIY_MPI_NO_MPI_HPP
 #define DIY_MPI_NO_MPI_HPP
 
+#include <cassert> // std::assert
+#include <cstdint> // uintptr_t
 #include <stdexcept> // std::runtime_error
 
-
 static const int MPI_SUCCESS = 0;
 static const int MPI_ANY_SOURCE = -1;
 static const int MPI_ANY_TAG = -1;
@@ -33,7 +34,6 @@ DIY_NO_MPI_DATATYPE(long long,             MPI_LONG_LONG_INT);
 DIY_NO_MPI_DATATYPE(unsigned long long,    MPI_UNSIGNED_LONG_LONG);
 DIY_NO_MPI_DATATYPE(float,                 MPI_FLOAT);
 DIY_NO_MPI_DATATYPE(double,                MPI_DOUBLE);
-#endif
 
 /* status type */
 struct MPI_Status
@@ -49,7 +49,7 @@ struct MPI_Status
 using MPI_Request = int;
 
 #define DIY_UNSUPPORTED_MPI_CALL(name) \
-  throw std::runtime_error("`" #name "` not supported when DIY_NO_MPI is defined.");
+  throw std::runtime_error("`" #name "` not supported when DIY_HAS_MPI is false.");
 
 /* define operations */
 using MPI_Op = int;
@@ -61,7 +61,7 @@ static const MPI_Op MPI_LAND = 0;
 static const MPI_Op MPI_LOR = 0;
 
 /* mpi i/o stuff */
-using MPI_Offset = size_t;
+using MPI_Offset = long long;
 using MPI_File = int;
 static const MPI_File MPI_FILE_NULL = 0;
 
@@ -76,7 +76,39 @@ static const int MPI_MODE_APPEND          = 128;
 static const int MPI_MODE_SEQUENTIAL      = 256;
 
 /* define window type */
-using MPI_Win = int;
+struct MPI_Win {
+       MPI_Win(): data_(0) {}
+       MPI_Win(void* data, bool owned = false): data_(uintptr_t(data) | (owned ? 0x1 : 0x0))
+       {
+              // We assume that pointers have at least some higher-byte alignment.
+              assert(!(uintptr_t(data) & 0x1));
+       }
+       void* data() const { return (void*)(data_ & ~0x1); }
+       bool owned() const { return data_ & 0x1; }
+
+       // We cannot copy owned windows.
+       MPI_Win(MPI_Win const&) = delete;
+       MPI_Win& operator=(MPI_Win const&) = delete;
+
+       // We cannot move owned windows (we don't know how to delete them in general).
+       MPI_Win(MPI_Win&& rhs): data_(rhs.data_)
+       {
+              rhs.data_ = 0;
+       }
+       MPI_Win& operator=(MPI_Win&& rhs)
+       {
+              if (this == &rhs)
+                     return *this;
+
+              data_ = rhs.data_;
+              rhs.data_ = 0;
+
+              return *this;
+       }
+private:
+       uintptr_t data_;
+};
+#define MPI_WIN_NULL MPI_Win()
 
 /* window fence assertions */
 static const int MPI_MODE_NOSTORE       = 1;
@@ -88,3 +120,5 @@ static const int MPI_MODE_NOCHECK       = 16;
 /* window lock types */
 static const int MPI_LOCK_SHARED        = 1;
 static const int MPI_LOCK_EXCLUSIVE     = 2;
+
+#endif // DIY_MPI_NO_MPI_HPP
diff --git a/include/vtkdiy2/mpi/operations.cpp b/include/vtkdiy2/mpi/operations.cpp
new file mode 100644
index 00000000000..aa059e3429a
--- /dev/null
+++ b/include/vtkdiy2/mpi/operations.cpp
@@ -0,0 +1,33 @@
+#ifdef DIY_MPI_AS_LIB
+#include "operations.hpp"
+#endif
+
+#include <functional>
+
+namespace diy
+{
+namespace mpi
+{
+
+namespace detail
+{
+
+operation get_builtin_operation(BuiltinOperation id)
+{
+  operation op{};
+  switch(id)
+  {
+    case OP_MAXIMUM:     op.handle = make_DIY_MPI_Op(MPI_MAX);  break;
+    case OP_MINIMUM:     op.handle = make_DIY_MPI_Op(MPI_MIN);  break;
+    case OP_PLUS:        op.handle = make_DIY_MPI_Op(MPI_SUM);  break;
+    case OP_MULTIPLIES:  op.handle = make_DIY_MPI_Op(MPI_PROD); break;
+    case OP_LOGICAL_AND: op.handle = make_DIY_MPI_Op(MPI_LAND); break;
+    case OP_LOGICAL_OR:  op.handle = make_DIY_MPI_Op(MPI_LOR);  break;
+    default: break;
+  }
+  return op;
+}
+
+}
+}
+} // diy::mpi::detail
diff --git a/include/vtkdiy2/mpi/operations.hpp b/include/vtkdiy2/mpi/operations.hpp
index 5a920c0ee08..ef79a504717 100644
--- a/include/vtkdiy2/mpi/operations.hpp
+++ b/include/vtkdiy2/mpi/operations.hpp
@@ -1,5 +1,10 @@
-#include <functional>
+#ifndef DIY_MPI_OPERATIONS_HPP
+#define DIY_MPI_OPERATIONS_HPP
+
+#include "config.hpp"
+
 #include <algorithm> // for std::min/max
+#include <functional>
 
 namespace diy
 {
@@ -7,6 +12,19 @@ namespace mpi
 {
   //! \addtogroup MPI
   //!@{
+  struct operation
+  {
+    operation() = default;
+    operation(const DIY_MPI_Op& op) : handle(op) {}
+
+#ifndef DIY_MPI_AS_LIB // only available in header-only mode
+    operation(const MPI_Op& op) : handle(op) {}
+    operator MPI_Op() { return handle; }
+#endif
+
+    DIY_MPI_Op handle;
+  };
+
   template<class U>
   struct maximum { const U& operator()(const U& x, const U& y) const { return (std::max)(x,y); } };
   template<class U>
@@ -15,13 +33,32 @@ namespace mpi
 
 namespace detail
 {
-  template<class T> struct mpi_op                           { static MPI_Op  get(); };
-  template<class U> struct mpi_op< maximum<U> >             { static MPI_Op  get() { return MPI_MAX; }  };
-  template<class U> struct mpi_op< minimum<U> >             { static MPI_Op  get() { return MPI_MIN; }  };
-  template<class U> struct mpi_op< std::plus<U> >           { static MPI_Op  get() { return MPI_SUM; }  };
-  template<class U> struct mpi_op< std::multiplies<U> >     { static MPI_Op  get() { return MPI_PROD; }  };
-  template<class U> struct mpi_op< std::logical_and<U> >    { static MPI_Op  get() { return MPI_LAND; }  };
-  template<class U> struct mpi_op< std::logical_or<U> >     { static MPI_Op  get() { return MPI_LOR; }  };
-}
+  enum BuiltinOperation {
+    OP_MAXIMUM = 0,
+    OP_MINIMUM,
+    OP_PLUS,
+    OP_MULTIPLIES,
+    OP_LOGICAL_AND,
+    OP_LOGICAL_OR
+  };
+
+  DIY_MPI_EXPORT_FUNCTION operation get_builtin_operation(BuiltinOperation id);
+
+  template<class T> struct mpi_op;
+
+  template<class U> struct mpi_op< maximum<U> >          { static operation get() { return get_builtin_operation(OP_MAXIMUM); } };
+  template<class U> struct mpi_op< minimum<U> >          { static operation get() { return get_builtin_operation(OP_MINIMUM); } };
+  template<class U> struct mpi_op< std::plus<U> >        { static operation get() { return get_builtin_operation(OP_PLUS); } };
+  template<class U> struct mpi_op< std::multiplies<U> >  { static operation get() { return get_builtin_operation(OP_MULTIPLIES); } };
+  template<class U> struct mpi_op< std::logical_and<U> > { static operation get() { return get_builtin_operation(OP_LOGICAL_AND); } };
+  template<class U> struct mpi_op< std::logical_or<U> >  { static operation get() { return get_builtin_operation(OP_LOGICAL_OR); } };
 }
+
 }
+} // diy::mpi
+
+#ifndef DIY_MPI_AS_LIB
+#include "operations.cpp"
+#endif
+
+#endif // DIY_MPI_OPERATIONS_HPP
diff --git a/include/vtkdiy2/mpi/optional.hpp b/include/vtkdiy2/mpi/optional.hpp
index ab58aaf8108..ddbd455621c 100644
--- a/include/vtkdiy2/mpi/optional.hpp
+++ b/include/vtkdiy2/mpi/optional.hpp
@@ -1,3 +1,6 @@
+#ifndef DIY_MPI_OPTIONAL_HPP
+#define DIY_MPI_OPTIONAL_HPP
+
 namespace diy
 {
 namespace mpi
@@ -34,8 +37,8 @@ namespace mpi
       const void*   address() const             { return buf_; }
 
     private:
+      alignas(T) char buf_[sizeof(T)];
       bool init_;
-      char buf_[sizeof(T)];
   };
 }
 }
@@ -53,3 +56,5 @@ operator=(const optional& o)
 
   return *this;
 }
+
+#endif // DIY_MPI_OPTIONAL_HPP
diff --git a/include/vtkdiy2/mpi/point-to-point.cpp b/include/vtkdiy2/mpi/point-to-point.cpp
new file mode 100644
index 00000000000..bba84cebed4
--- /dev/null
+++ b/include/vtkdiy2/mpi/point-to-point.cpp
@@ -0,0 +1,106 @@
+#ifdef DIY_MPI_AS_LIB
+#include "point-to-point.hpp"
+#endif
+
+namespace diy
+{
+namespace mpi
+{
+
+#ifdef DIY_MPI_AS_LIB
+#  ifdef _MSC_VER
+#    define EXPORT_MACRO DIY_MPI_EXPORT
+#  else
+#    define EXPORT_MACRO
+#  endif
+EXPORT_MACRO const int any_source  = MPI_ANY_SOURCE;
+EXPORT_MACRO const int any_tag     = MPI_ANY_TAG;
+#  undef EXPORT_MACRO
+#endif
+
+namespace detail
+{
+
+void send(DIY_MPI_Comm comm, int dest, int tag, const void* data, int count, const datatype& type)
+{
+#if DIY_HAS_MPI
+  MPI_Send(data, count, mpi_cast(type.handle), dest, tag, mpi_cast(comm));
+#else
+  (void) comm; (void) dest; (void) tag; (void) data; (void) count; (void) type;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_Send);
+#endif
+}
+
+void ssend(DIY_MPI_Comm comm, int dest, int tag, const void* data, int count, const datatype& type)
+{
+#if DIY_HAS_MPI
+  MPI_Ssend(data, count, mpi_cast(type.handle), dest, tag, mpi_cast(comm));
+#else
+  (void) comm; (void) dest; (void) tag; (void) data; (void) count; (void) type;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_Ssend);
+#endif
+}
+
+status probe(DIY_MPI_Comm comm, int source, int tag)
+{
+#if DIY_HAS_MPI
+  status s;
+  MPI_Probe(source, tag, mpi_cast(comm), &mpi_cast(s.handle));
+  return s;
+#else
+  (void) comm; (void) source; (void) tag;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_Probe);
+#endif
+}
+
+status recv(DIY_MPI_Comm comm, int source, int tag, void* data, int count, const datatype& type)
+{
+#if DIY_HAS_MPI
+  status s;
+  MPI_Recv(data, count, mpi_cast(type.handle), source, tag, mpi_cast(comm), &mpi_cast(s.handle));
+  return s;
+#else
+  (void) comm; (void) source; (void) tag; (void) data; (void) count; (void) type;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_Recv);
+#endif
+}
+
+request isend(DIY_MPI_Comm comm, int dest, int tag, const void* data, int count, const datatype& type)
+{
+#if DIY_HAS_MPI
+  request r;
+  MPI_Isend(data, count, mpi_cast(type.handle), dest, tag, mpi_cast(comm), &mpi_cast(r.handle));
+  return r;
+#else
+  (void) comm; (void) dest; (void) tag; (void) data; (void) count; (void) type;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_Isend);
+#endif
+}
+
+request issend(DIY_MPI_Comm comm, int dest, int tag, const void* data, int count, const datatype& type)
+{
+#if DIY_HAS_MPI
+  request r;
+  MPI_Issend(data, count, mpi_cast(type.handle), dest, tag, mpi_cast(comm), &mpi_cast(r.handle));
+  return r;
+#else
+  (void) comm; (void) dest; (void) tag; (void) data; (void) count; (void) type;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_Issend);
+#endif
+}
+
+request irecv(DIY_MPI_Comm comm, int source, int tag, void* data, int count, const datatype& type)
+{
+#if DIY_HAS_MPI
+  request r;
+  MPI_Irecv(data, count, mpi_cast(type.handle), source, tag, mpi_cast(comm), &mpi_cast(r.handle));
+  return r;
+#else
+  (void) comm; (void) source; (void) tag; (void) data; (void) count; (void) type;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_Irecv);
+#endif
+}
+
+}
+}
+} // diy::mpi::detail
diff --git a/include/vtkdiy2/mpi/point-to-point.hpp b/include/vtkdiy2/mpi/point-to-point.hpp
index e8651362319..150d3c8f0fe 100644
--- a/include/vtkdiy2/mpi/point-to-point.hpp
+++ b/include/vtkdiy2/mpi/point-to-point.hpp
@@ -1,147 +1,92 @@
+#ifndef DIY_MPI_POINT_TO_POINT_HPP
+#define DIY_MPI_POINT_TO_POINT_HPP
+
+#include "config.hpp"
+#include "datatypes.hpp"
+#include "request.hpp"
+#include "status.hpp"
+
 #include <vector>
 
 namespace diy
 {
 namespace mpi
 {
-namespace detail
-{
-  // send
-  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
-  struct send;
 
-  template<class T>
-  struct send<T, true_type>
-  {
-    void operator()(MPI_Comm comm, int dest, int tag, const T& x) const
-    {
-#ifndef DIY_NO_MPI
-      typedef       mpi_datatype<T>     Datatype;
-      MPI_Send((void*) Datatype::address(x),
-               Datatype::count(x),
-               Datatype::datatype(),
-               dest, tag, comm);
+#ifndef DIY_MPI_AS_LIB
+constexpr int any_source  = MPI_ANY_SOURCE;
+constexpr int any_tag     = MPI_ANY_TAG;
 #else
-      (void) comm; (void) dest; (void) tag; (void) x;
-      DIY_UNSUPPORTED_MPI_CALL(MPI_Send);
+DIY_MPI_EXPORT extern const int any_source;
+DIY_MPI_EXPORT extern const int any_tag;
 #endif
-    }
-  };
 
-  // recv
-  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
-  struct recv;
+namespace detail
+{
+  DIY_MPI_EXPORT_FUNCTION void send(DIY_MPI_Comm comm, int dest, int tag, const void* data, int count, const datatype& type);
+  DIY_MPI_EXPORT_FUNCTION void ssend(DIY_MPI_Comm comm, int dest, int tag, const void* data, int count, const datatype& type);
+  DIY_MPI_EXPORT_FUNCTION request isend(DIY_MPI_Comm comm, int dest, int tag, const void* data, int count, const datatype& type);
+  DIY_MPI_EXPORT_FUNCTION request issend(DIY_MPI_Comm comm, int dest, int tag, const void* data, int count, const datatype& type);
+  DIY_MPI_EXPORT_FUNCTION status probe(DIY_MPI_Comm comm, int source, int tag);
+  DIY_MPI_EXPORT_FUNCTION status recv(DIY_MPI_Comm comm, int source, int tag, void* data, int count, const datatype& type);
+  DIY_MPI_EXPORT_FUNCTION request irecv(DIY_MPI_Comm comm, int source, int tag, void* data, int count, const datatype& type);
 
-  template<class T>
-  struct recv<T, true_type>
+  template <class T>
+  inline void send(DIY_MPI_Comm comm, int dest, int tag, const T& x)
   {
-    status operator()(MPI_Comm comm, int source, int tag, T& x) const
-    {
-#ifndef DIY_NO_MPI
-      typedef       mpi_datatype<T>     Datatype;
-      status s;
-      MPI_Recv((void*) Datatype::address(x),
-                Datatype::count(x),
-                Datatype::datatype(),
-                source, tag, comm, &s.s);
-      return s;
-#else
-      (void) comm; (void) source; (void) tag; (void) x;
-      DIY_UNSUPPORTED_MPI_CALL(MPI_Recv);
-#endif
-    }
-  };
+    static_assert(std::is_same<typename is_mpi_datatype<T>::type, true_type>::value, "is_mpi_datatype<T>::type must be true_type");
+    send(comm, dest, tag, address(x), count(x), datatype_of(x));
+  }
 
-  template<class U>
-  struct recv<std::vector<U>, true_type>
+  template <class T>
+  inline void ssend(DIY_MPI_Comm comm, int dest, int tag, const T& x)
   {
-    status operator()(MPI_Comm comm, int source, int tag, std::vector<U>& x) const
-    {
-#ifndef DIY_NO_MPI
-      status s;
-
-      MPI_Probe(source, tag, comm, &s.s);
-      x.resize(s.count<U>());
-      MPI_Recv(&x[0], static_cast<int>(x.size()), get_mpi_datatype<U>(), source, tag, comm, &s.s);
-      return s;
-#else
-      (void) comm; (void) source; (void) tag; (void) x;
-      DIY_UNSUPPORTED_MPI_CALL(MPI_Recv);
-#endif
-    }
-  };
-
-  // isend
-  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
-  struct isend;
+    static_assert(std::is_same<typename is_mpi_datatype<T>::type, true_type>::value, "is_mpi_datatype<T>::type must be true_type");
+    ssend(comm, dest, tag, address(x), count(x), datatype_of(x));
+  }
 
-  template<class T>
-  struct isend<T, true_type>
+  template <class T>
+  status recv(DIY_MPI_Comm comm, int source, int tag, T& x)
   {
-    request operator()(MPI_Comm comm, int dest, int tag, const T& x) const
-    {
-#ifndef DIY_NO_MPI
-      request r;
-      typedef       mpi_datatype<T>     Datatype;
-      MPI_Isend((void*) Datatype::address(x),
-                Datatype::count(x),
-                Datatype::datatype(),
-                dest, tag, comm, &r.r);
-      return r;
-#else
-      (void) comm; (void) dest; (void) tag; (void) x;
-      DIY_UNSUPPORTED_MPI_CALL(MPI_Isend);
-#endif
-    }
-  };
+    static_assert(std::is_same<typename is_mpi_datatype<T>::type, true_type>::value, "is_mpi_datatype<T>::type must be true_type");
+    return recv(comm, source, tag, address(x), count(x), datatype_of(x));
+  }
 
-  // issend
-  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
-  struct issend;
+  template <class T>
+  status recv(DIY_MPI_Comm comm, int source, int tag, std::vector<T>& x)
+  {
+    auto s = probe(comm, source, tag);
+    x.resize(static_cast<size_t>(s.count<T>()));
+    return recv(comm, source, tag, address(x), count(x), datatype_of(x));
+  }
 
-  template<class T>
-  struct issend<T, true_type>
+  template <class T>
+  request isend(DIY_MPI_Comm comm, int dest, int tag, const T& x)
   {
-    request operator()(MPI_Comm comm, int dest, int tag, const T& x) const
-    {
-#ifndef DIY_NO_MPI
-      request r;
-      typedef       mpi_datatype<T>     Datatype;
-      MPI_Issend((void*) Datatype::address(x),
-                Datatype::count(x),
-                Datatype::datatype(),
-                dest, tag, comm, &r.r);
-      return r;
-#else
-      (void) comm; (void) dest; (void) tag; (void) x;
-      DIY_UNSUPPORTED_MPI_CALL(MPI_Issend);
-#endif
-    }
-  };
+    static_assert(std::is_same<typename is_mpi_datatype<T>::type, true_type>::value, "is_mpi_datatype<T>::type must be true_type");
+    return isend(comm, dest, tag, address(x), count(x), datatype_of(x));
+  }
 
-  // irecv
-  template< class T, class is_mpi_datatype_ = typename is_mpi_datatype<T>::type >
-  struct irecv;
+  template <class T>
+  request issend(DIY_MPI_Comm comm, int dest, int tag, const T& x)
+  {
+    static_assert(std::is_same<typename is_mpi_datatype<T>::type, true_type>::value, "is_mpi_datatype<T>::type must be true_type");
+    return issend(comm, dest, tag, address(x), count(x), datatype_of(x));
+  }
 
-  template<class T>
-  struct irecv<T, true_type>
+  template <class T>
+  request irecv(DIY_MPI_Comm comm, int source, int tag, T& x)
   {
-    request operator()(MPI_Comm comm, int source, int tag, T& x) const
-    {
-#ifndef DIY_NO_MPI
-      request r;
-      typedef       mpi_datatype<T>     Datatype;
-      MPI_Irecv(Datatype::address(x),
-                Datatype::count(x),
-                Datatype::datatype(),
-                source, tag, comm, &r.r);
-      return r;
-#else
-      (void) comm; (void) source; (void) tag; (void) x;
-      DIY_UNSUPPORTED_MPI_CALL(MPI_Irecv);
-#endif
-    }
-  };
-}
+    static_assert(std::is_same<typename is_mpi_datatype<T>::type, true_type>::value, "is_mpi_datatype<T>::type must be true_type");
+    return irecv(comm, source, tag, address(x), count(x), datatype_of(x));
+  }
+
 }
 }
+} // diy::mpi::detail
+
+#ifndef DIY_MPI_AS_LIB
+#include "point-to-point.cpp"
+#endif
+
+#endif // DIY_MPI_POINT_TO_POINT_HPP
diff --git a/include/vtkdiy2/mpi/request.cpp b/include/vtkdiy2/mpi/request.cpp
new file mode 100644
index 00000000000..59920c9525d
--- /dev/null
+++ b/include/vtkdiy2/mpi/request.cpp
@@ -0,0 +1,45 @@
+#ifdef DIY_MPI_AS_LIB
+#include "request.hpp"
+#endif
+
+#include <algorithm>
+#include <iterator>
+
+#if defined(DIY_MPI_AS_LIB) && !DIY_HAS_MPI
+diy::mpi::request::request()
+{
+  std::fill(std::begin(this->handle.data), std::end(this->handle.data), nullptr);
+}
+#else
+diy::mpi::request::request() = default;
+#endif
+
+diy::mpi::status diy::mpi::request::wait()
+{
+#if DIY_HAS_MPI
+  status s;
+  MPI_Wait(&mpi_cast(handle), &mpi_cast(s.handle));
+  return s;
+#else
+  DIY_UNSUPPORTED_MPI_CALL(diy::mpi::request::wait);
+#endif
+}
+
+diy::mpi::optional<diy::mpi::status> diy::mpi::request::test()
+{
+#if DIY_HAS_MPI
+  status s;
+  int flag;
+  MPI_Test(&mpi_cast(handle), &flag, &mpi_cast(s.handle));
+  if (flag)
+    return s;
+#endif
+  return optional<status>();
+}
+
+void diy::mpi::request::cancel()
+{
+#if DIY_HAS_MPI
+  MPI_Cancel(&mpi_cast(handle));
+#endif
+}
diff --git a/include/vtkdiy2/mpi/request.hpp b/include/vtkdiy2/mpi/request.hpp
index b4f999cade1..8db9369e9e2 100644
--- a/include/vtkdiy2/mpi/request.hpp
+++ b/include/vtkdiy2/mpi/request.hpp
@@ -1,50 +1,29 @@
+#ifndef DIY_MPI_REQUEST_HPP
+#define DIY_MPI_REQUEST_HPP
+
+#include "config.hpp"
+#include "status.hpp"
+#include "optional.hpp"
+
 namespace diy
 {
 namespace mpi
 {
   struct request
   {
-    inline
-    status              wait();
-    inline
-    optional<status>    test();
-    inline
-    void                cancel();
+    DIY_MPI_EXPORT_FUNCTION                  request();
+    DIY_MPI_EXPORT_FUNCTION status           wait();
+    DIY_MPI_EXPORT_FUNCTION optional<status> test();
+    DIY_MPI_EXPORT_FUNCTION void             cancel();
 
-    MPI_Request         r;
+    DIY_MPI_Request handle;
   };
-}
-}
 
-diy::mpi::status
-diy::mpi::request::wait()
-{
-#ifndef DIY_NO_MPI
-  status s;
-  MPI_Wait(&r, &s.s);
-  return s;
-#else
-  DIY_UNSUPPORTED_MPI_CALL(diy::mpi::request::wait);
-#endif
 }
+} // diy::mpi
 
-diy::mpi::optional<diy::mpi::status>
-diy::mpi::request::test()
-{
-#ifndef DIY_NO_MPI
-  status s;
-  int flag;
-  MPI_Test(&r, &flag, &s.s);
-  if (flag)
-    return s;
+#ifndef DIY_MPI_AS_LIB
+#include "request.cpp"
 #endif
-  return optional<status>();
-}
 
-void
-diy::mpi::request::cancel()
-{
-#ifndef DIY_NO_MPI
-  MPI_Cancel(&r);
-#endif
-}
+#endif // DIY_MPI_REQUEST_HPP
diff --git a/include/vtkdiy2/mpi/status.cpp b/include/vtkdiy2/mpi/status.cpp
new file mode 100644
index 00000000000..af2af71062f
--- /dev/null
+++ b/include/vtkdiy2/mpi/status.cpp
@@ -0,0 +1,30 @@
+#ifdef DIY_MPI_AS_LIB
+#include "status.hpp"
+#endif
+
+int diy::mpi::status::source() const { return mpi_cast(handle).MPI_SOURCE; }
+int diy::mpi::status::tag() const    { return mpi_cast(handle).MPI_TAG; }
+int diy::mpi::status::error() const  { return mpi_cast(handle).MPI_ERROR; }
+
+bool diy::mpi::status::cancelled() const
+{
+#if DIY_HAS_MPI
+  int flag;
+  MPI_Test_cancelled(&mpi_cast(handle), &flag);
+  return flag;
+#else
+  DIY_UNSUPPORTED_MPI_CALL(diy::mpi::status::cancelled);
+#endif
+}
+
+int diy::mpi::status::count(const diy::mpi::datatype& type) const
+{
+#if DIY_HAS_MPI
+  int c;
+  MPI_Get_count(&mpi_cast(handle), mpi_cast(type.handle), &c);
+  return c;
+#else
+  (void) type;
+  DIY_UNSUPPORTED_MPI_CALL(diy::mpi::status::count);
+#endif
+}
diff --git a/include/vtkdiy2/mpi/status.hpp b/include/vtkdiy2/mpi/status.hpp
index e90978bd44e..8838561f937 100644
--- a/include/vtkdiy2/mpi/status.hpp
+++ b/include/vtkdiy2/mpi/status.hpp
@@ -1,49 +1,42 @@
+#ifndef DIY_MPI_STATUS_HPP
+#define DIY_MPI_STATUS_HPP
+
+#include "config.hpp"
+#include "datatypes.hpp"
+
 namespace diy
 {
 namespace mpi
 {
   struct status
   {
-    int             source() const          { return s.MPI_SOURCE; }
-    int             tag() const             { return s.MPI_TAG; }
-    int             error() const           { return s.MPI_ERROR; }
+    status() = default;
+    status(const DIY_MPI_Status& s) : handle(s) {}
 
-    inline
-    bool            cancelled() const;
+#ifndef DIY_MPI_AS_LIB // only available in header-only mode
+    status(const MPI_Status& s) : handle(s) {}
+    operator MPI_Status() { return handle; }
+#endif
 
-    template<class T>
-    int             count() const;
+    DIY_MPI_EXPORT_FUNCTION int  source() const;
+    DIY_MPI_EXPORT_FUNCTION int  tag() const;
+    DIY_MPI_EXPORT_FUNCTION int  error() const;
+    DIY_MPI_EXPORT_FUNCTION bool cancelled() const;
+    DIY_MPI_EXPORT_FUNCTION int  count(const datatype& type) const;
 
-                    operator MPI_Status&()              { return s; }
-                    operator const MPI_Status&() const  { return s; }
+    template<class T>       int count() const
+    {
+      return this->count(detail::get_mpi_datatype<T>());
+    }
 
-    MPI_Status      s;
+    DIY_MPI_Status handle;
   };
-}
-}
-
 
-bool
-diy::mpi::status::cancelled() const
-{
-#ifndef DIY_NO_MPI
-  int flag;
-  MPI_Test_cancelled(const_cast<MPI_Status*>(&s), &flag);
-  return flag;
-#else
-  DIY_UNSUPPORTED_MPI_CALL(diy::mpi::status::cancelled);
-#endif
 }
+} // diy::mpi
 
-template<class T>
-int
-diy::mpi::status::count() const
-{
-#ifndef DIY_NO_MPI
-  int c;
-  MPI_Get_count(const_cast<MPI_Status*>(&s), detail::get_mpi_datatype<T>(), &c);
-  return c;
-#else
-  DIY_UNSUPPORTED_MPI_CALL(diy::mpi::status::count);
+#ifndef DIY_MPI_AS_LIB
+#include "status.cpp"
 #endif
-}
+
+#endif // DIY_MPI_STATUS_HPP
diff --git a/include/vtkdiy2/mpi/window.cpp b/include/vtkdiy2/mpi/window.cpp
new file mode 100644
index 00000000000..ec20d0b3e1c
--- /dev/null
+++ b/include/vtkdiy2/mpi/window.cpp
@@ -0,0 +1,226 @@
+#ifdef DIY_MPI_AS_LIB
+#include "window.hpp"
+#endif
+
+#include <algorithm>
+
+namespace diy
+{
+namespace mpi
+{
+
+#ifdef DIY_MPI_AS_LIB
+#  ifdef _MSC_VER
+#    define EXPORT_MACRO DIY_MPI_EXPORT
+#  else
+#    define EXPORT_MACRO
+#  endif
+EXPORT_MACRO const int nocheck  = MPI_MODE_NOCHECK;
+#  undef EXPORT_MACRO
+#endif
+
+namespace detail
+{
+
+DIY_MPI_Win win_allocate(const communicator& comm, void** base, unsigned size, int disp)
+{
+#if DIY_HAS_MPI
+  DIY_MPI_Win win;
+  MPI_Win_allocate(size, disp, MPI_INFO_NULL, mpi_cast(comm.handle()), base, &mpi_cast(win));
+  return win;
+#else
+  (void)comm; (void)disp;
+  *base = malloc(size);
+  auto mpi_win = MPI_Win(*base, true);
+  auto win = make_DIY_MPI_Win(std::move(mpi_win));
+  return win;
+#endif
+}
+
+DIY_MPI_Win win_create(const communicator& comm, void* base, unsigned size, int disp)
+{
+#if DIY_HAS_MPI
+  DIY_MPI_Win win;
+  MPI_Win_create(base, size, disp, MPI_INFO_NULL, mpi_cast(comm.handle()), &mpi_cast(win));
+  return win;
+#else
+  (void)comm; (void)size; (void)disp;
+  auto mpi_win = MPI_Win(base);
+  auto win = make_DIY_MPI_Win(std::move(mpi_win));
+  return win;
+#endif
+}
+
+void win_free(DIY_MPI_Win& win)
+{
+#if DIY_HAS_MPI
+  MPI_Win_free(&mpi_cast(win));
+#else
+  auto& mpi_win = mpi_cast(win);
+  if (mpi_win.owned())
+    free(mpi_win.data());
+#endif
+}
+
+void put(const DIY_MPI_Win& win, const void* data, int count, const datatype& type, int rank, unsigned offset)
+{
+#if DIY_HAS_MPI
+  MPI_Put(data, count, mpi_cast(type.handle), rank, offset, count, mpi_cast(type.handle), mpi_cast(win));
+#else
+  void* buffer = mpi_cast(win).data();
+  size_t size = mpi_cast(type.handle);
+  std::copy_n(static_cast<const int8_t*>(data),
+              size * static_cast<size_t>(count),
+              static_cast<int8_t*>(buffer) + (offset * size));
+  (void)rank;
+#endif
+}
+
+void get(const DIY_MPI_Win& win, void* data, int count, const datatype& type, int rank, unsigned offset)
+{
+#if DIY_HAS_MPI
+  MPI_Get(data, count, mpi_cast(type.handle), rank, offset, count, mpi_cast(type.handle), mpi_cast(win));
+#else
+  const void* buffer = mpi_cast(win).data();
+  size_t size = mpi_cast(type.handle);
+  std::copy_n(static_cast<const int8_t*>(buffer) + (offset * size),
+              size * static_cast<size_t>(count),
+              static_cast<int8_t*>(data));
+  (void)rank;
+#endif
+}
+
+void fence(const DIY_MPI_Win& win, int assert)
+{
+#if DIY_HAS_MPI
+  MPI_Win_fence(assert, mpi_cast(win));
+#else
+  (void) win; (void) assert;
+#endif
+}
+
+void lock(const DIY_MPI_Win& win, int lock_type, int rank, int assert)
+{
+#if DIY_HAS_MPI
+  MPI_Win_lock(lock_type, rank, assert, mpi_cast(win));
+#else
+  (void) win; (void) lock_type; (void) rank; (void) assert;
+#endif
+}
+
+void unlock(const DIY_MPI_Win& win, int rank)
+{
+#if DIY_HAS_MPI
+  MPI_Win_unlock(rank, mpi_cast(win));
+#else
+  (void) win; (void) rank;
+#endif
+}
+
+void lock_all(const DIY_MPI_Win& win, int assert)
+{
+#if DIY_HAS_MPI
+  MPI_Win_lock_all(assert, mpi_cast(win));
+#else
+  (void) win; (void) assert;
+#endif
+}
+
+void unlock_all(const DIY_MPI_Win& win)
+{
+#if DIY_HAS_MPI
+  MPI_Win_unlock_all(mpi_cast(win));
+#else
+  (void) win;
+#endif
+}
+
+void fetch_and_op(const DIY_MPI_Win& win,
+                  const void* origin, void* result, const datatype& type,
+                  int rank, unsigned offset,
+                  const operation& op)
+{
+#if DIY_HAS_MPI
+  MPI_Fetch_and_op(origin, result, mpi_cast(type.handle), rank, offset, mpi_cast(op.handle), mpi_cast(win));
+#else
+  (void) win; (void) origin; (void) result; (void) type; (void) rank; (void) offset; (void) op;
+  DIY_UNSUPPORTED_MPI_CALL(MPI_Fetch_and_op);
+#endif
+}
+
+void fetch(const DIY_MPI_Win& win, void* result, const datatype& type, int rank, unsigned offset)
+{
+#if DIY_HAS_MPI
+  MPI_Fetch_and_op(nullptr, result, mpi_cast(type.handle), rank, offset, MPI_NO_OP, mpi_cast(win));
+#else
+  (void) rank;
+  const void* buffer = mpi_cast(win).data();
+  size_t size = mpi_cast(type.handle);
+  std::copy_n(static_cast<const int8_t*>(buffer) + (offset * size),
+              size,
+              static_cast<int8_t*>(result));
+#endif
+}
+
+void replace(const DIY_MPI_Win& win, const void* value, const datatype& type, int rank, unsigned offset)
+{
+#if DIY_HAS_MPI
+  MPI_Fetch_and_op(value, nullptr, mpi_cast(type.handle), rank, offset, MPI_REPLACE, mpi_cast(win));
+#else
+  (void) rank;
+  void* buffer = mpi_cast(win).data();
+  size_t size = mpi_cast(type.handle);
+  std::copy_n(static_cast<const int8_t*>(value),
+              size,
+              static_cast<int8_t*>(buffer) + (offset * size));
+#endif
+}
+
+void sync(const DIY_MPI_Win& win)
+{
+#if DIY_HAS_MPI
+  MPI_Win_sync(mpi_cast(win));
+#else
+  (void) win;
+#endif
+}
+
+void flush(const DIY_MPI_Win& win, int rank)
+{
+#if DIY_HAS_MPI
+  MPI_Win_flush(rank, mpi_cast(win));
+#else
+  (void) win; (void) rank;
+#endif
+}
+
+void flush_all(const DIY_MPI_Win& win)
+{
+#if DIY_HAS_MPI
+  MPI_Win_flush_all(mpi_cast(win));
+#else
+  (void) win;
+#endif
+}
+
+void flush_local(const DIY_MPI_Win& win, int rank)
+{
+#if DIY_HAS_MPI
+  MPI_Win_flush_local(rank, mpi_cast(win));
+#else
+  (void) win; (void) rank;
+#endif
+}
+
+void flush_local_all(const DIY_MPI_Win& win)
+{
+#if DIY_HAS_MPI
+  MPI_Win_flush_local_all(mpi_cast(win));
+#else
+  (void) win;
+#endif
+}
+
+}
+}
+} // diy::mpi::detail
diff --git a/include/vtkdiy2/mpi/window.hpp b/include/vtkdiy2/mpi/window.hpp
index 7bf941ee29d..2088ce2a07c 100644
--- a/include/vtkdiy2/mpi/window.hpp
+++ b/include/vtkdiy2/mpi/window.hpp
@@ -1,10 +1,92 @@
+#ifndef DIY_MPI_WINODW_HPP
+#define DIY_MPI_WINODW_HPP
+
+#include "config.hpp"
+#include "communicator.hpp"
+#include "operations.hpp"
+
 #include <type_traits>
+#include <vector>
 
 namespace diy
 {
 namespace mpi
 {
 
+#ifndef DIY_MPI_AS_LIB
+constexpr int nocheck  = MPI_MODE_NOCHECK;
+#else
+DIY_MPI_EXPORT extern const int nocheck;
+#endif
+
+namespace detail
+{
+
+DIY_MPI_EXPORT_FUNCTION
+DIY_MPI_Win win_allocate(const communicator& comm, void** base, unsigned size, int disp);
+
+DIY_MPI_EXPORT_FUNCTION
+DIY_MPI_Win win_create(const communicator& comm, void* base, unsigned size, int disp);
+
+DIY_MPI_EXPORT_FUNCTION
+void win_free(DIY_MPI_Win& win);
+
+DIY_MPI_EXPORT_FUNCTION
+void put(const DIY_MPI_Win& win,
+         const void* data, int count, const datatype& type,
+         int rank, unsigned offset);
+
+DIY_MPI_EXPORT_FUNCTION
+void get(const DIY_MPI_Win& win,
+         void* data, int count, const datatype& type,
+         int rank, unsigned offset);
+
+DIY_MPI_EXPORT_FUNCTION
+void fence(const DIY_MPI_Win& win, int assert);
+
+DIY_MPI_EXPORT_FUNCTION
+void lock(const DIY_MPI_Win& win, int lock_type, int rank, int assert);
+
+DIY_MPI_EXPORT_FUNCTION
+void unlock(const DIY_MPI_Win& win, int rank);
+
+DIY_MPI_EXPORT_FUNCTION
+void lock_all(const DIY_MPI_Win& win, int assert);
+
+DIY_MPI_EXPORT_FUNCTION
+void unlock_all(const DIY_MPI_Win& win);
+
+DIY_MPI_EXPORT_FUNCTION
+void fetch_and_op(const DIY_MPI_Win& win,
+                  const void* origin, void* result, const datatype& type,
+                  int rank, unsigned offset,
+                  const operation& op);
+
+DIY_MPI_EXPORT_FUNCTION
+void fetch(const DIY_MPI_Win& win, void* result, const datatype& type, int rank, unsigned offset);
+
+DIY_MPI_EXPORT_FUNCTION
+void replace(const DIY_MPI_Win& win,
+             const void* value, const datatype& type,
+             int rank, unsigned offset);
+
+DIY_MPI_EXPORT_FUNCTION
+void sync(const DIY_MPI_Win& win);
+
+DIY_MPI_EXPORT_FUNCTION
+void flush(const DIY_MPI_Win& win, int rank);
+
+DIY_MPI_EXPORT_FUNCTION
+void flush_all(const DIY_MPI_Win& win);
+
+DIY_MPI_EXPORT_FUNCTION
+void flush_local(const DIY_MPI_Win& win, int rank);
+
+DIY_MPI_EXPORT_FUNCTION
+void flush_local_all(const DIY_MPI_Win& win);
+
+} // detail
+
     //! \ingroup MPI
     //! Simple wrapper around MPI window functions.
     template<class T>
@@ -17,8 +99,8 @@ namespace mpi
             inline ~window();
 
             // moving is Ok
-            window(window&&)      = default;
-            window& operator=(window&&) = default;
+            inline window(window&&);
+            inline window& operator=(window&&);
 
             // cannot copy because of the buffer_
             window(const window&) = delete;
@@ -38,7 +120,7 @@ namespace mpi
             inline void lock_all(int assert = 0);
             inline void unlock_all();
 
-            inline void fetch_and_op(const T* origin, T* result, int rank, unsigned offset, MPI_Op op);
+            inline void fetch_and_op(const T* origin, T* result, int rank, unsigned offset, const operation& op);
             inline void fetch(T& result, int rank, unsigned offset);
             inline void replace(const T& value, int rank, unsigned offset);
 
@@ -50,32 +132,57 @@ namespace mpi
             inline void flush_local_all();
 
         private:
-            std::vector<T>      buffer_;
+            void*               buffer_;
             int                 rank_;
-#ifndef DIY_NO_MPI
-            MPI_Win             window_;
-#endif
+            DIY_MPI_Win         window_;
     };
+
 } // mpi
 } // diy
 
 template<class T>
 diy::mpi::window<T>::
-window(const communicator& comm, unsigned size):
-  buffer_(size), rank_(comm.rank())
+window(const diy::mpi::communicator& comm, unsigned size):
+  buffer_(nullptr), rank_(comm.rank())
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_create(buffer_.data(), buffer_.size()*sizeof(T), sizeof(T), MPI_INFO_NULL, comm, &window_);
-#endif
+  window_ = detail::win_allocate(comm, &buffer_, static_cast<unsigned>(size*sizeof(T)), static_cast<int>(sizeof(T)));
 }
 
 template<class T>
 diy::mpi::window<T>::
 ~window()
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_free(&window_);
-#endif
+  if (buffer_)
+    detail::win_free(window_);
+}
+
+template<class T>
+diy::mpi::window<T>::
+window(window&& rhs):
+  buffer_(rhs.buffer_), rank_(rhs.rank_), window_(std::move(rhs.window_))
+{
+  rhs.buffer_ = nullptr;
+  rhs.window_.reset();
+}
+
+template<class T>
+diy::mpi::window<T>&
+diy::mpi::window<T>::
+operator=(window&& rhs)
+{
+  if (this == &rhs)
+    return *this;
+
+  if (buffer_)
+    detail::win_free(window_);
+
+  buffer_ = rhs.buffer_;
+  rhs.buffer_ = nullptr;
+  rank_ = rhs.rank_;
+  window_ = std::move(rhs.window_);
+  rhs.window_.reset();
+
+  return *this;
 }
 
 template<class T>
@@ -83,15 +190,7 @@ void
 diy::mpi::window<T>::
 put(const T& x, int rank, unsigned offset)
 {
-#ifndef DIY_NO_MPI
-    MPI_Put(address(x), count(x), datatype(x),
-            rank,
-            offset,
-            count(x), datatype(x),
-            window_);
-#else
-    buffer_[offset] = x;
-#endif
+  detail::put(window_, address(x), count(x), datatype_of(x), rank, offset);
 }
 
 template<class T>
@@ -99,16 +198,7 @@ void
 diy::mpi::window<T>::
 put(const std::vector<T>& x, int rank, unsigned offset)
 {
-#ifndef DIY_NO_MPI
-    MPI_Put(address(x), count(x), datatype(x),
-            rank,
-            offset,
-            count(x), datatype(x),
-            window_);
-#else
-    for (size_t i = 0; i < x.size(); ++i)
-        buffer_[offset + i] = x[i];
-#endif
+  detail::put(window_, address(x), count(x), datatype_of(x), rank, offset);
 }
 
 template<class T>
@@ -116,15 +206,7 @@ void
 diy::mpi::window<T>::
 get(T& x, int rank, unsigned offset)
 {
-#ifndef DIY_NO_MPI
-    MPI_Get(address(x), count(x), datatype(x),
-            rank,
-            offset,
-            count(x), datatype(x),
-            window_);
-#else
-    x = buffer_[offset];
-#endif
+  detail::get(window_, address(x), count(x), datatype_of(x), rank, offset);
 }
 
 template<class T>
@@ -132,16 +214,7 @@ void
 diy::mpi::window<T>::
 get(std::vector<T>& x, int rank, unsigned offset)
 {
-#ifndef DIY_NO_MPI
-    MPI_Get(address(x), count(x), datatype(x),
-            rank,
-            offset,
-            count(x), datatype(x),
-            window_);
-#else
-    for (size_t i = 0; i < x.size(); ++i)
-        x[i] = buffer_[offset + i];
-#endif
+  detail::get(window_, address(x), count(x), datatype_of(x), rank, offset);
 }
 
 template<class T>
@@ -149,9 +222,7 @@ void
 diy::mpi::window<T>::
 fence(int assert)
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_fence(assert, window_);
-#endif
+  detail::fence(window_, assert);
 }
 
 template<class T>
@@ -159,9 +230,7 @@ void
 diy::mpi::window<T>::
 lock(int lock_type, int rank, int assert)
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_lock(lock_type, rank, assert, window_);
-#endif
+  detail::lock(window_, lock_type, rank, assert);
 }
 
 template<class T>
@@ -169,9 +238,7 @@ void
 diy::mpi::window<T>::
 unlock(int rank)
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_unlock(rank, window_);
-#endif
+  detail::unlock(window_, rank);
 }
 
 template<class T>
@@ -179,9 +246,7 @@ void
 diy::mpi::window<T>::
 lock_all(int assert)
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_lock_all(assert, window_);
-#endif
+  detail::lock_all(window_, assert);
 }
 
 template<class T>
@@ -189,20 +254,15 @@ void
 diy::mpi::window<T>::
 unlock_all()
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_unlock_all(window_);
-#endif
+  detail::unlock_all(window_);
 }
+
 template<class T>
 void
 diy::mpi::window<T>::
-fetch_and_op(const T* origin, T* result, int rank, unsigned offset, MPI_Op op)
+fetch_and_op(const T* origin, T* result, int rank, unsigned offset, const diy::mpi::operation& op)
 {
-#ifndef DIY_NO_MPI
-    MPI_Fetch_and_op(origin, result, datatype(*origin), rank, offset, op, window_);
-#else
-    DIY_UNSUPPORTED_MPI_CALL(MPI_Fetch_and_op);
-#endif
+  detail::fetch_and_op(window_, origin, result, datatype_of(*origin), rank, offset, op);
 }
 
 template<class T>
@@ -210,12 +270,7 @@ void
 diy::mpi::window<T>::
 fetch(T& result, int rank, unsigned offset)
 {
-#ifndef DIY_NO_MPI
-    T unused;
-    fetch_and_op(&unused, &result, rank, offset, MPI_NO_OP);
-#else
-    result = buffer_[offset];
-#endif
+  detail::fetch(window_, &result, datatype_of(result), rank, offset);
 }
 
 template<class T>
@@ -223,12 +278,7 @@ void
 diy::mpi::window<T>::
 replace(const T& value, int rank, unsigned offset)
 {
-#ifndef DIY_NO_MPI
-    T unused;
-    fetch_and_op(&value, &unused, rank, offset, MPI_REPLACE);
-#else
-    buffer_[offset] = value;
-#endif
+  detail::replace(window_, &value, datatype_of(value), rank, offset);
 }
 
 template<class T>
@@ -236,9 +286,7 @@ void
 diy::mpi::window<T>::
 sync()
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_sync(window_);
-#endif
+  detail::sync(window_);
 }
 
 template<class T>
@@ -246,9 +294,7 @@ void
 diy::mpi::window<T>::
 flush(int rank)
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_flush(rank, window_);
-#endif
+  detail::flush(window_, rank);
 }
 
 template<class T>
@@ -256,9 +302,7 @@ void
 diy::mpi::window<T>::
 flush_all()
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_flush_all(window_);
-#endif
+  detail::flush_all(window_);
 }
 
 template<class T>
@@ -266,9 +310,7 @@ void
 diy::mpi::window<T>::
 flush_local(int rank)
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_flush_local(rank, window_);
-#endif
+  detail::flush_local(window_, rank);
 }
 
 template<class T>
@@ -276,7 +318,11 @@ void
 diy::mpi::window<T>::
 flush_local_all()
 {
-#ifndef DIY_NO_MPI
-    MPI_Win_flush_local_all(window_);
-#endif
+  detail::flush_local_all(window_);
 }
+
+#ifndef DIY_MPI_AS_LIB
+#include "window.cpp"
+#endif
+
+#endif // DIY_MPI_WINODW_HPP
diff --git a/include/vtkdiy2/no-thread.hpp b/include/vtkdiy2/no-thread.hpp
index 218c3067d05..12533c0f610 100644
--- a/include/vtkdiy2/no-thread.hpp
+++ b/include/vtkdiy2/no-thread.hpp
@@ -18,6 +18,8 @@ namespace diy
     template<class Function, class... Args>
     explicit            thread(Function&& f, Args&&... args)      { f(args...); }       // not ideal, since it doesn't support member functions
 
+    thread&             operator=(thread&&)                       = default;
+
     void                join()                                    {}
 
     static unsigned     hardware_concurrency()                    { return 1; }
@@ -31,8 +33,13 @@ namespace diy
   struct lock_guard
   {
       lock_guard(T&)        {}
+      void lock()           {}
+      void unlock()         {}
   };
 
+  template<class T, class U>
+  using concurrent_map = std::map<T,U>;
+
   namespace this_thread
   {
       inline unsigned long int  get_id()    { return 0; }
diff --git a/include/vtkdiy2/proxy.hpp b/include/vtkdiy2/proxy.hpp
index 00bf8279fa4..8f33c98b773 100644
--- a/include/vtkdiy2/proxy.hpp
+++ b/include/vtkdiy2/proxy.hpp
@@ -1,6 +1,7 @@
 #ifndef DIY_PROXY_HPP
 #define DIY_PROXY_HPP
 
+#include "coroutine.hpp"
 
 namespace diy
 {
@@ -10,17 +11,88 @@ namespace diy
     template <class T>
     struct EnqueueIterator;
 
+    using IncomingQueues = std::map<int,     MemoryBuffer>;
+    using OutgoingQueues = std::map<BlockID, MemoryBuffer>;
+
                         Proxy(Master* master__, int gid__,
                               IExchangeInfo*  iexchange__ = 0):
                           gid_(gid__),
                           master_(master__),
                           iexchange_(iexchange__),
-                          incoming_(&master__->incoming(gid__)),
-                          outgoing_(&master__->outgoing(gid__)),
-                          collectives_(&master__->collectives(gid__))   {}
+                          collectives_(&master__->collectives(gid__))
+    {
+        fill_incoming();
+
+        // move outgoing_ back into proxy, in case it's a multi-foreach round
+        if (!iexchange_)
+            for (auto& x : master_->outgoing(gid_))
+            {
+                auto access = x.second.access();
+                if (!access->empty())
+                {
+                    outgoing_.emplace(x.first, access->back().move());
+                    access->pop_back();
+                }
+            }
+    }
+
+    // delete copy constructor to avoid coping incoming_ and outgoing_ (plus it
+    // won't work otherwise because MemoryBuffer has a deleted copy
+    // constructor)
+                        Proxy(const Proxy&)     =delete;
+                        Proxy(Proxy&&)          =default;
+    Proxy&              operator=(const Proxy&) =delete;
+    Proxy&              operator=(Proxy&&)      =default;
+
+                        ~Proxy()
+    {
+        auto& outgoing = master_->outgoing(gid_);
+        auto& incoming = master_->incoming(gid_);
+
+        // copy out outgoing_
+        for (auto& x : outgoing_)
+        {
+            outgoing[x.first].access()->emplace_back(std::move(x.second));
+            if (iexchange_)
+                iexchange_->inc_work();
+        }
+
+        // move incoming_ back into master, in case it's a multi-foreach round
+        if (!iexchange_)
+            for (auto& x : incoming_)
+                incoming[x.first].access()->emplace_front(std::move(x.second));
+    }
+
+    void                init()
+    {
+        collectives_ = &master()->collectives(gid());
+    }
 
     int                 gid() const                                     { return gid_; }
 
+    bool                fill_incoming() const
+    {
+        bool exists = false;
+
+        incoming_.clear();
+
+        // fill incoming_
+        for (auto& x : master_->incoming(gid_))
+        {
+            auto access = x.second.access();
+            if (!access->empty())
+            {
+                exists = true;
+                incoming_.emplace(x.first, access->front().move());
+                access->pop_front();
+                if (iexchange_)
+                    iexchange_->dec_work();
+            }
+        }
+
+        return exists;
+    }
+
     //! Enqueue data whose size can be determined automatically, e.g., an STL vector.
     template<class T>
     void                enqueue(const BlockID&  to,                                     //!< target block (gid,proc)
@@ -28,13 +100,7 @@ namespace diy
                                 void (*save)(BinaryBuffer&, const T&) = &::diy::save    //!< optional serialization function
                                ) const
     {
-        OutgoingQueues& out = *outgoing_; save(out[to], x);
-
-        if (iexchange_ && iexchange_->fine())
-        {
-            GidSendOrder gid_order;             // uninitialized, not needed
-            master()->comm_exchange(gid_order, iexchange_);
-        }
+        save(outgoing_[to], x);
     }
 
     //! Enqueue data whose size is given explicitly by the user, e.g., an array.
@@ -45,6 +111,12 @@ namespace diy
                                 void (*save)(BinaryBuffer&, const T&) = &::diy::save    //!< optional serialization function
                                ) const;
 
+    void inline         enqueue_blob
+                               (const BlockID&  to,                                     //!< target block (gid,proc)
+                                const char*     x,                                      //!< pointer to the data
+                                size_t          n                                       //!< size in data elements (eg. ints)
+                               ) const;
+
     //! Dequeue data whose size can be determined automatically (e.g., STL vector) and that was
     //! previously enqueued so that diy knows its size when it is received.
     //! In this case, diy will allocate the receive buffer; the user does not need to do so.
@@ -53,7 +125,7 @@ namespace diy
                                 T&              x,                                      //!< data (eg. STL vector)
                                 void (*load)(BinaryBuffer&, T&) = &::diy::load          //!< optional serialization function
                                ) const
-    { IncomingQueues& in  = *incoming_; load(in[from], x); }
+    { load(incoming_[from], x); }
 
     //! Dequeue an array of data whose size is given explicitly by the user.
     //! In this case, the user needs to allocate the receive buffer prior to calling dequeue.
@@ -82,17 +154,20 @@ namespace diy
                                 void (*load)(BinaryBuffer&, T&) = &::diy::load          //!< optional serialization function
                                ) const                                  { dequeue(from.gid, x, n, load); }
 
+    BinaryBlob inline   dequeue_blob
+                               (int  from) const;
+
     template<class T>
     EnqueueIterator<T>  enqueuer(const T& x,
                                  void (*save)(BinaryBuffer&, const T&) = &::diy::save   ) const
     { return EnqueueIterator<T>(this, x, save); }
 
-    IncomingQueues*     incoming() const                                { return incoming_; }
-    MemoryBuffer&       incoming(int from) const                        { return (*incoming_)[from]; }
+    IncomingQueues*     incoming() const                                { return &incoming_; }
+    MemoryBuffer&       incoming(int from) const                        { return incoming_[from]; }
     inline void         incoming(std::vector<int>& v) const;            // fill v with every gid from which we have a message
 
-    OutgoingQueues*     outgoing() const                                { return outgoing_; }
-    MemoryBuffer&       outgoing(const BlockID& to) const               { return (*outgoing_)[to]; }
+    OutgoingQueues*     outgoing() const                                { return &outgoing_; }
+    MemoryBuffer&       outgoing(const BlockID& to) const               { return outgoing_[to]; }
 
     inline bool         empty_incoming_queues() const;
     inline bool         empty_outgoing_queues() const;
@@ -134,19 +209,30 @@ namespace diy
     Master*             master() const                                  { return master_; }
     IExchangeInfo*      iexchange() const                               { return iexchange_; }
 
+    // Coroutine machinery
+    void                set_main(coroutine::cothread_t main)            { main_ = main; }
+    void                yield() const                                   { coroutine::co_switch(main_); }
+    void                set_done(bool x)                                { done_ = x; }
+    bool                done() const                                    { return done_; }
+
     private:
       int               gid_;
       Master*           master_;
       IExchangeInfo*    iexchange_;
 
-      IncomingQueues*   incoming_;
-      OutgoingQueues*   outgoing_;
+      // TODO: these are marked mutable to not have to undo consts on enqueue/dequeue, in case it breaks things;
+      //       eventually, implement this change
+      mutable IncomingQueues    incoming_;
+      mutable OutgoingQueues    outgoing_;
+
       CollectivesList*  collectives_;
+
+      coroutine::cothread_t main_;
+      bool                  done_ = false;
   };
 
   template<class T>
-  struct Master::Proxy::EnqueueIterator:
-    public std::iterator<std::output_iterator_tag, void, void, void, void>
+  struct Master::Proxy::EnqueueIterator
   {
     typedef     void (*SaveT)(BinaryBuffer&, const T&);
 
@@ -168,10 +254,10 @@ namespace diy
 
   struct Master::ProxyWithLink: public Master::Proxy
   {
-            ProxyWithLink(const Proxy&    proxy,
+            ProxyWithLink(Proxy&&         proxy,
                           void*           block__,
                           Link*           link__):
-              Proxy(proxy),
+              Proxy(std::move(proxy)),
               block_(block__),
               link_(link__)                                         {}
 
@@ -188,8 +274,8 @@ void
 diy::Master::Proxy::
 incoming(std::vector<int>& v) const
 {
-  for (IncomingQueues::const_iterator it = incoming_->begin(); it != incoming_->end(); ++it)
-    v.push_back(it->first);
+  for (auto& x : incoming_)
+    v.push_back(x.first);
 }
 
 bool
@@ -262,19 +348,12 @@ diy::Master::Proxy::
 enqueue(const BlockID& to, const T* x, size_t n,
         void (*save)(BinaryBuffer&, const T&)) const
 {
-    OutgoingQueues& out = *outgoing_;
-    BinaryBuffer&   bb  = out[to];
+    BinaryBuffer&   bb  = outgoing_[to];
     if (save == (void (*)(BinaryBuffer&, const T&)) &::diy::save<T>)
         diy::save(bb, x, n);       // optimized for unspecialized types
     else
         for (size_t i = 0; i < n; ++i)
             save(bb, x[i]);
-
-    if (iexchange_ && iexchange_->fine())
-    {
-        GidSendOrder gid_order;             // uninitialized, not needed
-        master()->comm_exchange(gid_order, iexchange_);
-    }
 }
 
 template<class T>
@@ -283,8 +362,7 @@ diy::Master::Proxy::
 dequeue(int from, T* x, size_t n,
         void (*load)(BinaryBuffer&, T&)) const
 {
-    IncomingQueues& in = *incoming_;
-    BinaryBuffer&   bb = in[from];
+    BinaryBuffer&   bb = incoming_[from];
     if (load == (void (*)(BinaryBuffer&, T&)) &::diy::load<T>)
         diy::load(bb, x, n);       // optimized for unspecialized types
     else
@@ -292,5 +370,20 @@ dequeue(int from, T* x, size_t n,
             load(bb, x[i]);
 }
 
+void
+diy::Master::Proxy::
+enqueue_blob(const BlockID& to, const char* x, size_t n) const
+{
+    BinaryBuffer&   bb  = outgoing_[to];
+    bb.save_binary_blob(x,n);
+}
+
+diy::BinaryBlob
+diy::Master::Proxy::
+dequeue_blob(int from) const
+{
+    BinaryBuffer&   bb = incoming_[from];
+    return bb.load_binary_blob();
+}
 
 #endif
diff --git a/include/vtkdiy2/reduce.hpp b/include/vtkdiy2/reduce.hpp
index 0dac8327f57..44e4c8b6268 100644
--- a/include/vtkdiy2/reduce.hpp
+++ b/include/vtkdiy2/reduce.hpp
@@ -16,13 +16,13 @@ struct ReduceProxy: public Master::Proxy
 {
     typedef     std::vector<int>                            GIDVector;
 
-    ReduceProxy(const Master::Proxy&    proxy, //!< parent proxy
+    ReduceProxy(Master::Proxy&&         proxy, //!< parent proxy
                 void*                   block, //!< diy block
                 unsigned                round, //!< current round
                 const Assigner&         assigner, //!< assigner
                 const GIDVector&        incoming_gids, //!< incoming gids in this group
                 const GIDVector&        outgoing_gids): //!< outgoing gids in this group
-      Master::Proxy(proxy),
+      Master::Proxy(std::move(proxy)),
       block_(block),
       round_(round),
       assigner_(assigner)
@@ -46,13 +46,13 @@ struct ReduceProxy: public Master::Proxy
       }
     }
 
-    ReduceProxy(const Master::Proxy&    proxy, //!< parent proxy
+    ReduceProxy(Master::Proxy&&         proxy, //!< parent proxy
                 void*                   block, //!< diy block
                 unsigned                round, //!< current round
                 const Assigner&         assigner,
                 const Link&             in_link,
                 const Link&             out_link):
-      Master::Proxy(proxy),
+      Master::Proxy(std::move(proxy)),
       block_(block),
       round_(round),
       assigner_(assigner),
@@ -138,7 +138,7 @@ void reduce(Master&                    master,        //!< master object
       }
     }
     master.set_expected(expected);
-    master.flush();
+    master.flush(false);
   }
   // final round
   log->debug("Round {}", round);
@@ -170,7 +170,7 @@ namespace detail
   {
     using Callback = std::function<void(Block*, const ReduceProxy&, const Partners&)>;
 
-                ReductionFunctor(unsigned round_, const Callback& reduce_, const Partners& partners_, const Assigner& assigner_):
+                ReductionFunctor(int round_, const Callback& reduce_, const Partners& partners_, const Assigner& assigner_):
                     round(round_), reduce(reduce_), partners(partners_), assigner(assigner_)        {}
 
     void        operator()(Block* b, const Master::ProxyWithLink& cp) const
@@ -180,20 +180,20 @@ namespace detail
       std::vector<int> incoming_gids, outgoing_gids;
       if (round > 0)
           partners.incoming(round, cp.gid(), incoming_gids, *cp.master());        // receive from the previous round
-      if (round < partners.rounds())
+      if (round < static_cast<int>(partners.rounds()))
           partners.outgoing(round, cp.gid(), outgoing_gids, *cp.master());        // send to the next round
 
-      ReduceProxy   rp(cp, b, round, assigner, incoming_gids, outgoing_gids);
+      ReduceProxy   rp(std::move(const_cast<Master::ProxyWithLink&>(cp)), b, round, assigner, incoming_gids, outgoing_gids);
       reduce(b, rp, partners);
 
       // touch the outgoing queues to make sure they exist
-      Master::OutgoingQueues& outgoing = *cp.outgoing();
-      if (outgoing.size() < (size_t) rp.out_link().size())
-        for (int j = 0; j < rp.out_link().size(); ++j)
-          outgoing[rp.out_link().target(j)];       // touch the outgoing queue, creating it if necessary
+      Master::Proxy::OutgoingQueues& outgoing = *rp.outgoing();
+      if (outgoing.size() < static_cast<size_t>(rp.out_link().size()))
+        for (BlockID target : rp.out_link().neighbors())
+          outgoing[target];       // touch the outgoing queue, creating it if necessary
     }
 
-    unsigned        round;
+    int             round;
     Callback        reduce;
     Partners        partners;
     const Assigner& assigner;
diff --git a/include/vtkdiy2/resolve.hpp b/include/vtkdiy2/resolve.hpp
index 78e4050ee70..07158df2647 100644
--- a/include/vtkdiy2/resolve.hpp
+++ b/include/vtkdiy2/resolve.hpp
@@ -20,7 +20,7 @@ record_local_gids(const diy::Master& master, diy::DynamicAssigner& assigner)
 {
     // figure out local ranks
     std::vector<std::tuple<int,int>> local_gids;
-    for (int i = 0; i < master.size(); ++i)
+    for (int i = 0; i < static_cast<int>(master.size()); ++i)
         local_gids.emplace_back(std::make_tuple(master.communicator().rank(), master.gid(i)));
 
     assigner.set_ranks(local_gids);
@@ -32,7 +32,7 @@ update_links(diy::Master& master, const diy::DynamicAssigner& assigner)
 {
     // figure out all the gids we need
     std::vector<int> nbr_gids;
-    for (int i = 0; i < master.size(); ++i)
+    for (int i = 0; i < static_cast<int>(master.size()); ++i)
     {
         auto* link = master.link(i);
         for (auto blockid : link->neighbors())
@@ -52,7 +52,7 @@ update_links(diy::Master& master, const diy::DynamicAssigner& assigner)
         gid_to_proc[nbr_gids[i]] = nbr_procs[i];
 
     // fix the procs in links
-    for (int i = 0; i < master.size(); ++i)
+    for (int i = 0; i < static_cast<int>(master.size()); ++i)
     {
         auto* link = master.link(i);
         for (auto& blockid : link->neighbors())
diff --git a/include/vtkdiy2/serialization.hpp b/include/vtkdiy2/serialization.hpp
index 6109b69b443..41055737dc9 100644
--- a/include/vtkdiy2/serialization.hpp
+++ b/include/vtkdiy2/serialization.hpp
@@ -1,20 +1,30 @@
 #ifndef DIY_SERIALIZATION_HPP
 #define DIY_SERIALIZATION_HPP
 
-#include <vector>
-#include <valarray>
+#include <cassert>
+#include <fstream>
+#include <functional>
 #include <map>
+#include <memory>
 #include <set>
 #include <string>
-#include <fstream>
-
 #include <tuple>
+#include <type_traits>              // this is used for a safety check for default serialization
 #include <unordered_map>
 #include <unordered_set>
-#include <type_traits>              // this is used for a safety check for default serialization
+#include <valarray>
+#include <vector>
 
 namespace diy
 {
+  struct BinaryBlob
+  {
+      using Deleter = std::function<void(const char[])>;
+      using Pointer = std::unique_ptr<const char[], Deleter>;
+      Pointer   pointer;
+      size_t    size;
+  };
+
   //! A serialization buffer. \ingroup Serialization
   struct BinaryBuffer
   {
@@ -23,23 +33,43 @@ namespace diy
     virtual inline void append_binary(const char* x, size_t count)  =0;   //!< append `count` bytes from `x` to end of buffer
     virtual void        load_binary(char* x, size_t count)          =0;   //!< copy `count` bytes into `x` from the buffer
     virtual void        load_binary_back(char* x, size_t count)     =0;   //!< copy `count` bytes into `x` from the back of the buffer
+    virtual char*       grow(size_t count)                          =0;   //!< allocate enough space for `count` bytes and return the pointer to the beginning
+    virtual char*       advance(size_t count)                       =0;   //!< advance buffer position by `count` bytes and return the pointer to the beginning
+
+    virtual void        save_binary_blob(const char*, size_t)       =0;
+    virtual void        save_binary_blob(const char*, size_t, BinaryBlob::Deleter) = 0;
+    virtual BinaryBlob  load_binary_blob()                          =0;
   };
 
   struct MemoryBuffer: public BinaryBuffer
   {
+    using Blob = BinaryBlob;
+
                         MemoryBuffer(size_t position_ = 0):
                           position(position_)                       {}
 
+                        MemoryBuffer(MemoryBuffer&&)                =default;
+                        MemoryBuffer(const MemoryBuffer&)           =delete;
+    MemoryBuffer&       operator=(MemoryBuffer&&)                   =default;
+    MemoryBuffer&       operator=(const MemoryBuffer&)              =delete;
+
     virtual inline void save_binary(const char* x, size_t count) override;   //!< copy `count` bytes from `x` into the buffer
     virtual inline void append_binary(const char* x, size_t count) override; //!< append `count` bytes from `x` to end of buffer
     virtual inline void load_binary(char* x, size_t count) override;         //!< copy `count` bytes into `x` from the buffer
     virtual inline void load_binary_back(char* x, size_t count) override;    //!< copy `count` bytes into `x` from the back of the buffer
+    virtual inline char* grow(size_t count) override;                        //!< allocate enough space for `count` bytes and return the pointer to the beginning
+    virtual inline char* advance(size_t count) override;                     //!< advance buffer position by `count` bytes and return the pointer to the beginning
+
+    virtual inline void save_binary_blob(const char* x, size_t count) override;
+    virtual inline void save_binary_blob(const char* x, size_t count, Blob::Deleter deleter) override;
+    virtual inline Blob load_binary_blob() override;
+    size_t              nblobs() const                              { return blobs.size(); }
 
     void                clear()                                     { buffer.clear(); reset(); }
     void                wipe()                                      { std::vector<char>().swap(buffer); reset(); }
     void                reset()                                     { position = 0; }
     void                skip(size_t s)                              { position += s; }
-    void                swap(MemoryBuffer& o)                       { std::swap(position, o.position); buffer.swap(o.buffer); }
+    void                swap(MemoryBuffer& o)                       { std::swap(position, o.position); buffer.swap(o.buffer); std::swap(blob_position, o.blob_position); blobs.swap(o.blobs); }
     bool                empty() const                               { return buffer.empty(); }
     size_t              size() const                                { return buffer.size(); }
     void                reserve(size_t s)                           { buffer.reserve(s); }
@@ -52,7 +82,7 @@ namespace diy
     static float        growth_multiplier()                         { return 1.5; }
 
     // simple file IO
-    void                write(const std::string& fn) const          { std::ofstream out(fn.c_str()); out.write(&buffer[0], size()); }
+    void                write(const std::string& fn) const          { std::ofstream out(fn.c_str()); out.write(&buffer[0], static_cast<std::streamsize>(size())); }
     void                read(const std::string& fn)
     {
         std::ifstream in(fn.c_str(), std::ios::binary | std::ios::ate);
@@ -64,6 +94,9 @@ namespace diy
 
     size_t              position;
     std::vector<char>   buffer;
+
+    size_t              blob_position = 0;
+    std::vector<Blob>   blobs;
   };
 
   namespace detail
@@ -92,14 +125,23 @@ namespace diy
   template<class T>
   struct Serialization: public detail::Default
   {
-#if (defined(__clang__) && !defined(__ppc64__)) || (defined(__GNUC__) && __GNUC__ >= 5)
-    //exempt power-pc clang variants due to: https://gitlab.kitware.com/vtk/vtk-m/issues/201
+// GCC release date mapping
+// 20160726 == 4.9.4
+// 20150626 == 4.9.3
+// 20150623 == 4.8.5
+// 20150422 == 5.1
+// 20141030 == 4.9.2
+// See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning.__GLIBCXX__
+#if !(defined(__GLIBCXX__) &&                                                                      \
+  (__GLIBCXX__ < 20150422 || __GLIBCXX__ == 20160726 || __GLIBCXX__ == 20150626 ||                 \
+   __GLIBCXX__ == 20150623))
+    //exempt glibcxx-4 variants as they don't have is_trivially_copyable implemented
     static_assert(std::is_trivially_copyable<T>::value, "Default serialization works only for trivially copyable types");
 #endif
 
     static void         save(BinaryBuffer& bb, const T& x)          { bb.save_binary((const char*)  &x, sizeof(T)); }
     static void         load(BinaryBuffer& bb, T& x)                { bb.load_binary((char*)        &x, sizeof(T)); }
-    static size_t       size(const T& x)                            { return sizeof(T); }
+    static size_t       size(const T&)                              { return sizeof(T); }
   };
 
   //! Saves `x` to `bb` by calling `diy::Serialization<T>::save(bb,x)`.
@@ -124,7 +166,7 @@ namespace diy
   template<class T>
   void                  load_back(BinaryBuffer& bb, T& x)           { bb.load_binary_back((char*) &x, sizeof(T)); }
 
-  //@}
+  //!@}
 
 
   namespace detail
@@ -206,7 +248,7 @@ namespace diy
     {
       size_t s;
       diy::load(bb, s);
-      v.resize(s);
+      v.resize(s, U());
       if (s > 0)
         diy::load(bb, &v[0], s);
     }
@@ -229,7 +271,7 @@ namespace diy
     {
       size_t s;
       diy::load(bb, s);
-      v.resize(s);
+      v.resize(s, U());
       if (s > 0)
         diy::load(bb, &v[0], s);
     }
@@ -428,17 +470,7 @@ void
 diy::MemoryBuffer::
 save_binary(const char* x, size_t count)
 {
-  if (position + count > buffer.capacity())
-  {
-    double newsize = static_cast<double>(position + count) * growth_multiplier();  // if we have to grow, grow geometrically
-    buffer.reserve(static_cast<size_t>(newsize));
-  }
-
-  if (position + count > buffer.size())
-    buffer.resize(position + count);
-
-  std::copy_n(x, count, &buffer[position]);
-  position += count;
+  std::copy_n(x, count, grow(count));
 }
 
 void
@@ -493,6 +525,58 @@ load_binary_back(char* x, size_t count)
   buffer.resize(buffer.size() - count);
 }
 
+char*
+diy::MemoryBuffer::
+grow(size_t count)
+{
+  if (position + count > buffer.capacity())
+  {
+    double newsize = static_cast<double>(position + count) * growth_multiplier();  // if we have to grow, grow geometrically
+    buffer.reserve(static_cast<size_t>(newsize));
+  }
+
+  if (position + count > buffer.size())
+    buffer.resize(position + count);
+
+  char* destination = &buffer[position];
+
+  position += count;
+
+  return destination;
+}
+
+char*
+diy::MemoryBuffer::
+advance(size_t count)
+{
+    char* origin = &buffer[position];
+    position += count;
+    return origin;
+}
+
+
+void
+diy::MemoryBuffer::
+save_binary_blob(const char* x, size_t count)
+{
+    // empty deleter means we don't take ownership
+    save_binary_blob(x, count, [](const char[]) {});
+}
+
+void
+diy::MemoryBuffer::
+save_binary_blob(const char* x, size_t count, Blob::Deleter deleter)
+{
+    blobs.emplace_back(Blob { Blob::Pointer {x, deleter}, count });
+}
+
+diy::MemoryBuffer::Blob
+diy::MemoryBuffer::
+load_binary_blob()
+{
+    return std::move(blobs[blob_position++]);
+}
+
 void
 diy::MemoryBuffer::
 copy(MemoryBuffer& from, MemoryBuffer& to)
diff --git a/include/vtkdiy2/stats.hpp b/include/vtkdiy2/stats.hpp
index 837f038c013..95520019fe1 100644
--- a/include/vtkdiy2/stats.hpp
+++ b/include/vtkdiy2/stats.hpp
@@ -148,8 +148,8 @@ struct Profiler
     void    operator<<(std::string name)        { enter(name); }
     void    operator>>(std::string name)        { exit(name); }
 
-    void    enter(std::string name)             {}
-    void    exit(std::string name)              {}
+    void    enter(std::string)                  {}
+    void    exit(std::string)                   {}
 
     void    output(std::ostream& out, std::string = "") const
     {
@@ -173,7 +173,7 @@ struct Annotation
 {
     struct Guard
     {
-                    Guard(Annotation& a)            {}
+                    Guard(Annotation&)              {}
     };
 
                     Annotation(const char*)         {}
@@ -206,7 +206,7 @@ struct Profiler
     void    enter(std::string name)             { CALI_MARK_BEGIN(name.c_str()); }
     void    exit(std::string name)              { CALI_MARK_END(name.c_str()); }
 
-    void    output(std::ostream& out, std::string = "") const {}
+    void    output(std::ostream&, std::string = "") const {}
     void    clear()                             {}
 
     Scoped  scoped(std::string name)            { return Scoped(*this, name); }
diff --git a/include/vtkdiy2/storage.hpp b/include/vtkdiy2/storage.hpp
index f34998bd644..b1da3d3620c 100644
--- a/include/vtkdiy2/storage.hpp
+++ b/include/vtkdiy2/storage.hpp
@@ -15,8 +15,8 @@ namespace diy
 {
   namespace detail
   {
-    typedef       void  (*Save)(const void*, BinaryBuffer& buf);
-    typedef       void  (*Load)(void*,       BinaryBuffer& buf);
+    using Save = std::function<void(const void*, BinaryBuffer&)>;
+    using Load = std::function<void(void*, BinaryBuffer&)>;
 
     struct FileBuffer: public BinaryBuffer
     {
@@ -26,7 +26,7 @@ namespace diy
       virtual inline void save_binary(const char* x, size_t count) override   { fwrite(x, 1, count, file); head += count; }
       virtual inline void append_binary(const char* x, size_t count) override
       {
-          size_t temp_pos = ftell(file);
+          auto temp_pos = ftell(file);
           fseek(file, static_cast<long>(tail), SEEK_END);
           fwrite(x, 1, count, file);
           tail += count;
@@ -34,6 +34,16 @@ namespace diy
       }
       virtual inline void load_binary(char* x, size_t count) override         { auto n = fread(x, 1, count, file); DIY_UNUSED(n);}
       virtual inline void load_binary_back(char* x, size_t count) override    { fseek(file, static_cast<long>(tail), SEEK_END); auto n = fread(x, 1, count, file); tail += count; fseek(file, static_cast<long>(head), SEEK_SET); DIY_UNUSED(n);}
+      virtual inline char* grow(size_t) override                              { throw std::runtime_error("Cannot grow a FileBuffer"); }
+      virtual inline char* advance(size_t) override                           { throw std::runtime_error("Cannot advance a FileBuffer"); }
+
+      // TODO: for now, we just throw, but obviously it should be possile to store binary blobs in a file; might want to fall back
+      using Blob = BinaryBlob;
+      virtual inline void save_binary_blob(const char*, size_t) override      { throw std::runtime_error("Cannot save binary blobs in a FileBuffer"); }
+
+      virtual inline void save_binary_blob(const char*, size_t, Blob::Deleter) override { throw std::runtime_error("Cannot save binary blobs in a FileBuffer"); }
+
+      virtual inline Blob load_binary_blob() override                         { throw std::runtime_error("Cannot load binary blobs from a FileBuffer"); }
 
       size_t              size() const                                { return head; }
 
diff --git a/include/vtkdiy2/fmt/chrono.h b/include/vtkdiy2/thirdparty/fmt/chrono.h
similarity index 60%
rename from include/vtkdiy2/fmt/chrono.h
rename to include/vtkdiy2/thirdparty/fmt/chrono.h
index c965cf78104..1a3b8d5e5cd 100644
--- a/include/vtkdiy2/fmt/chrono.h
+++ b/include/vtkdiy2/thirdparty/fmt/chrono.h
@@ -8,35 +8,285 @@
 #ifndef FMT_CHRONO_H_
 #define FMT_CHRONO_H_
 
-#include "format.h"
-#include "locale.h"
-
 #include <chrono>
 #include <ctime>
 #include <locale>
 #include <sstream>
 
-// enable safe chrono durations, unless explicitly disabled
+#include "format.h"
+#include "locale.h"
+
+FMT_BEGIN_NAMESPACE
+
+// Enable safe chrono durations, unless explicitly disabled.
 #ifndef FMT_SAFE_DURATION_CAST
 #  define FMT_SAFE_DURATION_CAST 1
 #endif
-
 #if FMT_SAFE_DURATION_CAST
-#  include "safe-duration-cast.h"
-#endif
 
-FMT_BEGIN_NAMESPACE
+// For conversion between std::chrono::durations without undefined
+// behaviour or erroneous results.
+// This is a stripped down version of duration_cast, for inclusion in fmt.
+// See https://github.com/pauldreik/safe_duration_cast
+//
+// Copyright Paul Dreik 2019
+namespace safe_duration_cast {
+
+template <typename To, typename From,
+          FMT_ENABLE_IF(!std::is_same<From, To>::value &&
+                        std::numeric_limits<From>::is_signed ==
+                            std::numeric_limits<To>::is_signed)>
+FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
+  ec = 0;
+  using F = std::numeric_limits<From>;
+  using T = std::numeric_limits<To>;
+  static_assert(F::is_integer, "From must be integral");
+  static_assert(T::is_integer, "To must be integral");
+
+  // A and B are both signed, or both unsigned.
+  if (F::digits <= T::digits) {
+    // From fits in To without any problem.
+  } else {
+    // From does not always fit in To, resort to a dynamic check.
+    if (from < (T::min)() || from > (T::max)()) {
+      // outside range.
+      ec = 1;
+      return {};
+    }
+  }
+  return static_cast<To>(from);
+}
+
+/**
+ * converts From to To, without loss. If the dynamic value of from
+ * can't be converted to To without loss, ec is set.
+ */
+template <typename To, typename From,
+          FMT_ENABLE_IF(!std::is_same<From, To>::value &&
+                        std::numeric_limits<From>::is_signed !=
+                            std::numeric_limits<To>::is_signed)>
+FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
+  ec = 0;
+  using F = std::numeric_limits<From>;
+  using T = std::numeric_limits<To>;
+  static_assert(F::is_integer, "From must be integral");
+  static_assert(T::is_integer, "To must be integral");
+
+  if (detail::const_check(F::is_signed && !T::is_signed)) {
+    // From may be negative, not allowed!
+    if (fmt::detail::is_negative(from)) {
+      ec = 1;
+      return {};
+    }
+    // From is positive. Can it always fit in To?
+    if (F::digits > T::digits &&
+        from > static_cast<From>(detail::max_value<To>())) {
+      ec = 1;
+      return {};
+    }
+  }
+
+  if (!F::is_signed && T::is_signed && F::digits >= T::digits &&
+      from > static_cast<From>(detail::max_value<To>())) {
+    ec = 1;
+    return {};
+  }
+  return static_cast<To>(from);  // Lossless conversion.
+}
+
+template <typename To, typename From,
+          FMT_ENABLE_IF(std::is_same<From, To>::value)>
+FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
+  ec = 0;
+  return from;
+}  // function
+
+// clang-format off
+/**
+ * converts From to To if possible, otherwise ec is set.
+ *
+ * input                            |    output
+ * ---------------------------------|---------------
+ * NaN                              | NaN
+ * Inf                              | Inf
+ * normal, fits in output           | converted (possibly lossy)
+ * normal, does not fit in output   | ec is set
+ * subnormal                        | best effort
+ * -Inf                             | -Inf
+ */
+// clang-format on
+template <typename To, typename From,
+          FMT_ENABLE_IF(!std::is_same<From, To>::value)>
+FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) {
+  ec = 0;
+  using T = std::numeric_limits<To>;
+  static_assert(std::is_floating_point<From>::value, "From must be floating");
+  static_assert(std::is_floating_point<To>::value, "To must be floating");
+
+  // catch the only happy case
+  if (std::isfinite(from)) {
+    if (from >= T::lowest() && from <= (T::max)()) {
+      return static_cast<To>(from);
+    }
+    // not within range.
+    ec = 1;
+    return {};
+  }
+
+  // nan and inf will be preserved
+  return static_cast<To>(from);
+}  // function
+
+template <typename To, typename From,
+          FMT_ENABLE_IF(std::is_same<From, To>::value)>
+FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) {
+  ec = 0;
+  static_assert(std::is_floating_point<From>::value, "From must be floating");
+  return from;
+}
+
+/**
+ * safe duration cast between integral durations
+ */
+template <typename To, typename FromRep, typename FromPeriod,
+          FMT_ENABLE_IF(std::is_integral<FromRep>::value),
+          FMT_ENABLE_IF(std::is_integral<typename To::rep>::value)>
+To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
+                      int& ec) {
+  using From = std::chrono::duration<FromRep, FromPeriod>;
+  ec = 0;
+  // the basic idea is that we need to convert from count() in the from type
+  // to count() in the To type, by multiplying it with this:
+  struct Factor
+      : std::ratio_divide<typename From::period, typename To::period> {};
+
+  static_assert(Factor::num > 0, "num must be positive");
+  static_assert(Factor::den > 0, "den must be positive");
+
+  // the conversion is like this: multiply from.count() with Factor::num
+  // /Factor::den and convert it to To::rep, all this without
+  // overflow/underflow. let's start by finding a suitable type that can hold
+  // both To, From and Factor::num
+  using IntermediateRep =
+      typename std::common_type<typename From::rep, typename To::rep,
+                                decltype(Factor::num)>::type;
+
+  // safe conversion to IntermediateRep
+  IntermediateRep count =
+      lossless_integral_conversion<IntermediateRep>(from.count(), ec);
+  if (ec) return {};
+  // multiply with Factor::num without overflow or underflow
+  if (detail::const_check(Factor::num != 1)) {
+    const auto max1 = detail::max_value<IntermediateRep>() / Factor::num;
+    if (count > max1) {
+      ec = 1;
+      return {};
+    }
+    const auto min1 =
+        (std::numeric_limits<IntermediateRep>::min)() / Factor::num;
+    if (count < min1) {
+      ec = 1;
+      return {};
+    }
+    count *= Factor::num;
+  }
+
+  if (detail::const_check(Factor::den != 1)) count /= Factor::den;
+  auto tocount = lossless_integral_conversion<typename To::rep>(count, ec);
+  return ec ? To() : To(tocount);
+}
+
+/**
+ * safe duration_cast between floating point durations
+ */
+template <typename To, typename FromRep, typename FromPeriod,
+          FMT_ENABLE_IF(std::is_floating_point<FromRep>::value),
+          FMT_ENABLE_IF(std::is_floating_point<typename To::rep>::value)>
+To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
+                      int& ec) {
+  using From = std::chrono::duration<FromRep, FromPeriod>;
+  ec = 0;
+  if (std::isnan(from.count())) {
+    // nan in, gives nan out. easy.
+    return To{std::numeric_limits<typename To::rep>::quiet_NaN()};
+  }
+  // maybe we should also check if from is denormal, and decide what to do about
+  // it.
+
+  // +-inf should be preserved.
+  if (std::isinf(from.count())) {
+    return To{from.count()};
+  }
+
+  // the basic idea is that we need to convert from count() in the from type
+  // to count() in the To type, by multiplying it with this:
+  struct Factor
+      : std::ratio_divide<typename From::period, typename To::period> {};
+
+  static_assert(Factor::num > 0, "num must be positive");
+  static_assert(Factor::den > 0, "den must be positive");
+
+  // the conversion is like this: multiply from.count() with Factor::num
+  // /Factor::den and convert it to To::rep, all this without
+  // overflow/underflow. let's start by finding a suitable type that can hold
+  // both To, From and Factor::num
+  using IntermediateRep =
+      typename std::common_type<typename From::rep, typename To::rep,
+                                decltype(Factor::num)>::type;
+
+  // force conversion of From::rep -> IntermediateRep to be safe,
+  // even if it will never happen be narrowing in this context.
+  IntermediateRep count =
+      safe_float_conversion<IntermediateRep>(from.count(), ec);
+  if (ec) {
+    return {};
+  }
+
+  // multiply with Factor::num without overflow or underflow
+  if (Factor::num != 1) {
+    constexpr auto max1 = detail::max_value<IntermediateRep>() /
+                          static_cast<IntermediateRep>(Factor::num);
+    if (count > max1) {
+      ec = 1;
+      return {};
+    }
+    constexpr auto min1 = std::numeric_limits<IntermediateRep>::lowest() /
+                          static_cast<IntermediateRep>(Factor::num);
+    if (count < min1) {
+      ec = 1;
+      return {};
+    }
+    count *= static_cast<IntermediateRep>(Factor::num);
+  }
+
+  // this can't go wrong, right? den>0 is checked earlier.
+  if (Factor::den != 1) {
+    using common_t = typename std::common_type<IntermediateRep, intmax_t>::type;
+    count /= static_cast<common_t>(Factor::den);
+  }
+
+  // convert to the to type, safely
+  using ToRep = typename To::rep;
+
+  const ToRep tocount = safe_float_conversion<ToRep>(count, ec);
+  if (ec) {
+    return {};
+  }
+  return To{tocount};
+}
+}  // namespace safe_duration_cast
+#endif
 
 // Prevents expansion of a preceding token as a function-style macro.
 // Usage: f FMT_NOMACRO()
 #define FMT_NOMACRO
 
-namespace internal {
+namespace detail {
 inline null<> localtime_r FMT_NOMACRO(...) { return null<>(); }
 inline null<> localtime_s(...) { return null<>(); }
 inline null<> gmtime_r(...) { return null<>(); }
 inline null<> gmtime_s(...) { return null<>(); }
-}  // namespace internal
+}  // namespace detail
 
 // Thread-safe replacement for std::localtime
 inline std::tm localtime(std::time_t time) {
@@ -47,22 +297,22 @@ inline std::tm localtime(std::time_t time) {
     dispatcher(std::time_t t) : time_(t) {}
 
     bool run() {
-      using namespace fmt::internal;
+      using namespace fmt::detail;
       return handle(localtime_r(&time_, &tm_));
     }
 
     bool handle(std::tm* tm) { return tm != nullptr; }
 
-    bool handle(internal::null<>) {
-      using namespace fmt::internal;
+    bool handle(detail::null<>) {
+      using namespace fmt::detail;
       return fallback(localtime_s(&tm_, &time_));
     }
 
     bool fallback(int res) { return res == 0; }
 
 #if !FMT_MSC_VER
-    bool fallback(internal::null<>) {
-      using namespace fmt::internal;
+    bool fallback(detail::null<>) {
+      using namespace fmt::detail;
       std::tm* tm = std::localtime(&time_);
       if (tm) tm_ = *tm;
       return tm != nullptr;
@@ -75,6 +325,11 @@ inline std::tm localtime(std::time_t time) {
   return lt.tm_;
 }
 
+inline std::tm localtime(
+    std::chrono::time_point<std::chrono::system_clock> time_point) {
+  return localtime(std::chrono::system_clock::to_time_t(time_point));
+}
+
 // Thread-safe replacement for std::gmtime
 inline std::tm gmtime(std::time_t time) {
   struct dispatcher {
@@ -84,21 +339,21 @@ inline std::tm gmtime(std::time_t time) {
     dispatcher(std::time_t t) : time_(t) {}
 
     bool run() {
-      using namespace fmt::internal;
+      using namespace fmt::detail;
       return handle(gmtime_r(&time_, &tm_));
     }
 
     bool handle(std::tm* tm) { return tm != nullptr; }
 
-    bool handle(internal::null<>) {
-      using namespace fmt::internal;
+    bool handle(detail::null<>) {
+      using namespace fmt::detail;
       return fallback(gmtime_s(&tm_, &time_));
     }
 
     bool fallback(int res) { return res == 0; }
 
 #if !FMT_MSC_VER
-    bool fallback(internal::null<>) {
+    bool fallback(detail::null<>) {
       std::tm* tm = std::gmtime(&time_);
       if (tm) tm_ = *tm;
       return tm != nullptr;
@@ -111,17 +366,33 @@ inline std::tm gmtime(std::time_t time) {
   return gt.tm_;
 }
 
-namespace internal {
-inline std::size_t strftime(char* str, std::size_t count, const char* format,
-                            const std::tm* time) {
+inline std::tm gmtime(
+    std::chrono::time_point<std::chrono::system_clock> time_point) {
+  return gmtime(std::chrono::system_clock::to_time_t(time_point));
+}
+
+namespace detail {
+inline size_t strftime(char* str, size_t count, const char* format,
+                       const std::tm* time) {
   return std::strftime(str, count, format, time);
 }
 
-inline std::size_t strftime(wchar_t* str, std::size_t count,
-                            const wchar_t* format, const std::tm* time) {
+inline size_t strftime(wchar_t* str, size_t count, const wchar_t* format,
+                       const std::tm* time) {
   return std::wcsftime(str, count, format, time);
 }
-}  // namespace internal
+}  // namespace detail
+
+template <typename Char>
+struct formatter<std::chrono::time_point<std::chrono::system_clock>, Char>
+    : formatter<std::tm, Char> {
+  template <typename FormatContext>
+  auto format(std::chrono::time_point<std::chrono::system_clock> val,
+              FormatContext& ctx) -> decltype(ctx.out()) {
+    std::tm time = localtime(val);
+    return formatter<std::tm, Char>::format(time, ctx);
+  }
+};
 
 template <typename Char> struct formatter<std::tm, Char> {
   template <typename ParseContext>
@@ -130,7 +401,7 @@ template <typename Char> struct formatter<std::tm, Char> {
     if (it != ctx.end() && *it == ':') ++it;
     auto end = it;
     while (end != ctx.end() && *end != '}') ++end;
-    tm_format.reserve(internal::to_unsigned(end - it + 1));
+    tm_format.reserve(detail::to_unsigned(end - it + 1));
     tm_format.append(it, end);
     tm_format.push_back('\0');
     return end;
@@ -139,11 +410,10 @@ template <typename Char> struct formatter<std::tm, Char> {
   template <typename FormatContext>
   auto format(const std::tm& tm, FormatContext& ctx) -> decltype(ctx.out()) {
     basic_memory_buffer<Char> buf;
-    std::size_t start = buf.size();
+    size_t start = buf.size();
     for (;;) {
-      std::size_t size = buf.capacity() - start;
-      std::size_t count =
-          internal::strftime(&buf[start], size, &tm_format[0], &tm);
+      size_t size = buf.capacity() - start;
+      size_t count = detail::strftime(&buf[start], size, &tm_format[0], &tm);
       if (count != 0) {
         buf.resize(start + count);
         break;
@@ -155,7 +425,7 @@ template <typename Char> struct formatter<std::tm, Char> {
         // https://github.com/fmtlib/fmt/issues/367
         break;
       }
-      const std::size_t MIN_GROWTH = 10;
+      const size_t MIN_GROWTH = 10;
       buf.reserve(buf.capacity() + (size > MIN_GROWTH ? size : MIN_GROWTH));
     }
     return std::copy(buf.begin(), buf.end(), ctx.out());
@@ -164,7 +434,7 @@ template <typename Char> struct formatter<std::tm, Char> {
   basic_memory_buffer<Char> tm_format;
 };
 
-namespace internal {
+namespace detail {
 template <typename Period> FMT_CONSTEXPR const char* get_units() {
   return nullptr;
 }
@@ -220,12 +490,12 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
       handler.on_text(ptr - 1, ptr);
       break;
     case 'n': {
-      const char newline[] = "\n";
+      const Char newline[] = {'\n'};
       handler.on_text(newline, newline + 1);
       break;
     }
     case 't': {
-      const char tab[] = "\t";
+      const Char tab[] = {'\t'};
       handler.on_text(tab, tab + 1);
       break;
     }
@@ -421,7 +691,7 @@ inline int to_nonnegative_int(T value, int upper) {
 
 template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
 inline T mod(T x, int y) {
-  return x % y;
+  return x % static_cast<T>(y);
 }
 template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
 inline T mod(T x, int y) {
@@ -484,18 +754,36 @@ inline std::chrono::duration<Rep, std::milli> get_milliseconds(
   return std::chrono::duration<Rep, std::milli>(static_cast<Rep>(ms));
 }
 
-template <typename Rep, typename OutputIt>
-OutputIt format_chrono_duration_value(OutputIt out, Rep val, int precision) {
-  if (precision >= 0) return format_to(out, "{:.{}f}", val, precision);
-  return format_to(out, std::is_floating_point<Rep>::value ? "{:g}" : "{}",
+template <typename Char, typename Rep, typename OutputIt>
+OutputIt format_duration_value(OutputIt out, Rep val, int precision) {
+  const Char pr_f[] = {'{', ':', '.', '{', '}', 'f', '}', 0};
+  if (precision >= 0) return format_to(out, pr_f, val, precision);
+  const Char fp_f[] = {'{', ':', 'g', '}', 0};
+  const Char format[] = {'{', '}', 0};
+  return format_to(out, std::is_floating_point<Rep>::value ? fp_f : format,
                    val);
 }
+template <typename Char, typename OutputIt>
+OutputIt copy_unit(string_view unit, OutputIt out, Char) {
+  return std::copy(unit.begin(), unit.end(), out);
+}
+
+template <typename OutputIt>
+OutputIt copy_unit(string_view unit, OutputIt out, wchar_t) {
+  // This works when wchar_t is UTF-32 because units only contain characters
+  // that have the same representation in UTF-16 and UTF-32.
+  utf8_to_utf16 u(unit);
+  return std::copy(u.c_str(), u.c_str() + u.size(), out);
+}
 
-template <typename Period, typename OutputIt>
-static OutputIt format_chrono_duration_unit(OutputIt out) {
-  if (const char* unit = get_units<Period>()) return format_to(out, "{}", unit);
-  if (Period::den == 1) return format_to(out, "[{}]s", Period::num);
-  return format_to(out, "[{}/{}]s", Period::num, Period::den);
+template <typename Char, typename Period, typename OutputIt>
+OutputIt format_duration_unit(OutputIt out) {
+  if (const char* unit = get_units<Period>())
+    return copy_unit(string_view(unit), out, Char());
+  const Char num_f[] = {'[', '{', '}', ']', 's', 0};
+  if (const_check(Period::den == 1)) return format_to(out, num_f, Period::num);
+  const Char num_def_f[] = {'[', '{', '}', '/', '{', '}', ']', 's', 0};
+  return format_to(out, num_def_f, Period::num, Period::den);
 }
 
 template <typename FormatContext, typename OutputIt, typename Rep,
@@ -518,7 +806,10 @@ struct chrono_formatter {
 
   explicit chrono_formatter(FormatContext& ctx, OutputIt o,
                             std::chrono::duration<Rep, Period> d)
-      : context(ctx), out(o), val(d.count()), negative(false) {
+      : context(ctx),
+        out(o),
+        val(static_cast<rep>(d.count())),
+        negative(false) {
     if (d.count() < 0) {
       val = 0 - val;
       negative = true;
@@ -582,24 +873,24 @@ struct chrono_formatter {
   void write(Rep value, int width) {
     write_sign();
     if (isnan(value)) return write_nan();
-    uint32_or_64_t<int> n = to_unsigned(
-        to_nonnegative_int(value, (std::numeric_limits<int>::max)()));
-    int num_digits = internal::count_digits(n);
+    uint32_or_64_or_128_t<int> n =
+        to_unsigned(to_nonnegative_int(value, max_value<int>()));
+    int num_digits = detail::count_digits(n);
     if (width > num_digits) out = std::fill_n(out, width - num_digits, '0');
-    out = format_decimal<char_type>(out, n, num_digits);
+    out = format_decimal<char_type>(out, n, num_digits).end;
   }
 
   void write_nan() { std::copy_n("nan", 3, out); }
   void write_pinf() { std::copy_n("inf", 3, out); }
   void write_ninf() { std::copy_n("-inf", 4, out); }
 
-  void format_localized(const tm& time, const char* format) {
+  void format_localized(const tm& time, char format, char modifier = 0) {
     if (isnan(val)) return write_nan();
     auto locale = context.locale().template get<std::locale>();
     auto& facet = std::use_facet<std::time_put<char_type>>(locale);
     std::basic_ostringstream<char_type> os;
     os.imbue(locale);
-    facet.put(os, os, ' ', &time, format, format + std::strlen(format));
+    facet.put(os, os, ' ', &time, format, modifier);
     auto str = os.str();
     std::copy(str.begin(), str.end(), out);
   }
@@ -629,7 +920,7 @@ struct chrono_formatter {
     if (ns == numeric_system::standard) return write(hour(), 2);
     auto time = tm();
     time.tm_hour = to_nonnegative_int(hour(), 24);
-    format_localized(time, "%OH");
+    format_localized(time, 'H', 'O');
   }
 
   void on_12_hour(numeric_system ns) {
@@ -638,7 +929,7 @@ struct chrono_formatter {
     if (ns == numeric_system::standard) return write(hour12(), 2);
     auto time = tm();
     time.tm_hour = to_nonnegative_int(hour12(), 12);
-    format_localized(time, "%OI");
+    format_localized(time, 'I', 'O');
   }
 
   void on_minute(numeric_system ns) {
@@ -647,7 +938,7 @@ struct chrono_formatter {
     if (ns == numeric_system::standard) return write(minute(), 2);
     auto time = tm();
     time.tm_min = to_nonnegative_int(minute(), 60);
-    format_localized(time, "%OM");
+    format_localized(time, 'M', 'O');
   }
 
   void on_second(numeric_system ns) {
@@ -672,13 +963,12 @@ struct chrono_formatter {
     }
     auto time = tm();
     time.tm_sec = to_nonnegative_int(second(), 60);
-    format_localized(time, "%OS");
+    format_localized(time, 'S', 'O');
   }
 
   void on_12_hour_time() {
     if (handle_nan_inf()) return;
-
-    format_localized(time(), "%r");
+    format_localized(time(), 'r');
   }
 
   void on_24_hour_time() {
@@ -702,25 +992,27 @@ struct chrono_formatter {
 
   void on_am_pm() {
     if (handle_nan_inf()) return;
-    format_localized(time(), "%p");
+    format_localized(time(), 'p');
   }
 
   void on_duration_value() {
     if (handle_nan_inf()) return;
     write_sign();
-    out = format_chrono_duration_value(out, val, precision);
+    out = format_duration_value<char_type>(out, val, precision);
   }
 
-  void on_duration_unit() { out = format_chrono_duration_unit<Period>(out); }
+  void on_duration_unit() {
+    out = format_duration_unit<char_type, Period>(out);
+  }
 };
-}  // namespace internal
+}  // namespace detail
 
 template <typename Rep, typename Period, typename Char>
 struct formatter<std::chrono::duration<Rep, Period>, Char> {
  private:
   basic_format_specs<Char> specs;
   int precision;
-  using arg_ref_type = internal::arg_ref<Char>;
+  using arg_ref_type = detail::arg_ref<Char>;
   arg_ref_type width_ref;
   arg_ref_type precision_ref;
   mutable basic_string_view<Char> format_str;
@@ -728,7 +1020,7 @@ struct formatter<std::chrono::duration<Rep, Period>, Char> {
 
   struct spec_handler {
     formatter& f;
-    basic_parse_context<Char>& context;
+    basic_format_parse_context<Char>& context;
     basic_string_view<Char> format_str;
 
     template <typename Id> FMT_CONSTEXPR arg_ref_type make_arg_ref(Id arg_id) {
@@ -738,19 +1030,18 @@ struct formatter<std::chrono::duration<Rep, Period>, Char> {
 
     FMT_CONSTEXPR arg_ref_type make_arg_ref(basic_string_view<Char> arg_id) {
       context.check_arg_id(arg_id);
-      const auto str_val = internal::string_view_metadata(format_str, arg_id);
-      return arg_ref_type(str_val);
+      return arg_ref_type(arg_id);
     }
 
-    FMT_CONSTEXPR arg_ref_type make_arg_ref(internal::auto_id) {
+    FMT_CONSTEXPR arg_ref_type make_arg_ref(detail::auto_id) {
       return arg_ref_type(context.next_arg_id());
     }
 
     void on_error(const char* msg) { FMT_THROW(format_error(msg)); }
-    void on_fill(Char fill) { f.specs.fill[0] = fill; }
+    void on_fill(basic_string_view<Char> fill) { f.specs.fill = fill; }
     void on_align(align_t align) { f.specs.align = align; }
-    void on_width(unsigned width) { f.specs.width = width; }
-    void on_precision(unsigned precision) { f.precision = precision; }
+    void on_width(int width) { f.specs.width = width; }
+    void on_precision(int _precision) { f.precision = _precision; }
     void end_precision() {}
 
     template <typename Id> void on_dynamic_width(Id arg_id) {
@@ -762,38 +1053,38 @@ struct formatter<std::chrono::duration<Rep, Period>, Char> {
     }
   };
 
-  using iterator = typename basic_parse_context<Char>::iterator;
+  using iterator = typename basic_format_parse_context<Char>::iterator;
   struct parse_range {
     iterator begin;
     iterator end;
   };
 
-  FMT_CONSTEXPR parse_range do_parse(basic_parse_context<Char>& ctx) {
+  FMT_CONSTEXPR parse_range do_parse(basic_format_parse_context<Char>& ctx) {
     auto begin = ctx.begin(), end = ctx.end();
     if (begin == end || *begin == '}') return {begin, begin};
     spec_handler handler{*this, ctx, format_str};
-    begin = internal::parse_align(begin, end, handler);
+    begin = detail::parse_align(begin, end, handler);
     if (begin == end) return {begin, begin};
-    begin = internal::parse_width(begin, end, handler);
+    begin = detail::parse_width(begin, end, handler);
     if (begin == end) return {begin, begin};
     if (*begin == '.') {
       if (std::is_floating_point<Rep>::value)
-        begin = internal::parse_precision(begin, end, handler);
+        begin = detail::parse_precision(begin, end, handler);
       else
         handler.on_error("precision not allowed for this argument type");
     }
-    end = parse_chrono_format(begin, end, internal::chrono_format_checker());
+    end = parse_chrono_format(begin, end, detail::chrono_format_checker());
     return {begin, end};
   }
 
  public:
   formatter() : precision(-1) {}
 
-  FMT_CONSTEXPR auto parse(basic_parse_context<Char>& ctx)
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
       -> decltype(ctx.begin()) {
     auto range = do_parse(ctx);
     format_str = basic_string_view<Char>(
-        &*range.begin, internal::to_unsigned(range.end - range.begin));
+        &*range.begin, detail::to_unsigned(range.end - range.begin));
     return range.end;
   }
 
@@ -804,23 +1095,21 @@ struct formatter<std::chrono::duration<Rep, Period>, Char> {
     // is not specified.
     basic_memory_buffer<Char> buf;
     auto out = std::back_inserter(buf);
-    using range = internal::output_range<decltype(ctx.out()), Char>;
-    internal::basic_writer<range> w(range(ctx.out()));
-    internal::handle_dynamic_spec<internal::width_checker>(
-        specs.width, width_ref, ctx, format_str.begin());
-    internal::handle_dynamic_spec<internal::precision_checker>(
-        precision, precision_ref, ctx, format_str.begin());
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref,
+                                                       ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(precision,
+                                                           precision_ref, ctx);
     if (begin == end || *begin == '}') {
-      out = internal::format_chrono_duration_value(out, d.count(), precision);
-      internal::format_chrono_duration_unit<Period>(out);
+      out = detail::format_duration_value<Char>(out, d.count(), precision);
+      detail::format_duration_unit<Char, Period>(out);
     } else {
-      internal::chrono_formatter<FormatContext, decltype(out), Rep, Period> f(
+      detail::chrono_formatter<FormatContext, decltype(out), Rep, Period> f(
           ctx, out, d);
       f.precision = precision;
       parse_chrono_format(begin, end, f);
     }
-    w.write(buf.data(), buf.size(), specs);
-    return w.out();
+    return detail::write(
+        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
   }
 };
 
diff --git a/include/vtkdiy2/fmt/color.h b/include/vtkdiy2/thirdparty/fmt/color.h
similarity index 81%
rename from include/vtkdiy2/fmt/color.h
rename to include/vtkdiy2/thirdparty/fmt/color.h
index d9d315599f0..94e3419d1df 100644
--- a/include/vtkdiy2/fmt/color.h
+++ b/include/vtkdiy2/thirdparty/fmt/color.h
@@ -198,7 +198,7 @@ struct rgb {
   uint8_t b;
 };
 
-namespace internal {
+namespace detail {
 
 // color is a struct of either a rgb color or a terminal color.
 struct color_type {
@@ -221,7 +221,7 @@ struct color_type {
     uint32_t rgb_color;
   } value;
 };
-}  // namespace internal
+}  // namespace detail
 
 // Experimental text formatting support.
 class text_style {
@@ -298,22 +298,22 @@ class text_style {
   FMT_CONSTEXPR bool has_emphasis() const FMT_NOEXCEPT {
     return static_cast<uint8_t>(ems) != 0;
   }
-  FMT_CONSTEXPR internal::color_type get_foreground() const FMT_NOEXCEPT {
-    assert(has_foreground() && "no foreground specified for this style");
+  FMT_CONSTEXPR detail::color_type get_foreground() const FMT_NOEXCEPT {
+    FMT_ASSERT(has_foreground(), "no foreground specified for this style");
     return foreground_color;
   }
-  FMT_CONSTEXPR internal::color_type get_background() const FMT_NOEXCEPT {
-    assert(has_background() && "no background specified for this style");
+  FMT_CONSTEXPR detail::color_type get_background() const FMT_NOEXCEPT {
+    FMT_ASSERT(has_background(), "no background specified for this style");
     return background_color;
   }
   FMT_CONSTEXPR emphasis get_emphasis() const FMT_NOEXCEPT {
-    assert(has_emphasis() && "no emphasis specified for this style");
+    FMT_ASSERT(has_emphasis(), "no emphasis specified for this style");
     return ems;
   }
 
  private:
   FMT_CONSTEXPR text_style(bool is_foreground,
-                           internal::color_type text_color) FMT_NOEXCEPT
+                           detail::color_type text_color) FMT_NOEXCEPT
       : set_foreground_color(),
         set_background_color(),
         ems() {
@@ -326,23 +326,23 @@ class text_style {
     }
   }
 
-  friend FMT_CONSTEXPR_DECL text_style fg(internal::color_type foreground)
+  friend FMT_CONSTEXPR_DECL text_style fg(detail::color_type foreground)
       FMT_NOEXCEPT;
-  friend FMT_CONSTEXPR_DECL text_style bg(internal::color_type background)
+  friend FMT_CONSTEXPR_DECL text_style bg(detail::color_type background)
       FMT_NOEXCEPT;
 
-  internal::color_type foreground_color;
-  internal::color_type background_color;
+  detail::color_type foreground_color;
+  detail::color_type background_color;
   bool set_foreground_color;
   bool set_background_color;
   emphasis ems;
 };
 
-FMT_CONSTEXPR text_style fg(internal::color_type foreground) FMT_NOEXCEPT {
+FMT_CONSTEXPR text_style fg(detail::color_type foreground) FMT_NOEXCEPT {
   return text_style(/*is_foreground=*/true, foreground);
 }
 
-FMT_CONSTEXPR text_style bg(internal::color_type background) FMT_NOEXCEPT {
+FMT_CONSTEXPR text_style bg(detail::color_type background) FMT_NOEXCEPT {
   return text_style(/*is_foreground=*/false, background);
 }
 
@@ -350,21 +350,21 @@ FMT_CONSTEXPR text_style operator|(emphasis lhs, emphasis rhs) FMT_NOEXCEPT {
   return text_style(lhs) | rhs;
 }
 
-namespace internal {
+namespace detail {
 
 template <typename Char> struct ansi_color_escape {
-  FMT_CONSTEXPR ansi_color_escape(internal::color_type text_color,
+  FMT_CONSTEXPR ansi_color_escape(detail::color_type text_color,
                                   const char* esc) FMT_NOEXCEPT {
     // If we have a terminal color, we need to output another escape code
     // sequence.
     if (!text_color.is_rgb) {
-      bool is_background = esc == internal::data::background_color;
+      bool is_background = esc == detail::data::background_color;
       uint32_t value = text_color.value.term_color;
       // Background ASCII codes are the same as the foreground ones but with
       // 10 more.
       if (is_background) value += 10u;
 
-      std::size_t index = 0;
+      size_t index = 0;
       buffer[index++] = static_cast<Char>('\x1b');
       buffer[index++] = static_cast<Char>('[');
 
@@ -398,7 +398,7 @@ template <typename Char> struct ansi_color_escape {
     if (em_bits & static_cast<uint8_t>(emphasis::strikethrough))
       em_codes[3] = 9;
 
-    std::size_t index = 0;
+    size_t index = 0;
     for (int i = 0; i < 4; ++i) {
       if (!em_codes[i]) continue;
       buffer[index++] = static_cast<Char>('\x1b');
@@ -412,7 +412,7 @@ template <typename Char> struct ansi_color_escape {
 
   FMT_CONSTEXPR const Char* begin() const FMT_NOEXCEPT { return buffer; }
   FMT_CONSTEXPR const Char* end() const FMT_NOEXCEPT {
-    return buffer + std::strlen(buffer);
+    return buffer + std::char_traits<Char>::length(buffer);
   }
 
  private:
@@ -429,14 +429,14 @@ template <typename Char> struct ansi_color_escape {
 
 template <typename Char>
 FMT_CONSTEXPR ansi_color_escape<Char> make_foreground_color(
-    internal::color_type foreground) FMT_NOEXCEPT {
-  return ansi_color_escape<Char>(foreground, internal::data::foreground_color);
+    detail::color_type foreground) FMT_NOEXCEPT {
+  return ansi_color_escape<Char>(foreground, detail::data::foreground_color);
 }
 
 template <typename Char>
 FMT_CONSTEXPR ansi_color_escape<Char> make_background_color(
-    internal::color_type background) FMT_NOEXCEPT {
-  return ansi_color_escape<Char>(background, internal::data::background_color);
+    detail::color_type background) FMT_NOEXCEPT {
+  return ansi_color_escape<Char>(background, detail::data::background_color);
 }
 
 template <typename Char>
@@ -455,90 +455,71 @@ inline void fputs<wchar_t>(const wchar_t* chars, FILE* stream) FMT_NOEXCEPT {
 }
 
 template <typename Char> inline void reset_color(FILE* stream) FMT_NOEXCEPT {
-  fputs(internal::data::reset_color, stream);
+  fputs(detail::data::reset_color, stream);
 }
 
 template <> inline void reset_color<wchar_t>(FILE* stream) FMT_NOEXCEPT {
-  fputs(internal::data::wreset_color, stream);
+  fputs(detail::data::wreset_color, stream);
 }
 
 template <typename Char>
-inline void reset_color(basic_memory_buffer<Char>& buffer) FMT_NOEXCEPT {
+inline void reset_color(buffer<Char>& buffer) FMT_NOEXCEPT {
   const char* begin = data::reset_color;
   const char* end = begin + sizeof(data::reset_color) - 1;
   buffer.append(begin, end);
 }
 
 template <typename Char>
-std::basic_string<Char> vformat(const text_style& ts,
-                                basic_string_view<Char> format_str,
-                                basic_format_args<buffer_context<Char> > args) {
-  basic_memory_buffer<Char> buffer;
+void vformat_to(buffer<Char>& buf, const text_style& ts,
+                basic_string_view<Char> format_str,
+                basic_format_args<buffer_context<type_identity_t<Char>>> args) {
   bool has_style = false;
   if (ts.has_emphasis()) {
     has_style = true;
-    ansi_color_escape<Char> escape = make_emphasis<Char>(ts.get_emphasis());
-    buffer.append(escape.begin(), escape.end());
+    auto emphasis = detail::make_emphasis<Char>(ts.get_emphasis());
+    buf.append(emphasis.begin(), emphasis.end());
   }
   if (ts.has_foreground()) {
     has_style = true;
-    ansi_color_escape<Char> escape =
-        make_foreground_color<Char>(ts.get_foreground());
-    buffer.append(escape.begin(), escape.end());
+    auto foreground = detail::make_foreground_color<Char>(ts.get_foreground());
+    buf.append(foreground.begin(), foreground.end());
   }
   if (ts.has_background()) {
     has_style = true;
-    ansi_color_escape<Char> escape =
-        make_background_color<Char>(ts.get_background());
-    buffer.append(escape.begin(), escape.end());
+    auto background = detail::make_background_color<Char>(ts.get_background());
+    buf.append(background.begin(), background.end());
   }
-  internal::vformat_to(buffer, format_str, args);
-  if (has_style) {
-    reset_color<Char>(buffer);
-  }
-  return fmt::to_string(buffer);
+  detail::vformat_to(buf, format_str, args);
+  if (has_style) detail::reset_color<Char>(buf);
 }
-}  // namespace internal
+}  // namespace detail
 
-template <typename S, typename Char = char_t<S> >
+template <typename S, typename Char = char_t<S>>
 void vprint(std::FILE* f, const text_style& ts, const S& format,
-            basic_format_args<buffer_context<Char> > args) {
-  bool has_style = false;
-  if (ts.has_emphasis()) {
-    has_style = true;
-    internal::fputs<Char>(internal::make_emphasis<Char>(ts.get_emphasis()), f);
-  }
-  if (ts.has_foreground()) {
-    has_style = true;
-    internal::fputs<Char>(
-        internal::make_foreground_color<Char>(ts.get_foreground()), f);
-  }
-  if (ts.has_background()) {
-    has_style = true;
-    internal::fputs<Char>(
-        internal::make_background_color<Char>(ts.get_background()), f);
-  }
-  vprint(f, format, args);
-  if (has_style) {
-    internal::reset_color<Char>(f);
-  }
+            basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  basic_memory_buffer<Char> buf;
+  detail::vformat_to(buf, ts, to_string_view(format), args);
+  buf.push_back(Char(0));
+  detail::fputs(buf.data(), f);
 }
 
 /**
+  \rst
   Formats a string and prints it to the specified file stream using ANSI
   escape sequences to specify text formatting.
-  Example:
+
+  **Example**::
+
     fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
                "Elapsed time: {0:.2f} seconds", 1.23);
+  \endrst
  */
 template <typename S, typename... Args,
-          FMT_ENABLE_IF(internal::is_string<S>::value)>
+          FMT_ENABLE_IF(detail::is_string<S>::value)>
 void print(std::FILE* f, const text_style& ts, const S& format_str,
            const Args&... args) {
-  internal::check_format_string<Args...>(format_str);
-  using context = buffer_context<char_t<S> >;
-  format_arg_store<context, Args...> as{args...};
-  vprint(f, ts, format_str, basic_format_args<context>(as));
+  vprint(f, ts, format_str,
+         fmt::make_args_checked<Args...>(format_str, args...));
 }
 
 /**
@@ -549,16 +530,18 @@ void print(std::FILE* f, const text_style& ts, const S& format_str,
                "Elapsed time: {0:.2f} seconds", 1.23);
  */
 template <typename S, typename... Args,
-          FMT_ENABLE_IF(internal::is_string<S>::value)>
+          FMT_ENABLE_IF(detail::is_string<S>::value)>
 void print(const text_style& ts, const S& format_str, const Args&... args) {
   return print(stdout, ts, format_str, args...);
 }
 
-template <typename S, typename Char = char_t<S> >
+template <typename S, typename Char = char_t<S>>
 inline std::basic_string<Char> vformat(
     const text_style& ts, const S& format_str,
-    basic_format_args<buffer_context<Char> > args) {
-  return internal::vformat(ts, to_string_view(format_str), args);
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  basic_memory_buffer<Char> buf;
+  detail::vformat_to(buf, ts, to_string_view(format_str), args);
+  return fmt::to_string(buf);
 }
 
 /**
@@ -573,11 +556,46 @@ inline std::basic_string<Char> vformat(
                                       "The answer is {}", 42);
   \endrst
 */
-template <typename S, typename... Args, typename Char = char_t<S> >
+template <typename S, typename... Args, typename Char = char_t<S>>
 inline std::basic_string<Char> format(const text_style& ts, const S& format_str,
                                       const Args&... args) {
-  return internal::vformat(ts, to_string_view(format_str),
-                           {internal::make_args_checked(format_str, args...)});
+  return vformat(ts, to_string_view(format_str),
+                 fmt::make_args_checked<Args...>(format_str, args...));
+}
+
+/**
+  Formats a string with the given text_style and writes the output to ``out``.
+ */
+template <typename OutputIt, typename Char,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
+OutputIt vformat_to(
+    OutputIt out, const text_style& ts, basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
+  detail::vformat_to(buf, ts, format_str, args);
+  return detail::get_iterator(buf);
+}
+
+/**
+  \rst
+  Formats arguments with the given text_style, writes the result to the output
+  iterator ``out`` and returns the iterator past the end of the output range.
+
+  **Example**::
+
+    std::vector<char> out;
+    fmt::format_to(std::back_inserter(out),
+                   fmt::emphasis::bold | fg(fmt::color::red), "{}", 42);
+  \endrst
+*/
+template <typename OutputIt, typename S, typename... Args,
+          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value&&
+              detail::is_string<S>::value>
+inline auto format_to(OutputIt out, const text_style& ts, const S& format_str,
+                      Args&&... args) ->
+    typename std::enable_if<enable, OutputIt>::type {
+  return vformat_to(out, ts, to_string_view(format_str),
+                    fmt::make_args_checked<Args...>(format_str, args...));
 }
 
 FMT_END_NAMESPACE
diff --git a/include/vtkdiy2/thirdparty/fmt/compile.h b/include/vtkdiy2/thirdparty/fmt/compile.h
new file mode 100644
index 00000000000..3a33b02014c
--- /dev/null
+++ b/include/vtkdiy2/thirdparty/fmt/compile.h
@@ -0,0 +1,701 @@
+// Formatting library for C++ - experimental format string compilation
+//
+// Copyright (c) 2012 - present, Victor Zverovich and fmt contributors
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_COMPILE_H_
+#define FMT_COMPILE_H_
+
+#include <vector>
+
+#include "format.h"
+
+FMT_BEGIN_NAMESPACE
+namespace detail {
+
+// A compile-time string which is compiled into fast formatting code.
+class compiled_string {};
+
+template <typename S>
+struct is_compiled_string : std::is_base_of<compiled_string, S> {};
+
+/**
+  \rst
+  Converts a string literal *s* into a format string that will be parsed at
+  compile time and converted into efficient formatting code. Requires C++17
+  ``constexpr if`` compiler support.
+
+  **Example**::
+
+    // Converts 42 into std::string using the most efficient method and no
+    // runtime format string processing.
+    std::string s = fmt::format(FMT_COMPILE("{}"), 42);
+  \endrst
+ */
+#define FMT_COMPILE(s) FMT_STRING_IMPL(s, fmt::detail::compiled_string)
+
+template <typename T, typename... Tail>
+const T& first(const T& value, const Tail&...) {
+  return value;
+}
+
+// Part of a compiled format string. It can be either literal text or a
+// replacement field.
+template <typename Char> struct format_part {
+  enum class kind { arg_index, arg_name, text, replacement };
+
+  struct replacement {
+    arg_ref<Char> arg_id;
+    dynamic_format_specs<Char> specs;
+  };
+
+  kind part_kind;
+  union value {
+    int arg_index;
+    basic_string_view<Char> str;
+    replacement repl;
+
+    FMT_CONSTEXPR value(int index = 0) : arg_index(index) {}
+    FMT_CONSTEXPR value(basic_string_view<Char> s) : str(s) {}
+    FMT_CONSTEXPR value(replacement r) : repl(r) {}
+  } val;
+  // Position past the end of the argument id.
+  const Char* arg_id_end = nullptr;
+
+  FMT_CONSTEXPR format_part(kind k = kind::arg_index, value v = {})
+      : part_kind(k), val(v) {}
+
+  static FMT_CONSTEXPR format_part make_arg_index(int index) {
+    return format_part(kind::arg_index, index);
+  }
+  static FMT_CONSTEXPR format_part make_arg_name(basic_string_view<Char> name) {
+    return format_part(kind::arg_name, name);
+  }
+  static FMT_CONSTEXPR format_part make_text(basic_string_view<Char> text) {
+    return format_part(kind::text, text);
+  }
+  static FMT_CONSTEXPR format_part make_replacement(replacement repl) {
+    return format_part(kind::replacement, repl);
+  }
+};
+
+template <typename Char> struct part_counter {
+  unsigned num_parts = 0;
+
+  FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
+    if (begin != end) ++num_parts;
+  }
+
+  FMT_CONSTEXPR int on_arg_id() { return ++num_parts, 0; }
+  FMT_CONSTEXPR int on_arg_id(int) { return ++num_parts, 0; }
+  FMT_CONSTEXPR int on_arg_id(basic_string_view<Char>) {
+    return ++num_parts, 0;
+  }
+
+  FMT_CONSTEXPR void on_replacement_field(int, const Char*) {}
+
+  FMT_CONSTEXPR const Char* on_format_specs(int, const Char* begin,
+                                            const Char* end) {
+    // Find the matching brace.
+    unsigned brace_counter = 0;
+    for (; begin != end; ++begin) {
+      if (*begin == '{') {
+        ++brace_counter;
+      } else if (*begin == '}') {
+        if (brace_counter == 0u) break;
+        --brace_counter;
+      }
+    }
+    return begin;
+  }
+
+  FMT_CONSTEXPR void on_error(const char*) {}
+};
+
+// Counts the number of parts in a format string.
+template <typename Char>
+FMT_CONSTEXPR unsigned count_parts(basic_string_view<Char> format_str) {
+  part_counter<Char> counter;
+  parse_format_string<true>(format_str, counter);
+  return counter.num_parts;
+}
+
+template <typename Char, typename PartHandler>
+class format_string_compiler : public error_handler {
+ private:
+  using part = format_part<Char>;
+
+  PartHandler handler_;
+  part part_;
+  basic_string_view<Char> format_str_;
+  basic_format_parse_context<Char> parse_context_;
+
+ public:
+  FMT_CONSTEXPR format_string_compiler(basic_string_view<Char> format_str,
+                                       PartHandler handler)
+      : handler_(handler),
+        format_str_(format_str),
+        parse_context_(format_str) {}
+
+  FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
+    if (begin != end)
+      handler_(part::make_text({begin, to_unsigned(end - begin)}));
+  }
+
+  FMT_CONSTEXPR int on_arg_id() {
+    part_ = part::make_arg_index(parse_context_.next_arg_id());
+    return 0;
+  }
+
+  FMT_CONSTEXPR int on_arg_id(int id) {
+    parse_context_.check_arg_id(id);
+    part_ = part::make_arg_index(id);
+    return 0;
+  }
+
+  FMT_CONSTEXPR int on_arg_id(basic_string_view<Char> id) {
+    part_ = part::make_arg_name(id);
+    return 0;
+  }
+
+  FMT_CONSTEXPR void on_replacement_field(int, const Char* ptr) {
+    part_.arg_id_end = ptr;
+    handler_(part_);
+  }
+
+  FMT_CONSTEXPR const Char* on_format_specs(int, const Char* begin,
+                                            const Char* end) {
+    auto repl = typename part::replacement();
+    dynamic_specs_handler<basic_format_parse_context<Char>> handler(
+        repl.specs, parse_context_);
+    auto it = parse_format_specs(begin, end, handler);
+    if (*it != '}') on_error("missing '}' in format string");
+    repl.arg_id = part_.part_kind == part::kind::arg_index
+                      ? arg_ref<Char>(part_.val.arg_index)
+                      : arg_ref<Char>(part_.val.str);
+    auto part = part::make_replacement(repl);
+    part.arg_id_end = begin;
+    handler_(part);
+    return it;
+  }
+};
+
+// Compiles a format string and invokes handler(part) for each parsed part.
+template <bool IS_CONSTEXPR, typename Char, typename PartHandler>
+FMT_CONSTEXPR void compile_format_string(basic_string_view<Char> format_str,
+                                         PartHandler handler) {
+  parse_format_string<IS_CONSTEXPR>(
+      format_str,
+      format_string_compiler<Char, PartHandler>(format_str, handler));
+}
+
+template <typename OutputIt, typename Context, typename Id>
+void format_arg(
+    basic_format_parse_context<typename Context::char_type>& parse_ctx,
+    Context& ctx, Id arg_id) {
+  ctx.advance_to(visit_format_arg(
+      arg_formatter<OutputIt, typename Context::char_type>(ctx, &parse_ctx),
+      ctx.arg(arg_id)));
+}
+
+// vformat_to is defined in a subnamespace to prevent ADL.
+namespace cf {
+template <typename Context, typename OutputIt, typename CompiledFormat>
+auto vformat_to(OutputIt out, CompiledFormat& cf,
+                basic_format_args<Context> args) -> typename Context::iterator {
+  using char_type = typename Context::char_type;
+  basic_format_parse_context<char_type> parse_ctx(
+      to_string_view(cf.format_str_));
+  Context ctx(out, args);
+
+  const auto& parts = cf.parts();
+  for (auto part_it = std::begin(parts); part_it != std::end(parts);
+       ++part_it) {
+    const auto& part = *part_it;
+    const auto& value = part.val;
+
+    using format_part_t = format_part<char_type>;
+    switch (part.part_kind) {
+    case format_part_t::kind::text: {
+      const auto text = value.str;
+      auto output = ctx.out();
+      auto&& it = reserve(output, text.size());
+      it = std::copy_n(text.begin(), text.size(), it);
+      ctx.advance_to(output);
+      break;
+    }
+
+    case format_part_t::kind::arg_index:
+      advance_to(parse_ctx, part.arg_id_end);
+      detail::format_arg<OutputIt>(parse_ctx, ctx, value.arg_index);
+      break;
+
+    case format_part_t::kind::arg_name:
+      advance_to(parse_ctx, part.arg_id_end);
+      detail::format_arg<OutputIt>(parse_ctx, ctx, value.str);
+      break;
+
+    case format_part_t::kind::replacement: {
+      const auto& arg_id_value = value.repl.arg_id.val;
+      const auto arg = value.repl.arg_id.kind == arg_id_kind::index
+                           ? ctx.arg(arg_id_value.index)
+                           : ctx.arg(arg_id_value.name);
+
+      auto specs = value.repl.specs;
+
+      handle_dynamic_spec<width_checker>(specs.width, specs.width_ref, ctx);
+      handle_dynamic_spec<precision_checker>(specs.precision,
+                                             specs.precision_ref, ctx);
+
+      error_handler h;
+      numeric_specs_checker<error_handler> checker(h, arg.type());
+      if (specs.align == align::numeric) checker.require_numeric_argument();
+      if (specs.sign != sign::none) checker.check_sign();
+      if (specs.alt) checker.require_numeric_argument();
+      if (specs.precision >= 0) checker.check_precision();
+
+      advance_to(parse_ctx, part.arg_id_end);
+      ctx.advance_to(
+          visit_format_arg(arg_formatter<OutputIt, typename Context::char_type>(
+                               ctx, nullptr, &specs),
+                           arg));
+      break;
+    }
+    }
+  }
+  return ctx.out();
+}
+}  // namespace cf
+
+struct basic_compiled_format {};
+
+template <typename S, typename = void>
+struct compiled_format_base : basic_compiled_format {
+  using char_type = char_t<S>;
+  using parts_container = std::vector<detail::format_part<char_type>>;
+
+  parts_container compiled_parts;
+
+  explicit compiled_format_base(basic_string_view<char_type> format_str) {
+    compile_format_string<false>(format_str,
+                                 [this](const format_part<char_type>& part) {
+                                   compiled_parts.push_back(part);
+                                 });
+  }
+
+  const parts_container& parts() const { return compiled_parts; }
+};
+
+template <typename Char, unsigned N> struct format_part_array {
+  format_part<Char> data[N] = {};
+  FMT_CONSTEXPR format_part_array() = default;
+};
+
+template <typename Char, unsigned N>
+FMT_CONSTEXPR format_part_array<Char, N> compile_to_parts(
+    basic_string_view<Char> format_str) {
+  format_part_array<Char, N> parts;
+  unsigned counter = 0;
+  // This is not a lambda for compatibility with older compilers.
+  struct {
+    format_part<Char>* parts;
+    unsigned* counter;
+    FMT_CONSTEXPR void operator()(const format_part<Char>& part) {
+      parts[(*counter)++] = part;
+    }
+  } collector{parts.data, &counter};
+  compile_format_string<true>(format_str, collector);
+  if (counter < N) {
+    parts.data[counter] =
+        format_part<Char>::make_text(basic_string_view<Char>());
+  }
+  return parts;
+}
+
+template <typename T> constexpr const T& constexpr_max(const T& a, const T& b) {
+  return (a < b) ? b : a;
+}
+
+template <typename S>
+struct compiled_format_base<S, enable_if_t<is_compile_string<S>::value>>
+    : basic_compiled_format {
+  using char_type = char_t<S>;
+
+  FMT_CONSTEXPR explicit compiled_format_base(basic_string_view<char_type>) {}
+
+// Workaround for old compilers. Format string compilation will not be
+// performed there anyway.
+#if FMT_USE_CONSTEXPR
+  static FMT_CONSTEXPR_DECL const unsigned num_format_parts =
+      constexpr_max(count_parts(to_string_view(S())), 1u);
+#else
+  static const unsigned num_format_parts = 1;
+#endif
+
+  using parts_container = format_part<char_type>[num_format_parts];
+
+  const parts_container& parts() const {
+    static FMT_CONSTEXPR_DECL const auto compiled_parts =
+        compile_to_parts<char_type, num_format_parts>(
+            detail::to_string_view(S()));
+    return compiled_parts.data;
+  }
+};
+
+template <typename S, typename... Args>
+class compiled_format : private compiled_format_base<S> {
+ public:
+  using typename compiled_format_base<S>::char_type;
+
+ private:
+  basic_string_view<char_type> format_str_;
+
+  template <typename Context, typename OutputIt, typename CompiledFormat>
+  friend auto cf::vformat_to(OutputIt out, CompiledFormat& cf,
+                             basic_format_args<Context> args) ->
+      typename Context::iterator;
+
+ public:
+  compiled_format() = delete;
+  explicit constexpr compiled_format(basic_string_view<char_type> format_str)
+      : compiled_format_base<S>(format_str), format_str_(format_str) {}
+};
+
+#ifdef __cpp_if_constexpr
+template <typename... Args> struct type_list {};
+
+// Returns a reference to the argument at index N from [first, rest...].
+template <int N, typename T, typename... Args>
+constexpr const auto& get([[maybe_unused]] const T& first,
+                          [[maybe_unused]] const Args&... rest) {
+  static_assert(N < 1 + sizeof...(Args), "index is out of bounds");
+  if constexpr (N == 0)
+    return first;
+  else
+    return get<N - 1>(rest...);
+}
+
+template <int N, typename> struct get_type_impl;
+
+template <int N, typename... Args> struct get_type_impl<N, type_list<Args...>> {
+  using type = remove_cvref_t<decltype(get<N>(std::declval<Args>()...))>;
+};
+
+template <int N, typename T>
+using get_type = typename get_type_impl<N, T>::type;
+
+template <typename T> struct is_compiled_format : std::false_type {};
+
+template <typename Char> struct text {
+  basic_string_view<Char> data;
+  using char_type = Char;
+
+  template <typename OutputIt, typename... Args>
+  OutputIt format(OutputIt out, const Args&...) const {
+    return write<Char>(out, data);
+  }
+};
+
+template <typename Char>
+struct is_compiled_format<text<Char>> : std::true_type {};
+
+template <typename Char>
+constexpr text<Char> make_text(basic_string_view<Char> s, size_t pos,
+                               size_t size) {
+  return {{&s[pos], size}};
+}
+
+template <typename Char> struct code_unit {
+  Char value;
+  using char_type = Char;
+
+  template <typename OutputIt, typename... Args>
+  OutputIt format(OutputIt out, const Args&...) const {
+    return write<Char>(out, value);
+  }
+};
+
+template <typename Char>
+struct is_compiled_format<code_unit<Char>> : std::true_type {};
+
+// A replacement field that refers to argument N.
+template <typename Char, typename T, int N> struct field {
+  using char_type = Char;
+
+  template <typename OutputIt, typename... Args>
+  OutputIt format(OutputIt out, const Args&... args) const {
+    // This ensures that the argument type is convertile to `const T&`.
+    const T& arg = get<N>(args...);
+    return write<Char>(out, arg);
+  }
+};
+
+template <typename Char, typename T, int N>
+struct is_compiled_format<field<Char, T, N>> : std::true_type {};
+
+// A replacement field that refers to argument N and has format specifiers.
+template <typename Char, typename T, int N> struct spec_field {
+  using char_type = Char;
+  mutable formatter<T, Char> fmt;
+
+  template <typename OutputIt, typename... Args>
+  OutputIt format(OutputIt out, const Args&... args) const {
+    // This ensures that the argument type is convertile to `const T&`.
+    const T& arg = get<N>(args...);
+    const auto& vargs =
+        make_format_args<basic_format_context<OutputIt, Char>>(args...);
+    basic_format_context<OutputIt, Char> ctx(out, vargs);
+    return fmt.format(arg, ctx);
+  }
+};
+
+template <typename Char, typename T, int N>
+struct is_compiled_format<spec_field<Char, T, N>> : std::true_type {};
+
+template <typename L, typename R> struct concat {
+  L lhs;
+  R rhs;
+  using char_type = typename L::char_type;
+
+  template <typename OutputIt, typename... Args>
+  OutputIt format(OutputIt out, const Args&... args) const {
+    out = lhs.format(out, args...);
+    return rhs.format(out, args...);
+  }
+};
+
+template <typename L, typename R>
+struct is_compiled_format<concat<L, R>> : std::true_type {};
+
+template <typename L, typename R>
+constexpr concat<L, R> make_concat(L lhs, R rhs) {
+  return {lhs, rhs};
+}
+
+struct unknown_format {};
+
+template <typename Char>
+constexpr size_t parse_text(basic_string_view<Char> str, size_t pos) {
+  for (size_t size = str.size(); pos != size; ++pos) {
+    if (str[pos] == '{' || str[pos] == '}') break;
+  }
+  return pos;
+}
+
+template <typename Args, size_t POS, int ID, typename S>
+constexpr auto compile_format_string(S format_str);
+
+template <typename Args, size_t POS, int ID, typename T, typename S>
+constexpr auto parse_tail(T head, S format_str) {
+  if constexpr (POS !=
+                basic_string_view<typename S::char_type>(format_str).size()) {
+    constexpr auto tail = compile_format_string<Args, POS, ID>(format_str);
+    if constexpr (std::is_same<remove_cvref_t<decltype(tail)>,
+                               unknown_format>())
+      return tail;
+    else
+      return make_concat(head, tail);
+  } else {
+    return head;
+  }
+}
+
+template <typename T, typename Char> struct parse_specs_result {
+  formatter<T, Char> fmt;
+  size_t end;
+  int next_arg_id;
+};
+
+template <typename T, typename Char>
+constexpr parse_specs_result<T, Char> parse_specs(basic_string_view<Char> str,
+                                                  size_t pos, int arg_id) {
+  str.remove_prefix(pos);
+  auto ctx = basic_format_parse_context<Char>(str, {}, arg_id + 1);
+  auto f = formatter<T, Char>();
+  auto end = f.parse(ctx);
+  return {f, pos + (end - str.data()) + 1, ctx.next_arg_id()};
+}
+
+// Compiles a non-empty format string and returns the compiled representation
+// or unknown_format() on unrecognized input.
+template <typename Args, size_t POS, int ID, typename S>
+constexpr auto compile_format_string(S format_str) {
+  using char_type = typename S::char_type;
+  constexpr basic_string_view<char_type> str = format_str;
+  if constexpr (str[POS] == '{') {
+    if (POS + 1 == str.size())
+      throw format_error("unmatched '{' in format string");
+    if constexpr (str[POS + 1] == '{') {
+      return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), format_str);
+    } else if constexpr (str[POS + 1] == '}') {
+      using type = get_type<ID, Args>;
+      return parse_tail<Args, POS + 2, ID + 1>(field<char_type, type, ID>(),
+                                               format_str);
+    } else if constexpr (str[POS + 1] == ':') {
+      using type = get_type<ID, Args>;
+      constexpr auto result = parse_specs<type>(str, POS + 2, ID);
+      return parse_tail<Args, result.end, result.next_arg_id>(
+          spec_field<char_type, type, ID>{result.fmt}, format_str);
+    } else {
+      return unknown_format();
+    }
+  } else if constexpr (str[POS] == '}') {
+    if (POS + 1 == str.size())
+      throw format_error("unmatched '}' in format string");
+    return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), format_str);
+  } else {
+    constexpr auto end = parse_text(str, POS + 1);
+    if constexpr (end - POS > 1) {
+      return parse_tail<Args, end, ID>(make_text(str, POS, end - POS),
+                                       format_str);
+    } else {
+      return parse_tail<Args, end, ID>(code_unit<char_type>{str[POS]},
+                                       format_str);
+    }
+  }
+}
+
+template <typename... Args, typename S,
+          FMT_ENABLE_IF(is_compile_string<S>::value ||
+                        detail::is_compiled_string<S>::value)>
+constexpr auto compile(S format_str) {
+  constexpr basic_string_view<typename S::char_type> str = format_str;
+  if constexpr (str.size() == 0) {
+    return detail::make_text(str, 0, 0);
+  } else {
+    constexpr auto result =
+        detail::compile_format_string<detail::type_list<Args...>, 0, 0>(
+            format_str);
+    if constexpr (std::is_same<remove_cvref_t<decltype(result)>,
+                               detail::unknown_format>()) {
+      return detail::compiled_format<S, Args...>(to_string_view(format_str));
+    } else {
+      return result;
+    }
+  }
+}
+#else
+template <typename... Args, typename S,
+          FMT_ENABLE_IF(is_compile_string<S>::value)>
+constexpr auto compile(S format_str) -> detail::compiled_format<S, Args...> {
+  return detail::compiled_format<S, Args...>(to_string_view(format_str));
+}
+#endif  // __cpp_if_constexpr
+
+// Compiles the format string which must be a string literal.
+template <typename... Args, typename Char, size_t N>
+auto compile(const Char (&format_str)[N])
+    -> detail::compiled_format<const Char*, Args...> {
+  return detail::compiled_format<const Char*, Args...>(
+      basic_string_view<Char>(format_str, N - 1));
+}
+}  // namespace detail
+
+// DEPRECATED! use FMT_COMPILE instead.
+template <typename... Args>
+FMT_DEPRECATED auto compile(const Args&... args)
+    -> decltype(detail::compile(args...)) {
+  return detail::compile(args...);
+}
+
+#if FMT_USE_CONSTEXPR
+#  ifdef __cpp_if_constexpr
+
+template <typename CompiledFormat, typename... Args,
+          typename Char = typename CompiledFormat::char_type,
+          FMT_ENABLE_IF(detail::is_compiled_format<CompiledFormat>::value)>
+FMT_INLINE std::basic_string<Char> format(const CompiledFormat& cf,
+                                          const Args&... args) {
+  basic_memory_buffer<Char> buffer;
+  cf.format(detail::buffer_appender<Char>(buffer), args...);
+  return to_string(buffer);
+}
+
+template <typename OutputIt, typename CompiledFormat, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_format<CompiledFormat>::value)>
+OutputIt format_to(OutputIt out, const CompiledFormat& cf,
+                   const Args&... args) {
+  return cf.format(out, args...);
+}
+#  endif  // __cpp_if_constexpr
+#endif    // FMT_USE_CONSTEXPR
+
+template <typename CompiledFormat, typename... Args,
+          typename Char = typename CompiledFormat::char_type,
+          FMT_ENABLE_IF(std::is_base_of<detail::basic_compiled_format,
+                                        CompiledFormat>::value)>
+std::basic_string<Char> format(const CompiledFormat& cf, const Args&... args) {
+  basic_memory_buffer<Char> buffer;
+  using context = buffer_context<Char>;
+  detail::cf::vformat_to<context>(detail::buffer_appender<Char>(buffer), cf,
+                                  make_format_args<context>(args...));
+  return to_string(buffer);
+}
+
+template <typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+FMT_INLINE std::basic_string<typename S::char_type> format(const S&,
+                                                           Args&&... args) {
+#ifdef __cpp_if_constexpr
+  if constexpr (std::is_same<typename S::char_type, char>::value) {
+    constexpr basic_string_view<typename S::char_type> str = S();
+    if (str.size() == 2 && str[0] == '{' && str[1] == '}')
+      return fmt::to_string(detail::first(args...));
+  }
+#endif
+  constexpr auto compiled = detail::compile<Args...>(S());
+  return format(compiled, std::forward<Args>(args)...);
+}
+
+template <typename OutputIt, typename CompiledFormat, typename... Args,
+          FMT_ENABLE_IF(std::is_base_of<detail::basic_compiled_format,
+                                        CompiledFormat>::value)>
+OutputIt format_to(OutputIt out, const CompiledFormat& cf,
+                   const Args&... args) {
+  using char_type = typename CompiledFormat::char_type;
+  using context = format_context_t<OutputIt, char_type>;
+  return detail::cf::vformat_to<context>(out, cf,
+                                         make_format_args<context>(args...));
+}
+
+template <typename OutputIt, typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+OutputIt format_to(OutputIt out, const S&, const Args&... args) {
+  constexpr auto compiled = detail::compile<Args...>(S());
+  return format_to(out, compiled, args...);
+}
+
+template <typename OutputIt, typename CompiledFormat, typename... Args>
+auto format_to_n(OutputIt out, size_t n, const CompiledFormat& cf,
+                 const Args&... args) ->
+    typename std::enable_if<
+        detail::is_output_iterator<OutputIt,
+                                   typename CompiledFormat::char_type>::value &&
+            std::is_base_of<detail::basic_compiled_format,
+                            CompiledFormat>::value,
+        format_to_n_result<OutputIt>>::type {
+  auto it =
+      format_to(detail::truncating_iterator<OutputIt>(out, n), cf, args...);
+  return {it.base(), it.count()};
+}
+
+template <typename OutputIt, typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n, const S&,
+                                         const Args&... args) {
+  constexpr auto compiled = detail::compile<Args...>(S());
+  auto it = format_to(detail::truncating_iterator<OutputIt>(out, n), compiled,
+                      args...);
+  return {it.base(), it.count()};
+}
+
+template <typename CompiledFormat, typename... Args>
+size_t formatted_size(const CompiledFormat& cf, const Args&... args) {
+  return format_to(detail::counting_iterator(), cf, args...).count();
+}
+
+FMT_END_NAMESPACE
+
+#endif  // FMT_COMPILE_H_
diff --git a/include/vtkdiy2/thirdparty/fmt/core.h b/include/vtkdiy2/thirdparty/fmt/core.h
new file mode 100644
index 00000000000..0a81e0ccd95
--- /dev/null
+++ b/include/vtkdiy2/thirdparty/fmt/core.h
@@ -0,0 +1,2122 @@
+// Formatting library for C++ - the core API
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_CORE_H_
+#define FMT_CORE_H_
+
+#include <cstdio>  // std::FILE
+#include <cstring>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+// The fmt library version in the form major * 10000 + minor * 100 + patch.
+#define FMT_VERSION 70103
+
+#ifdef __clang__
+#  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
+#else
+#  define FMT_CLANG_VERSION 0
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#  define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#  define FMT_GCC_VERSION 0
+#endif
+
+#if defined(__INTEL_COMPILER)
+#  define FMT_ICC_VERSION __INTEL_COMPILER
+#else
+#  define FMT_ICC_VERSION 0
+#endif
+
+#if __cplusplus >= 201103L || defined(__GXX_EXPERIMENTAL_CXX0X__)
+#  define FMT_HAS_GXX_CXX11 FMT_GCC_VERSION
+#else
+#  define FMT_HAS_GXX_CXX11 0
+#endif
+
+#ifdef __NVCC__
+#  define FMT_NVCC __NVCC__
+#else
+#  define FMT_NVCC 0
+#endif
+
+#ifdef _MSC_VER
+#  define FMT_MSC_VER _MSC_VER
+#  define FMT_SUPPRESS_MSC_WARNING(n) __pragma(warning(suppress : n))
+#else
+#  define FMT_MSC_VER 0
+#  define FMT_SUPPRESS_MSC_WARNING(n)
+#endif
+
+#ifdef __has_feature
+#  define FMT_HAS_FEATURE(x) __has_feature(x)
+#else
+#  define FMT_HAS_FEATURE(x) 0
+#endif
+
+#if defined(__has_include) && !defined(__INTELLISENSE__) && \
+    (!FMT_ICC_VERSION || FMT_ICC_VERSION >= 1600)
+#  define FMT_HAS_INCLUDE(x) __has_include(x)
+#else
+#  define FMT_HAS_INCLUDE(x) 0
+#endif
+
+#ifdef __has_cpp_attribute
+#  define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+#  define FMT_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+#define FMT_HAS_CPP14_ATTRIBUTE(attribute) \
+  (__cplusplus >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+
+#define FMT_HAS_CPP17_ATTRIBUTE(attribute) \
+  (__cplusplus >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+
+// Check if relaxed C++14 constexpr is supported.
+// GCC doesn't allow throw in constexpr until version 6 (bug 67371).
+#ifndef FMT_USE_CONSTEXPR
+#  define FMT_USE_CONSTEXPR                                           \
+    (FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VER >= 1910 || \
+     (FMT_GCC_VERSION >= 600 && __cplusplus >= 201402L)) &&           \
+        !FMT_NVCC && !FMT_ICC_VERSION
+#endif
+#if FMT_USE_CONSTEXPR
+#  define FMT_CONSTEXPR constexpr
+#  define FMT_CONSTEXPR_DECL constexpr
+#else
+#  define FMT_CONSTEXPR inline
+#  define FMT_CONSTEXPR_DECL
+#endif
+
+#ifndef FMT_OVERRIDE
+#  if FMT_HAS_FEATURE(cxx_override_control) || \
+      (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1900
+#    define FMT_OVERRIDE override
+#  else
+#    define FMT_OVERRIDE
+#  endif
+#endif
+
+// Check if exceptions are disabled.
+#ifndef FMT_EXCEPTIONS
+#  if (defined(__GNUC__) && !defined(__EXCEPTIONS)) || \
+      FMT_MSC_VER && !_HAS_EXCEPTIONS
+#    define FMT_EXCEPTIONS 0
+#  else
+#    define FMT_EXCEPTIONS 1
+#  endif
+#endif
+
+// Define FMT_USE_NOEXCEPT to make fmt use noexcept (C++11 feature).
+#ifndef FMT_USE_NOEXCEPT
+#  define FMT_USE_NOEXCEPT 0
+#endif
+
+#if FMT_USE_NOEXCEPT || FMT_HAS_FEATURE(cxx_noexcept) || \
+    (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1900
+#  define FMT_DETECTED_NOEXCEPT noexcept
+#  define FMT_HAS_CXX11_NOEXCEPT 1
+#else
+#  define FMT_DETECTED_NOEXCEPT throw()
+#  define FMT_HAS_CXX11_NOEXCEPT 0
+#endif
+
+#ifndef FMT_NOEXCEPT
+#  if FMT_EXCEPTIONS || FMT_HAS_CXX11_NOEXCEPT
+#    define FMT_NOEXCEPT FMT_DETECTED_NOEXCEPT
+#  else
+#    define FMT_NOEXCEPT
+#  endif
+#endif
+
+// [[noreturn]] is disabled on MSVC and NVCC because of bogus unreachable code
+// warnings.
+#if FMT_EXCEPTIONS && FMT_HAS_CPP_ATTRIBUTE(noreturn) && !FMT_MSC_VER && \
+    !FMT_NVCC
+#  define FMT_NORETURN [[noreturn]]
+#else
+#  define FMT_NORETURN
+#endif
+
+#ifndef FMT_DEPRECATED
+#  if FMT_HAS_CPP14_ATTRIBUTE(deprecated) || FMT_MSC_VER >= 1900
+#    define FMT_DEPRECATED [[deprecated]]
+#  else
+#    if (defined(__GNUC__) && !defined(__LCC__)) || defined(__clang__)
+#      define FMT_DEPRECATED __attribute__((deprecated))
+#    elif FMT_MSC_VER
+#      define FMT_DEPRECATED __declspec(deprecated)
+#    else
+#      define FMT_DEPRECATED /* deprecated */
+#    endif
+#  endif
+#endif
+
+// Workaround broken [[deprecated]] in the Intel, PGI and NVCC compilers.
+#if FMT_ICC_VERSION || defined(__PGI) || FMT_NVCC
+#  define FMT_DEPRECATED_ALIAS
+#else
+#  define FMT_DEPRECATED_ALIAS FMT_DEPRECATED
+#endif
+
+#ifndef FMT_INLINE
+#  if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#    define FMT_INLINE inline __attribute__((always_inline))
+#  else
+#    define FMT_INLINE inline
+#  endif
+#endif
+
+#ifndef FMT_USE_INLINE_NAMESPACES
+#  if FMT_HAS_FEATURE(cxx_inline_namespaces) || FMT_GCC_VERSION >= 404 || \
+      (FMT_MSC_VER >= 1900 && !_MANAGED)
+#    define FMT_USE_INLINE_NAMESPACES 1
+#  else
+#    define FMT_USE_INLINE_NAMESPACES 0
+#  endif
+#endif
+
+#ifndef FMT_BEGIN_NAMESPACE
+#  if FMT_USE_INLINE_NAMESPACES
+#    define FMT_INLINE_NAMESPACE inline namespace
+#    define FMT_END_NAMESPACE \
+      }                       \
+      }
+#  else
+#    define FMT_INLINE_NAMESPACE namespace
+#    define FMT_END_NAMESPACE \
+      }                       \
+      using namespace v7;     \
+      }
+#  endif
+#  define FMT_BEGIN_NAMESPACE \
+    namespace fmt {           \
+    FMT_INLINE_NAMESPACE v7 {
+#endif
+
+#if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
+#  define FMT_CLASS_API FMT_SUPPRESS_MSC_WARNING(4275)
+#  ifdef FMT_EXPORT
+#    define FMT_API __declspec(dllexport)
+#    define FMT_EXTERN_TEMPLATE_API FMT_API
+#    define FMT_EXPORTED
+#  elif defined(FMT_SHARED)
+#    define FMT_API __declspec(dllimport)
+#    define FMT_EXTERN_TEMPLATE_API FMT_API
+#  endif
+#else
+#  define FMT_CLASS_API
+#endif
+#ifndef FMT_API
+#  define FMT_API
+#endif
+#ifndef FMT_EXTERN_TEMPLATE_API
+#  define FMT_EXTERN_TEMPLATE_API
+#endif
+#ifndef FMT_INSTANTIATION_DEF_API
+#  define FMT_INSTANTIATION_DEF_API FMT_API
+#endif
+
+#ifndef FMT_HEADER_ONLY
+#  define FMT_EXTERN extern
+#else
+#  define FMT_EXTERN
+#endif
+
+// libc++ supports string_view in pre-c++17.
+#if (FMT_HAS_INCLUDE(<string_view>) &&                       \
+     (__cplusplus > 201402L || defined(_LIBCPP_VERSION))) || \
+    (defined(_MSVC_LANG) && _MSVC_LANG > 201402L && _MSC_VER >= 1910)
+#  include <string_view>
+#  define FMT_USE_STRING_VIEW
+#elif FMT_HAS_INCLUDE("experimental/string_view") && __cplusplus >= 201402L
+#  include <experimental/string_view>
+#  define FMT_USE_EXPERIMENTAL_STRING_VIEW
+#endif
+
+#ifndef FMT_UNICODE
+#  define FMT_UNICODE !FMT_MSC_VER
+#endif
+#if FMT_UNICODE && FMT_MSC_VER
+#  pragma execution_character_set("utf-8")
+#endif
+
+FMT_BEGIN_NAMESPACE
+
+// Implementations of enable_if_t and other metafunctions for older systems.
+template <bool B, class T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+template <bool B, class T, class F>
+using conditional_t = typename std::conditional<B, T, F>::type;
+template <bool B> using bool_constant = std::integral_constant<bool, B>;
+template <typename T>
+using remove_reference_t = typename std::remove_reference<T>::type;
+template <typename T>
+using remove_const_t = typename std::remove_const<T>::type;
+template <typename T>
+using remove_cvref_t = typename std::remove_cv<remove_reference_t<T>>::type;
+template <typename T> struct type_identity { using type = T; };
+template <typename T> using type_identity_t = typename type_identity<T>::type;
+
+struct monostate {};
+
+// An enable_if helper to be used in template parameters which results in much
+// shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed
+// to workaround a bug in MSVC 2019 (see #1140 and #1186).
+#define FMT_ENABLE_IF(...) enable_if_t<(__VA_ARGS__), int> = 0
+
+namespace detail {
+
+// A helper function to suppress "conditional expression is constant" warnings.
+template <typename T> constexpr T const_check(T value) { return value; }
+
+FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
+                                      const char* message);
+
+#ifndef FMT_ASSERT
+#  ifdef NDEBUG
+// FMT_ASSERT is not empty to avoid -Werror=empty-body.
+#    define FMT_ASSERT(condition, message) ((void)0)
+#  else
+#    define FMT_ASSERT(condition, message)                                    \
+      ((condition) /* void() fails with -Winvalid-constexpr on clang 4.0.1 */ \
+           ? (void)0                                                          \
+           : ::fmt::detail::assert_fail(__FILE__, __LINE__, (message)))
+#  endif
+#endif
+
+#if defined(FMT_USE_STRING_VIEW)
+template <typename Char> using std_string_view = std::basic_string_view<Char>;
+#elif defined(FMT_USE_EXPERIMENTAL_STRING_VIEW)
+template <typename Char>
+using std_string_view = std::experimental::basic_string_view<Char>;
+#else
+template <typename T> struct std_string_view {};
+#endif
+
+#ifdef FMT_USE_INT128
+// Do nothing.
+#elif defined(__SIZEOF_INT128__) && !FMT_NVCC && \
+    !(FMT_CLANG_VERSION && FMT_MSC_VER)
+#  define FMT_USE_INT128 1
+using int128_t = __int128_t;
+using uint128_t = __uint128_t;
+#else
+#  define FMT_USE_INT128 0
+#endif
+#if !FMT_USE_INT128
+struct int128_t {};
+struct uint128_t {};
+#endif
+
+// Casts a nonnegative integer to unsigned.
+template <typename Int>
+FMT_CONSTEXPR typename std::make_unsigned<Int>::type to_unsigned(Int value) {
+  FMT_ASSERT(value >= 0, "negative value");
+  return static_cast<typename std::make_unsigned<Int>::type>(value);
+}
+
+FMT_SUPPRESS_MSC_WARNING(4566) constexpr unsigned char micro[] = "\u00B5";
+
+template <typename Char> constexpr bool is_unicode() {
+  return FMT_UNICODE || sizeof(Char) != 1 ||
+         (sizeof(micro) == 3 && micro[0] == 0xC2 && micro[1] == 0xB5);
+}
+
+#ifdef __cpp_char8_t
+using char8_type = char8_t;
+#else
+enum char8_type : unsigned char {};
+#endif
+}  // namespace detail
+
+#ifdef FMT_USE_INTERNAL
+namespace internal = detail;  // DEPRECATED
+#endif
+
+/**
+  An implementation of ``std::basic_string_view`` for pre-C++17. It provides a
+  subset of the API. ``fmt::basic_string_view`` is used for format strings even
+  if ``std::string_view`` is available to prevent issues when a library is
+  compiled with a different ``-std`` option than the client code (which is not
+  recommended).
+ */
+template <typename Char> class basic_string_view {
+ private:
+  const Char* data_;
+  size_t size_;
+
+ public:
+  using value_type = Char;
+  using iterator = const Char*;
+
+  constexpr basic_string_view() FMT_NOEXCEPT : data_(nullptr), size_(0) {}
+
+  /** Constructs a string reference object from a C string and a size. */
+  constexpr basic_string_view(const Char* s, size_t count) FMT_NOEXCEPT
+      : data_(s),
+        size_(count) {}
+
+  /**
+    \rst
+    Constructs a string reference object from a C string computing
+    the size with ``std::char_traits<Char>::length``.
+    \endrst
+   */
+#if __cplusplus >= 201703L  // C++17's char_traits::length() is constexpr.
+  FMT_CONSTEXPR
+#endif
+  basic_string_view(const Char* s)
+      : data_(s), size_(std::char_traits<Char>::length(s)) {}
+
+  /** Constructs a string reference from a ``std::basic_string`` object. */
+  template <typename Traits, typename Alloc>
+  FMT_CONSTEXPR basic_string_view(
+      const std::basic_string<Char, Traits, Alloc>& s) FMT_NOEXCEPT
+      : data_(s.data()),
+        size_(s.size()) {}
+
+  template <typename S, FMT_ENABLE_IF(std::is_same<
+                                      S, detail::std_string_view<Char>>::value)>
+  FMT_CONSTEXPR basic_string_view(S s) FMT_NOEXCEPT : data_(s.data()),
+                                                      size_(s.size()) {}
+
+  /** Returns a pointer to the string data. */
+  constexpr const Char* data() const { return data_; }
+
+  /** Returns the string size. */
+  constexpr size_t size() const { return size_; }
+
+  constexpr iterator begin() const { return data_; }
+  constexpr iterator end() const { return data_ + size_; }
+
+  constexpr const Char& operator[](size_t pos) const { return data_[pos]; }
+
+  FMT_CONSTEXPR void remove_prefix(size_t n) {
+    data_ += n;
+    size_ -= n;
+  }
+
+  // Lexicographically compare this string reference to other.
+  int compare(basic_string_view other) const {
+    size_t str_size = size_ < other.size_ ? size_ : other.size_;
+    int result = std::char_traits<Char>::compare(data_, other.data_, str_size);
+    if (result == 0)
+      result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1);
+    return result;
+  }
+
+  friend bool operator==(basic_string_view lhs, basic_string_view rhs) {
+    return lhs.compare(rhs) == 0;
+  }
+  friend bool operator!=(basic_string_view lhs, basic_string_view rhs) {
+    return lhs.compare(rhs) != 0;
+  }
+  friend bool operator<(basic_string_view lhs, basic_string_view rhs) {
+    return lhs.compare(rhs) < 0;
+  }
+  friend bool operator<=(basic_string_view lhs, basic_string_view rhs) {
+    return lhs.compare(rhs) <= 0;
+  }
+  friend bool operator>(basic_string_view lhs, basic_string_view rhs) {
+    return lhs.compare(rhs) > 0;
+  }
+  friend bool operator>=(basic_string_view lhs, basic_string_view rhs) {
+    return lhs.compare(rhs) >= 0;
+  }
+};
+
+using string_view = basic_string_view<char>;
+using wstring_view = basic_string_view<wchar_t>;
+
+/** Specifies if ``T`` is a character type. Can be specialized by users. */
+template <typename T> struct is_char : std::false_type {};
+template <> struct is_char<char> : std::true_type {};
+template <> struct is_char<wchar_t> : std::true_type {};
+template <> struct is_char<detail::char8_type> : std::true_type {};
+template <> struct is_char<char16_t> : std::true_type {};
+template <> struct is_char<char32_t> : std::true_type {};
+
+/**
+  \rst
+  Returns a string view of `s`. In order to add custom string type support to
+  {fmt} provide an overload of `to_string_view` for it in the same namespace as
+  the type for the argument-dependent lookup to work.
+
+  **Example**::
+
+    namespace my_ns {
+    inline string_view to_string_view(const my_string& s) {
+      return {s.data(), s.length()};
+    }
+    }
+    std::string message = fmt::format(my_string("The answer is {}"), 42);
+  \endrst
+ */
+template <typename Char, FMT_ENABLE_IF(is_char<Char>::value)>
+inline basic_string_view<Char> to_string_view(const Char* s) {
+  return s;
+}
+
+template <typename Char, typename Traits, typename Alloc>
+inline basic_string_view<Char> to_string_view(
+    const std::basic_string<Char, Traits, Alloc>& s) {
+  return s;
+}
+
+template <typename Char>
+inline basic_string_view<Char> to_string_view(basic_string_view<Char> s) {
+  return s;
+}
+
+template <typename Char,
+          FMT_ENABLE_IF(!std::is_empty<detail::std_string_view<Char>>::value)>
+inline basic_string_view<Char> to_string_view(detail::std_string_view<Char> s) {
+  return s;
+}
+
+// A base class for compile-time strings. It is defined in the fmt namespace to
+// make formatting functions visible via ADL, e.g. format(FMT_STRING("{}"), 42).
+struct compile_string {};
+
+template <typename S>
+struct is_compile_string : std::is_base_of<compile_string, S> {};
+
+template <typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
+constexpr basic_string_view<typename S::char_type> to_string_view(const S& s) {
+  return s;
+}
+
+namespace detail {
+void to_string_view(...);
+using fmt::v7::to_string_view;
+
+// Specifies whether S is a string type convertible to fmt::basic_string_view.
+// It should be a constexpr function but MSVC 2017 fails to compile it in
+// enable_if and MSVC 2015 fails to compile it as an alias template.
+template <typename S>
+struct is_string : std::is_class<decltype(to_string_view(std::declval<S>()))> {
+};
+
+template <typename S, typename = void> struct char_t_impl {};
+template <typename S> struct char_t_impl<S, enable_if_t<is_string<S>::value>> {
+  using result = decltype(to_string_view(std::declval<S>()));
+  using type = typename result::value_type;
+};
+
+// Reports a compile-time error if S is not a valid format string.
+template <typename..., typename S, FMT_ENABLE_IF(!is_compile_string<S>::value)>
+FMT_INLINE void check_format_string(const S&) {
+#ifdef FMT_ENFORCE_COMPILE_STRING
+  static_assert(is_compile_string<S>::value,
+                "FMT_ENFORCE_COMPILE_STRING requires all format strings to use "
+                "FMT_STRING.");
+#endif
+}
+template <typename..., typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
+void check_format_string(S);
+
+struct error_handler {
+  constexpr error_handler() = default;
+  constexpr error_handler(const error_handler&) = default;
+
+  // This function is intentionally not constexpr to give a compile-time error.
+  FMT_NORETURN FMT_API void on_error(const char* message);
+};
+}  // namespace detail
+
+/** String's character type. */
+template <typename S> using char_t = typename detail::char_t_impl<S>::type;
+
+/**
+  \rst
+  Parsing context consisting of a format string range being parsed and an
+  argument counter for automatic indexing.
+
+  You can use one of the following type aliases for common character types:
+
+  +-----------------------+-------------------------------------+
+  | Type                  | Definition                          |
+  +=======================+=====================================+
+  | format_parse_context  | basic_format_parse_context<char>    |
+  +-----------------------+-------------------------------------+
+  | wformat_parse_context | basic_format_parse_context<wchar_t> |
+  +-----------------------+-------------------------------------+
+  \endrst
+ */
+template <typename Char, typename ErrorHandler = detail::error_handler>
+class basic_format_parse_context : private ErrorHandler {
+ private:
+  basic_string_view<Char> format_str_;
+  int next_arg_id_;
+
+ public:
+  using char_type = Char;
+  using iterator = typename basic_string_view<Char>::iterator;
+
+  explicit constexpr basic_format_parse_context(
+      basic_string_view<Char> format_str, ErrorHandler eh = {},
+      int next_arg_id = 0)
+      : ErrorHandler(eh), format_str_(format_str), next_arg_id_(next_arg_id) {}
+
+  /**
+    Returns an iterator to the beginning of the format string range being
+    parsed.
+   */
+  constexpr iterator begin() const FMT_NOEXCEPT { return format_str_.begin(); }
+
+  /**
+    Returns an iterator past the end of the format string range being parsed.
+   */
+  constexpr iterator end() const FMT_NOEXCEPT { return format_str_.end(); }
+
+  /** Advances the begin iterator to ``it``. */
+  FMT_CONSTEXPR void advance_to(iterator it) {
+    format_str_.remove_prefix(detail::to_unsigned(it - begin()));
+  }
+
+  /**
+    Reports an error if using the manual argument indexing; otherwise returns
+    the next argument index and switches to the automatic indexing.
+   */
+  FMT_CONSTEXPR int next_arg_id() {
+    // Don't check if the argument id is valid to avoid overhead and because it
+    // will be checked during formatting anyway.
+    if (next_arg_id_ >= 0) return next_arg_id_++;
+    on_error("cannot switch from manual to automatic argument indexing");
+    return 0;
+  }
+
+  /**
+    Reports an error if using the automatic argument indexing; otherwise
+    switches to the manual indexing.
+   */
+  FMT_CONSTEXPR void check_arg_id(int) {
+    if (next_arg_id_ > 0)
+      on_error("cannot switch from automatic to manual argument indexing");
+    else
+      next_arg_id_ = -1;
+  }
+
+  FMT_CONSTEXPR void check_arg_id(basic_string_view<Char>) {}
+
+  FMT_CONSTEXPR void on_error(const char* message) {
+    ErrorHandler::on_error(message);
+  }
+
+  constexpr ErrorHandler error_handler() const { return *this; }
+};
+
+using format_parse_context = basic_format_parse_context<char>;
+using wformat_parse_context = basic_format_parse_context<wchar_t>;
+
+template <typename Context> class basic_format_arg;
+template <typename Context> class basic_format_args;
+template <typename Context> class dynamic_format_arg_store;
+
+// A formatter for objects of type T.
+template <typename T, typename Char = char, typename Enable = void>
+struct formatter {
+  // A deleted default constructor indicates a disabled formatter.
+  formatter() = delete;
+};
+
+// Specifies if T has an enabled formatter specialization. A type can be
+// formattable even if it doesn't have a formatter e.g. via a conversion.
+template <typename T, typename Context>
+using has_formatter =
+    std::is_constructible<typename Context::template formatter_type<T>>;
+
+// Checks whether T is a container with contiguous storage.
+template <typename T> struct is_contiguous : std::false_type {};
+template <typename Char>
+struct is_contiguous<std::basic_string<Char>> : std::true_type {};
+
+namespace detail {
+
+// Extracts a reference to the container from back_insert_iterator.
+template <typename Container>
+inline Container& get_container(std::back_insert_iterator<Container> it) {
+  using bi_iterator = std::back_insert_iterator<Container>;
+  struct accessor : bi_iterator {
+    accessor(bi_iterator iter) : bi_iterator(iter) {}
+    using bi_iterator::container;
+  };
+  return *accessor(it).container;
+}
+
+/**
+  \rst
+  A contiguous memory buffer with an optional growing ability. It is an internal
+  class and shouldn't be used directly, only via `~fmt::basic_memory_buffer`.
+  \endrst
+ */
+template <typename T> class buffer {
+ private:
+  T* ptr_;
+  size_t size_;
+  size_t capacity_;
+
+ protected:
+  // Don't initialize ptr_ since it is not accessed to save a few cycles.
+  FMT_SUPPRESS_MSC_WARNING(26495)
+  buffer(size_t sz) FMT_NOEXCEPT : size_(sz), capacity_(sz) {}
+
+  buffer(T* p = nullptr, size_t sz = 0, size_t cap = 0) FMT_NOEXCEPT
+      : ptr_(p),
+        size_(sz),
+        capacity_(cap) {}
+
+  ~buffer() = default;
+
+  /** Sets the buffer data and capacity. */
+  void set(T* buf_data, size_t buf_capacity) FMT_NOEXCEPT {
+    ptr_ = buf_data;
+    capacity_ = buf_capacity;
+  }
+
+  /** Increases the buffer capacity to hold at least *capacity* elements. */
+  virtual void grow(size_t capacity) = 0;
+
+ public:
+  using value_type = T;
+  using const_reference = const T&;
+
+  buffer(const buffer&) = delete;
+  void operator=(const buffer&) = delete;
+
+  T* begin() FMT_NOEXCEPT { return ptr_; }
+  T* end() FMT_NOEXCEPT { return ptr_ + size_; }
+
+  const T* begin() const FMT_NOEXCEPT { return ptr_; }
+  const T* end() const FMT_NOEXCEPT { return ptr_ + size_; }
+
+  /** Returns the size of this buffer. */
+  size_t size() const FMT_NOEXCEPT { return size_; }
+
+  /** Returns the capacity of this buffer. */
+  size_t capacity() const FMT_NOEXCEPT { return capacity_; }
+
+  /** Returns a pointer to the buffer data. */
+  T* data() FMT_NOEXCEPT { return ptr_; }
+
+  /** Returns a pointer to the buffer data. */
+  const T* data() const FMT_NOEXCEPT { return ptr_; }
+
+  /** Clears this buffer. */
+  void clear() { size_ = 0; }
+
+  // Tries resizing the buffer to contain *count* elements. If T is a POD type
+  // the new elements may not be initialized.
+  void try_resize(size_t count) {
+    try_reserve(count);
+    size_ = count <= capacity_ ? count : capacity_;
+  }
+
+  // Tries increasing the buffer capacity to *new_capacity*. It can increase the
+  // capacity by a smaller amount than requested but guarantees there is space
+  // for at least one additional element either by increasing the capacity or by
+  // flushing the buffer if it is full.
+  void try_reserve(size_t new_capacity) {
+    if (new_capacity > capacity_) grow(new_capacity);
+  }
+
+  void push_back(const T& value) {
+    try_reserve(size_ + 1);
+    ptr_[size_++] = value;
+  }
+
+  /** Appends data to the end of the buffer. */
+  template <typename U> void append(const U* begin, const U* end);
+
+  template <typename I> T& operator[](I index) { return ptr_[index]; }
+  template <typename I> const T& operator[](I index) const {
+    return ptr_[index];
+  }
+};
+
+struct buffer_traits {
+  explicit buffer_traits(size_t) {}
+  size_t count() const { return 0; }
+  size_t limit(size_t size) { return size; }
+};
+
+class fixed_buffer_traits {
+ private:
+  size_t count_ = 0;
+  size_t limit_;
+
+ public:
+  explicit fixed_buffer_traits(size_t limit) : limit_(limit) {}
+  size_t count() const { return count_; }
+  size_t limit(size_t size) {
+    size_t n = limit_ > count_ ? limit_ - count_ : 0;
+    count_ += size;
+    return size < n ? size : n;
+  }
+};
+
+// A buffer that writes to an output iterator when flushed.
+template <typename OutputIt, typename T, typename Traits = buffer_traits>
+class iterator_buffer final : public Traits, public buffer<T> {
+ private:
+  OutputIt out_;
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+
+ protected:
+  void grow(size_t) final FMT_OVERRIDE {
+    if (this->size() == buffer_size) flush();
+  }
+  void flush();
+
+ public:
+  explicit iterator_buffer(OutputIt out, size_t n = buffer_size)
+      : Traits(n),
+        buffer<T>(data_, 0, buffer_size),
+        out_(out) {}
+  ~iterator_buffer() { flush(); }
+
+  OutputIt out() {
+    flush();
+    return out_;
+  }
+  size_t count() const { return Traits::count() + this->size(); }
+};
+
+template <typename T> class iterator_buffer<T*, T> final : public buffer<T> {
+ protected:
+  void grow(size_t) final FMT_OVERRIDE {}
+
+ public:
+  explicit iterator_buffer(T* out, size_t = 0) : buffer<T>(out, 0, ~size_t()) {}
+
+  T* out() { return &*this->end(); }
+};
+
+// A buffer that writes to a container with the contiguous storage.
+template <typename Container>
+class iterator_buffer<std::back_insert_iterator<Container>,
+                      enable_if_t<is_contiguous<Container>::value,
+                                  typename Container::value_type>>
+    final : public buffer<typename Container::value_type> {
+ private:
+  Container& container_;
+
+ protected:
+  void grow(size_t capacity) final FMT_OVERRIDE {
+    container_.resize(capacity);
+    this->set(&container_[0], capacity);
+  }
+
+ public:
+  explicit iterator_buffer(Container& c)
+      : buffer<typename Container::value_type>(c.size()), container_(c) {}
+  explicit iterator_buffer(std::back_insert_iterator<Container> out, size_t = 0)
+      : iterator_buffer(get_container(out)) {}
+  std::back_insert_iterator<Container> out() {
+    return std::back_inserter(container_);
+  }
+};
+
+// A buffer that counts the number of code units written discarding the output.
+template <typename T = char> class counting_buffer final : public buffer<T> {
+ private:
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+  size_t count_ = 0;
+
+ protected:
+  void grow(size_t) final FMT_OVERRIDE {
+    if (this->size() != buffer_size) return;
+    count_ += this->size();
+    this->clear();
+  }
+
+ public:
+  counting_buffer() : buffer<T>(data_, 0, buffer_size) {}
+
+  size_t count() { return count_ + this->size(); }
+};
+
+// An output iterator that appends to the buffer.
+// It is used to reduce symbol sizes for the common case.
+template <typename T>
+class buffer_appender : public std::back_insert_iterator<buffer<T>> {
+  using base = std::back_insert_iterator<buffer<T>>;
+
+ public:
+  explicit buffer_appender(buffer<T>& buf) : base(buf) {}
+  buffer_appender(base it) : base(it) {}
+
+  buffer_appender& operator++() {
+    base::operator++();
+    return *this;
+  }
+
+  buffer_appender operator++(int) {
+    buffer_appender tmp = *this;
+    ++*this;
+    return tmp;
+  }
+};
+
+// Maps an output iterator into a buffer.
+template <typename T, typename OutputIt>
+iterator_buffer<OutputIt, T> get_buffer(OutputIt);
+template <typename T> buffer<T>& get_buffer(buffer_appender<T>);
+
+template <typename OutputIt> OutputIt get_buffer_init(OutputIt out) {
+  return out;
+}
+template <typename T> buffer<T>& get_buffer_init(buffer_appender<T> out) {
+  return get_container(out);
+}
+
+template <typename Buffer>
+auto get_iterator(Buffer& buf) -> decltype(buf.out()) {
+  return buf.out();
+}
+template <typename T> buffer_appender<T> get_iterator(buffer<T>& buf) {
+  return buffer_appender<T>(buf);
+}
+
+template <typename T, typename Char = char, typename Enable = void>
+struct fallback_formatter {
+  fallback_formatter() = delete;
+};
+
+// Specifies if T has an enabled fallback_formatter specialization.
+template <typename T, typename Context>
+using has_fallback_formatter =
+    std::is_constructible<fallback_formatter<T, typename Context::char_type>>;
+
+struct view {};
+
+template <typename Char, typename T> struct named_arg : view {
+  const Char* name;
+  const T& value;
+  named_arg(const Char* n, const T& v) : name(n), value(v) {}
+};
+
+template <typename Char> struct named_arg_info {
+  const Char* name;
+  int id;
+};
+
+template <typename T, typename Char, size_t NUM_ARGS, size_t NUM_NAMED_ARGS>
+struct arg_data {
+  // args_[0].named_args points to named_args_ to avoid bloating format_args.
+  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
+  T args_[1 + (NUM_ARGS != 0 ? NUM_ARGS : +1)];
+  named_arg_info<Char> named_args_[NUM_NAMED_ARGS];
+
+  template <typename... U>
+  arg_data(const U&... init) : args_{T(named_args_, NUM_NAMED_ARGS), init...} {}
+  arg_data(const arg_data& other) = delete;
+  const T* args() const { return args_ + 1; }
+  named_arg_info<Char>* named_args() { return named_args_; }
+};
+
+template <typename T, typename Char, size_t NUM_ARGS>
+struct arg_data<T, Char, NUM_ARGS, 0> {
+  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
+  T args_[NUM_ARGS != 0 ? NUM_ARGS : +1];
+
+  template <typename... U>
+  FMT_INLINE arg_data(const U&... init) : args_{init...} {}
+  FMT_INLINE const T* args() const { return args_; }
+  FMT_INLINE std::nullptr_t named_args() { return nullptr; }
+};
+
+template <typename Char>
+inline void init_named_args(named_arg_info<Char>*, int, int) {}
+
+template <typename Char, typename T, typename... Tail>
+void init_named_args(named_arg_info<Char>* named_args, int arg_count,
+                     int named_arg_count, const T&, const Tail&... args) {
+  init_named_args(named_args, arg_count + 1, named_arg_count, args...);
+}
+
+template <typename Char, typename T, typename... Tail>
+void init_named_args(named_arg_info<Char>* named_args, int arg_count,
+                     int named_arg_count, const named_arg<Char, T>& arg,
+                     const Tail&... args) {
+  named_args[named_arg_count++] = {arg.name, arg_count};
+  init_named_args(named_args, arg_count + 1, named_arg_count, args...);
+}
+
+template <typename... Args>
+FMT_INLINE void init_named_args(std::nullptr_t, int, int, const Args&...) {}
+
+template <typename T> struct is_named_arg : std::false_type {};
+
+template <typename T, typename Char>
+struct is_named_arg<named_arg<Char, T>> : std::true_type {};
+
+template <bool B = false> constexpr size_t count() { return B ? 1 : 0; }
+template <bool B1, bool B2, bool... Tail> constexpr size_t count() {
+  return (B1 ? 1 : 0) + count<B2, Tail...>();
+}
+
+template <typename... Args> constexpr size_t count_named_args() {
+  return count<is_named_arg<Args>::value...>();
+}
+
+enum class type {
+  none_type,
+  // Integer types should go first,
+  int_type,
+  uint_type,
+  long_long_type,
+  ulong_long_type,
+  int128_type,
+  uint128_type,
+  bool_type,
+  char_type,
+  last_integer_type = char_type,
+  // followed by floating-point types.
+  float_type,
+  double_type,
+  long_double_type,
+  last_numeric_type = long_double_type,
+  cstring_type,
+  string_type,
+  pointer_type,
+  custom_type
+};
+
+// Maps core type T to the corresponding type enum constant.
+template <typename T, typename Char>
+struct type_constant : std::integral_constant<type, type::custom_type> {};
+
+#define FMT_TYPE_CONSTANT(Type, constant) \
+  template <typename Char>                \
+  struct type_constant<Type, Char>        \
+      : std::integral_constant<type, type::constant> {}
+
+FMT_TYPE_CONSTANT(int, int_type);
+FMT_TYPE_CONSTANT(unsigned, uint_type);
+FMT_TYPE_CONSTANT(long long, long_long_type);
+FMT_TYPE_CONSTANT(unsigned long long, ulong_long_type);
+FMT_TYPE_CONSTANT(int128_t, int128_type);
+FMT_TYPE_CONSTANT(uint128_t, uint128_type);
+FMT_TYPE_CONSTANT(bool, bool_type);
+FMT_TYPE_CONSTANT(Char, char_type);
+FMT_TYPE_CONSTANT(float, float_type);
+FMT_TYPE_CONSTANT(double, double_type);
+FMT_TYPE_CONSTANT(long double, long_double_type);
+FMT_TYPE_CONSTANT(const Char*, cstring_type);
+FMT_TYPE_CONSTANT(basic_string_view<Char>, string_type);
+FMT_TYPE_CONSTANT(const void*, pointer_type);
+
+constexpr bool is_integral_type(type t) {
+  return t > type::none_type && t <= type::last_integer_type;
+}
+
+constexpr bool is_arithmetic_type(type t) {
+  return t > type::none_type && t <= type::last_numeric_type;
+}
+
+template <typename Char> struct string_value {
+  const Char* data;
+  size_t size;
+};
+
+template <typename Char> struct named_arg_value {
+  const named_arg_info<Char>* data;
+  size_t size;
+};
+
+template <typename Context> struct custom_value {
+  using parse_context = typename Context::parse_context_type;
+  const void* value;
+  void (*format)(const void* arg, parse_context& parse_ctx, Context& ctx);
+};
+
+// A formatting argument value.
+template <typename Context> class value {
+ public:
+  using char_type = typename Context::char_type;
+
+  union {
+    int int_value;
+    unsigned uint_value;
+    long long long_long_value;
+    unsigned long long ulong_long_value;
+    int128_t int128_value;
+    uint128_t uint128_value;
+    bool bool_value;
+    char_type char_value;
+    float float_value;
+    double double_value;
+    long double long_double_value;
+    const void* pointer;
+    string_value<char_type> string;
+    custom_value<Context> custom;
+    named_arg_value<char_type> named_args;
+  };
+
+  constexpr FMT_INLINE value(int val = 0) : int_value(val) {}
+  constexpr FMT_INLINE value(unsigned val) : uint_value(val) {}
+  FMT_INLINE value(long long val) : long_long_value(val) {}
+  FMT_INLINE value(unsigned long long val) : ulong_long_value(val) {}
+  FMT_INLINE value(int128_t val) : int128_value(val) {}
+  FMT_INLINE value(uint128_t val) : uint128_value(val) {}
+  FMT_INLINE value(float val) : float_value(val) {}
+  FMT_INLINE value(double val) : double_value(val) {}
+  FMT_INLINE value(long double val) : long_double_value(val) {}
+  FMT_INLINE value(bool val) : bool_value(val) {}
+  FMT_INLINE value(char_type val) : char_value(val) {}
+  FMT_INLINE value(const char_type* val) { string.data = val; }
+  FMT_INLINE value(basic_string_view<char_type> val) {
+    string.data = val.data();
+    string.size = val.size();
+  }
+  FMT_INLINE value(const void* val) : pointer(val) {}
+  FMT_INLINE value(const named_arg_info<char_type>* args, size_t size)
+      : named_args{args, size} {}
+
+  template <typename T> FMT_INLINE value(const T& val) {
+    custom.value = &val;
+    // Get the formatter type through the context to allow different contexts
+    // have different extension points, e.g. `formatter<T>` for `format` and
+    // `printf_formatter<T>` for `printf`.
+    custom.format = format_custom_arg<
+        T, conditional_t<has_formatter<T, Context>::value,
+                         typename Context::template formatter_type<T>,
+                         fallback_formatter<T, char_type>>>;
+  }
+
+ private:
+  // Formats an argument of a custom type, such as a user-defined class.
+  template <typename T, typename Formatter>
+  static void format_custom_arg(const void* arg,
+                                typename Context::parse_context_type& parse_ctx,
+                                Context& ctx) {
+    Formatter f;
+    parse_ctx.advance_to(f.parse(parse_ctx));
+    ctx.advance_to(f.format(*static_cast<const T*>(arg), ctx));
+  }
+};
+
+template <typename Context, typename T>
+FMT_CONSTEXPR basic_format_arg<Context> make_arg(const T& value);
+
+// To minimize the number of types we need to deal with, long is translated
+// either to int or to long long depending on its size.
+enum { long_short = sizeof(long) == sizeof(int) };
+using long_type = conditional_t<long_short, int, long long>;
+using ulong_type = conditional_t<long_short, unsigned, unsigned long long>;
+
+struct unformattable {};
+
+// Maps formatting arguments to core types.
+template <typename Context> struct arg_mapper {
+  using char_type = typename Context::char_type;
+
+  FMT_CONSTEXPR int map(signed char val) { return val; }
+  FMT_CONSTEXPR unsigned map(unsigned char val) { return val; }
+  FMT_CONSTEXPR int map(short val) { return val; }
+  FMT_CONSTEXPR unsigned map(unsigned short val) { return val; }
+  FMT_CONSTEXPR int map(int val) { return val; }
+  FMT_CONSTEXPR unsigned map(unsigned val) { return val; }
+  FMT_CONSTEXPR long_type map(long val) { return val; }
+  FMT_CONSTEXPR ulong_type map(unsigned long val) { return val; }
+  FMT_CONSTEXPR long long map(long long val) { return val; }
+  FMT_CONSTEXPR unsigned long long map(unsigned long long val) { return val; }
+  FMT_CONSTEXPR int128_t map(int128_t val) { return val; }
+  FMT_CONSTEXPR uint128_t map(uint128_t val) { return val; }
+  FMT_CONSTEXPR bool map(bool val) { return val; }
+
+  template <typename T, FMT_ENABLE_IF(is_char<T>::value)>
+  FMT_CONSTEXPR char_type map(T val) {
+    static_assert(
+        std::is_same<T, char>::value || std::is_same<T, char_type>::value,
+        "mixing character types is disallowed");
+    return val;
+  }
+
+  FMT_CONSTEXPR float map(float val) { return val; }
+  FMT_CONSTEXPR double map(double val) { return val; }
+  FMT_CONSTEXPR long double map(long double val) { return val; }
+
+  FMT_CONSTEXPR const char_type* map(char_type* val) { return val; }
+  FMT_CONSTEXPR const char_type* map(const char_type* val) { return val; }
+  template <typename T, FMT_ENABLE_IF(is_string<T>::value)>
+  FMT_CONSTEXPR basic_string_view<char_type> map(const T& val) {
+    static_assert(std::is_same<char_type, char_t<T>>::value,
+                  "mixing character types is disallowed");
+    return to_string_view(val);
+  }
+  template <typename T,
+            FMT_ENABLE_IF(
+                std::is_constructible<basic_string_view<char_type>, T>::value &&
+                !is_string<T>::value && !has_formatter<T, Context>::value &&
+                !has_fallback_formatter<T, Context>::value)>
+  FMT_CONSTEXPR basic_string_view<char_type> map(const T& val) {
+    return basic_string_view<char_type>(val);
+  }
+  template <
+      typename T,
+      FMT_ENABLE_IF(
+          std::is_constructible<std_string_view<char_type>, T>::value &&
+          !std::is_constructible<basic_string_view<char_type>, T>::value &&
+          !is_string<T>::value && !has_formatter<T, Context>::value &&
+          !has_fallback_formatter<T, Context>::value)>
+  FMT_CONSTEXPR basic_string_view<char_type> map(const T& val) {
+    return std_string_view<char_type>(val);
+  }
+  FMT_CONSTEXPR const char* map(const signed char* val) {
+    static_assert(std::is_same<char_type, char>::value, "invalid string type");
+    return reinterpret_cast<const char*>(val);
+  }
+  FMT_CONSTEXPR const char* map(const unsigned char* val) {
+    static_assert(std::is_same<char_type, char>::value, "invalid string type");
+    return reinterpret_cast<const char*>(val);
+  }
+  FMT_CONSTEXPR const char* map(signed char* val) {
+    const auto* const_val = val;
+    return map(const_val);
+  }
+  FMT_CONSTEXPR const char* map(unsigned char* val) {
+    const auto* const_val = val;
+    return map(const_val);
+  }
+
+  FMT_CONSTEXPR const void* map(void* val) { return val; }
+  FMT_CONSTEXPR const void* map(const void* val) { return val; }
+  FMT_CONSTEXPR const void* map(std::nullptr_t val) { return val; }
+  template <typename T> FMT_CONSTEXPR int map(const T*) {
+    // Formatting of arbitrary pointers is disallowed. If you want to output
+    // a pointer cast it to "void *" or "const void *". In particular, this
+    // forbids formatting of "[const] volatile char *" which is printed as bool
+    // by iostreams.
+    static_assert(!sizeof(T), "formatting of non-void pointers is disallowed");
+    return 0;
+  }
+
+  template <typename T,
+            FMT_ENABLE_IF(std::is_enum<T>::value &&
+                          !has_formatter<T, Context>::value &&
+                          !has_fallback_formatter<T, Context>::value)>
+  FMT_CONSTEXPR auto map(const T& val)
+      -> decltype(std::declval<arg_mapper>().map(
+          static_cast<typename std::underlying_type<T>::type>(val))) {
+    return map(static_cast<typename std::underlying_type<T>::type>(val));
+  }
+  template <typename T,
+            FMT_ENABLE_IF(!is_string<T>::value && !is_char<T>::value &&
+                          (has_formatter<T, Context>::value ||
+                           has_fallback_formatter<T, Context>::value))>
+  FMT_CONSTEXPR const T& map(const T& val) {
+    return val;
+  }
+
+  template <typename T>
+  FMT_CONSTEXPR auto map(const named_arg<char_type, T>& val)
+      -> decltype(std::declval<arg_mapper>().map(val.value)) {
+    return map(val.value);
+  }
+
+  unformattable map(...) { return {}; }
+};
+
+// A type constant after applying arg_mapper<Context>.
+template <typename T, typename Context>
+using mapped_type_constant =
+    type_constant<decltype(arg_mapper<Context>().map(std::declval<const T&>())),
+                  typename Context::char_type>;
+
+enum { packed_arg_bits = 4 };
+// Maximum number of arguments with packed types.
+enum { max_packed_args = 62 / packed_arg_bits };
+enum : unsigned long long { is_unpacked_bit = 1ULL << 63 };
+enum : unsigned long long { has_named_args_bit = 1ULL << 62 };
+}  // namespace detail
+
+// A formatting argument. It is a trivially copyable/constructible type to
+// allow storage in basic_memory_buffer.
+template <typename Context> class basic_format_arg {
+ private:
+  detail::value<Context> value_;
+  detail::type type_;
+
+  template <typename ContextType, typename T>
+  friend FMT_CONSTEXPR basic_format_arg<ContextType> detail::make_arg(
+      const T& value);
+
+  template <typename Visitor, typename Ctx>
+  friend FMT_CONSTEXPR auto visit_format_arg(Visitor&& vis,
+                                             const basic_format_arg<Ctx>& arg)
+      -> decltype(vis(0));
+
+  friend class basic_format_args<Context>;
+  friend class dynamic_format_arg_store<Context>;
+
+  using char_type = typename Context::char_type;
+
+  template <typename T, typename Char, size_t NUM_ARGS, size_t NUM_NAMED_ARGS>
+  friend struct detail::arg_data;
+
+  basic_format_arg(const detail::named_arg_info<char_type>* args, size_t size)
+      : value_(args, size) {}
+
+ public:
+  class handle {
+   public:
+    explicit handle(detail::custom_value<Context> custom) : custom_(custom) {}
+
+    void format(typename Context::parse_context_type& parse_ctx,
+                Context& ctx) const {
+      custom_.format(custom_.value, parse_ctx, ctx);
+    }
+
+   private:
+    detail::custom_value<Context> custom_;
+  };
+
+  constexpr basic_format_arg() : type_(detail::type::none_type) {}
+
+  constexpr explicit operator bool() const FMT_NOEXCEPT {
+    return type_ != detail::type::none_type;
+  }
+
+  detail::type type() const { return type_; }
+
+  bool is_integral() const { return detail::is_integral_type(type_); }
+  bool is_arithmetic() const { return detail::is_arithmetic_type(type_); }
+};
+
+/**
+  \rst
+  Visits an argument dispatching to the appropriate visit method based on
+  the argument type. For example, if the argument type is ``double`` then
+  ``vis(value)`` will be called with the value of type ``double``.
+  \endrst
+ */
+template <typename Visitor, typename Context>
+FMT_CONSTEXPR_DECL FMT_INLINE auto visit_format_arg(
+    Visitor&& vis, const basic_format_arg<Context>& arg) -> decltype(vis(0)) {
+  using char_type = typename Context::char_type;
+  switch (arg.type_) {
+  case detail::type::none_type:
+    break;
+  case detail::type::int_type:
+    return vis(arg.value_.int_value);
+  case detail::type::uint_type:
+    return vis(arg.value_.uint_value);
+  case detail::type::long_long_type:
+    return vis(arg.value_.long_long_value);
+  case detail::type::ulong_long_type:
+    return vis(arg.value_.ulong_long_value);
+#if FMT_USE_INT128
+  case detail::type::int128_type:
+    return vis(arg.value_.int128_value);
+  case detail::type::uint128_type:
+    return vis(arg.value_.uint128_value);
+#else
+  case detail::type::int128_type:
+  case detail::type::uint128_type:
+    break;
+#endif
+  case detail::type::bool_type:
+    return vis(arg.value_.bool_value);
+  case detail::type::char_type:
+    return vis(arg.value_.char_value);
+  case detail::type::float_type:
+    return vis(arg.value_.float_value);
+  case detail::type::double_type:
+    return vis(arg.value_.double_value);
+  case detail::type::long_double_type:
+    return vis(arg.value_.long_double_value);
+  case detail::type::cstring_type:
+    return vis(arg.value_.string.data);
+  case detail::type::string_type:
+    return vis(basic_string_view<char_type>(arg.value_.string.data,
+                                            arg.value_.string.size));
+  case detail::type::pointer_type:
+    return vis(arg.value_.pointer);
+  case detail::type::custom_type:
+    return vis(typename basic_format_arg<Context>::handle(arg.value_.custom));
+  }
+  return vis(monostate());
+}
+
+template <typename T> struct formattable : std::false_type {};
+
+namespace detail {
+
+// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
+template <typename... Ts> struct void_t_impl { using type = void; };
+template <typename... Ts>
+using void_t = typename detail::void_t_impl<Ts...>::type;
+
+template <typename It, typename T, typename Enable = void>
+struct is_output_iterator : std::false_type {};
+
+template <typename It, typename T>
+struct is_output_iterator<
+    It, T,
+    void_t<typename std::iterator_traits<It>::iterator_category,
+           decltype(*std::declval<It>() = std::declval<T>())>>
+    : std::true_type {};
+
+template <typename OutputIt>
+struct is_back_insert_iterator : std::false_type {};
+template <typename Container>
+struct is_back_insert_iterator<std::back_insert_iterator<Container>>
+    : std::true_type {};
+
+template <typename OutputIt>
+struct is_contiguous_back_insert_iterator : std::false_type {};
+template <typename Container>
+struct is_contiguous_back_insert_iterator<std::back_insert_iterator<Container>>
+    : is_contiguous<Container> {};
+template <typename Char>
+struct is_contiguous_back_insert_iterator<buffer_appender<Char>>
+    : std::true_type {};
+
+// A type-erased reference to an std::locale to avoid heavy <locale> include.
+class locale_ref {
+ private:
+  const void* locale_;  // A type-erased pointer to std::locale.
+
+ public:
+  locale_ref() : locale_(nullptr) {}
+  template <typename Locale> explicit locale_ref(const Locale& loc);
+
+  explicit operator bool() const FMT_NOEXCEPT { return locale_ != nullptr; }
+
+  template <typename Locale> Locale get() const;
+};
+
+template <typename> constexpr unsigned long long encode_types() { return 0; }
+
+template <typename Context, typename Arg, typename... Args>
+constexpr unsigned long long encode_types() {
+  return static_cast<unsigned>(mapped_type_constant<Arg, Context>::value) |
+         (encode_types<Context, Args...>() << packed_arg_bits);
+}
+
+template <typename Context, typename T>
+FMT_CONSTEXPR basic_format_arg<Context> make_arg(const T& value) {
+  basic_format_arg<Context> arg;
+  arg.type_ = mapped_type_constant<T, Context>::value;
+  arg.value_ = arg_mapper<Context>().map(value);
+  return arg;
+}
+
+template <typename T> int check(unformattable) {
+  static_assert(
+      formattable<T>(),
+      "Cannot format an argument. To make type T formattable provide a "
+      "formatter<T> specialization: https://fmt.dev/latest/api.html#udt");
+  return 0;
+}
+template <typename T, typename U> inline const U& check(const U& val) {
+  return val;
+}
+
+// The type template parameter is there to avoid an ODR violation when using
+// a fallback formatter in one translation unit and an implicit conversion in
+// another (not recommended).
+template <bool IS_PACKED, typename Context, type, typename T,
+          FMT_ENABLE_IF(IS_PACKED)>
+inline value<Context> make_arg(const T& val) {
+  return check<T>(arg_mapper<Context>().map(val));
+}
+
+template <bool IS_PACKED, typename Context, type, typename T,
+          FMT_ENABLE_IF(!IS_PACKED)>
+inline basic_format_arg<Context> make_arg(const T& value) {
+  return make_arg<Context>(value);
+}
+
+template <typename T> struct is_reference_wrapper : std::false_type {};
+template <typename T>
+struct is_reference_wrapper<std::reference_wrapper<T>> : std::true_type {};
+
+template <typename T> const T& unwrap(const T& v) { return v; }
+template <typename T> const T& unwrap(const std::reference_wrapper<T>& v) {
+  return static_cast<const T&>(v);
+}
+
+class dynamic_arg_list {
+  // Workaround for clang's -Wweak-vtables. Unlike for regular classes, for
+  // templates it doesn't complain about inability to deduce single translation
+  // unit for placing vtable. So storage_node_base is made a fake template.
+  template <typename = void> struct node {
+    virtual ~node() = default;
+    std::unique_ptr<node<>> next;
+  };
+
+  template <typename T> struct typed_node : node<> {
+    T value;
+
+    template <typename Arg>
+    FMT_CONSTEXPR typed_node(const Arg& arg) : value(arg) {}
+
+    template <typename Char>
+    FMT_CONSTEXPR typed_node(const basic_string_view<Char>& arg)
+        : value(arg.data(), arg.size()) {}
+  };
+
+  std::unique_ptr<node<>> head_;
+
+ public:
+  template <typename T, typename Arg> const T& push(const Arg& arg) {
+    auto new_node = std::unique_ptr<typed_node<T>>(new typed_node<T>(arg));
+    auto& value = new_node->value;
+    new_node->next = std::move(head_);
+    head_ = std::move(new_node);
+    return value;
+  }
+};
+}  // namespace detail
+
+// Formatting context.
+template <typename OutputIt, typename Char> class basic_format_context {
+ public:
+  /** The character type for the output. */
+  using char_type = Char;
+
+ private:
+  OutputIt out_;
+  basic_format_args<basic_format_context> args_;
+  detail::locale_ref loc_;
+
+ public:
+  using iterator = OutputIt;
+  using format_arg = basic_format_arg<basic_format_context>;
+  using parse_context_type = basic_format_parse_context<Char>;
+  template <typename T> using formatter_type = formatter<T, char_type>;
+
+  basic_format_context(const basic_format_context&) = delete;
+  void operator=(const basic_format_context&) = delete;
+  /**
+   Constructs a ``basic_format_context`` object. References to the arguments are
+   stored in the object so make sure they have appropriate lifetimes.
+   */
+  basic_format_context(OutputIt out,
+                       basic_format_args<basic_format_context> ctx_args,
+                       detail::locale_ref loc = detail::locale_ref())
+      : out_(out), args_(ctx_args), loc_(loc) {}
+
+  format_arg arg(int id) const { return args_.get(id); }
+  format_arg arg(basic_string_view<char_type> name) { return args_.get(name); }
+  int arg_id(basic_string_view<char_type> name) { return args_.get_id(name); }
+  const basic_format_args<basic_format_context>& args() const { return args_; }
+
+  detail::error_handler error_handler() { return {}; }
+  void on_error(const char* message) { error_handler().on_error(message); }
+
+  // Returns an iterator to the beginning of the output range.
+  iterator out() { return out_; }
+
+  // Advances the begin iterator to ``it``.
+  void advance_to(iterator it) {
+    if (!detail::is_back_insert_iterator<iterator>()) out_ = it;
+  }
+
+  detail::locale_ref locale() { return loc_; }
+};
+
+template <typename Char>
+using buffer_context =
+    basic_format_context<detail::buffer_appender<Char>, Char>;
+using format_context = buffer_context<char>;
+using wformat_context = buffer_context<wchar_t>;
+
+// Workaround an alias issue: https://stackoverflow.com/q/62767544/471164.
+#define FMT_BUFFER_CONTEXT(Char) \
+  basic_format_context<detail::buffer_appender<Char>, Char>
+
+/**
+  \rst
+  An array of references to arguments. It can be implicitly converted into
+  `~fmt::basic_format_args` for passing into type-erased formatting functions
+  such as `~fmt::vformat`.
+  \endrst
+ */
+template <typename Context, typename... Args>
+class format_arg_store
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+    // Workaround a GCC template argument substitution bug.
+    : public basic_format_args<Context>
+#endif
+{
+ private:
+  static const size_t num_args = sizeof...(Args);
+  static const size_t num_named_args = detail::count_named_args<Args...>();
+  static const bool is_packed = num_args <= detail::max_packed_args;
+
+  using value_type = conditional_t<is_packed, detail::value<Context>,
+                                   basic_format_arg<Context>>;
+
+  detail::arg_data<value_type, typename Context::char_type, num_args,
+                   num_named_args>
+      data_;
+
+  friend class basic_format_args<Context>;
+
+  static constexpr unsigned long long desc =
+      (is_packed ? detail::encode_types<Context, Args...>()
+                 : detail::is_unpacked_bit | num_args) |
+      (num_named_args != 0
+           ? static_cast<unsigned long long>(detail::has_named_args_bit)
+           : 0);
+
+ public:
+  format_arg_store(const Args&... args)
+      :
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+        basic_format_args<Context>(*this),
+#endif
+        data_{detail::make_arg<
+            is_packed, Context,
+            detail::mapped_type_constant<Args, Context>::value>(args)...} {
+    detail::init_named_args(data_.named_args(), 0, 0, args...);
+  }
+};
+
+/**
+  \rst
+  Constructs a `~fmt::format_arg_store` object that contains references to
+  arguments and can be implicitly converted to `~fmt::format_args`. `Context`
+  can be omitted in which case it defaults to `~fmt::context`.
+  See `~fmt::arg` for lifetime considerations.
+  \endrst
+ */
+template <typename Context = format_context, typename... Args>
+inline format_arg_store<Context, Args...> make_format_args(
+    const Args&... args) {
+  return {args...};
+}
+
+/**
+  \rst
+  Constructs a `~fmt::format_arg_store` object that contains references
+  to arguments and can be implicitly converted to `~fmt::format_args`.
+  If ``format_str`` is a compile-time string then `make_args_checked` checks
+  its validity at compile time.
+  \endrst
+ */
+template <typename... Args, typename S, typename Char = char_t<S>>
+inline auto make_args_checked(const S& format_str,
+                              const remove_reference_t<Args>&... args)
+    -> format_arg_store<buffer_context<Char>, remove_reference_t<Args>...> {
+  static_assert(
+      detail::count<(
+              std::is_base_of<detail::view, remove_reference_t<Args>>::value &&
+              std::is_reference<Args>::value)...>() == 0,
+      "passing views as lvalues is disallowed");
+  detail::check_format_string<Args...>(format_str);
+  return {args...};
+}
+
+/**
+  \rst
+  Returns a named argument to be used in a formatting function. It should only
+  be used in a call to a formatting function.
+
+  **Example**::
+
+    fmt::print("Elapsed time: {s:.2f} seconds", fmt::arg("s", 1.23));
+  \endrst
+ */
+template <typename Char, typename T>
+inline detail::named_arg<Char, T> arg(const Char* name, const T& arg) {
+  static_assert(!detail::is_named_arg<T>(), "nested named arguments");
+  return {name, arg};
+}
+
+/**
+  \rst
+  A dynamic version of `fmt::format_arg_store`.
+  It's equipped with a storage to potentially temporary objects which lifetimes
+  could be shorter than the format arguments object.
+
+  It can be implicitly converted into `~fmt::basic_format_args` for passing
+  into type-erased formatting functions such as `~fmt::vformat`.
+  \endrst
+ */
+template <typename Context>
+class dynamic_format_arg_store
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+    // Workaround a GCC template argument substitution bug.
+    : public basic_format_args<Context>
+#endif
+{
+ private:
+  using char_type = typename Context::char_type;
+
+  template <typename T> struct need_copy {
+    static constexpr detail::type mapped_type =
+        detail::mapped_type_constant<T, Context>::value;
+
+    enum {
+      value = !(detail::is_reference_wrapper<T>::value ||
+                std::is_same<T, basic_string_view<char_type>>::value ||
+                std::is_same<T, detail::std_string_view<char_type>>::value ||
+                (mapped_type != detail::type::cstring_type &&
+                 mapped_type != detail::type::string_type &&
+                 mapped_type != detail::type::custom_type))
+    };
+  };
+
+  template <typename T>
+  using stored_type = conditional_t<detail::is_string<T>::value,
+                                    std::basic_string<char_type>, T>;
+
+  // Storage of basic_format_arg must be contiguous.
+  std::vector<basic_format_arg<Context>> data_;
+  std::vector<detail::named_arg_info<char_type>> named_info_;
+
+  // Storage of arguments not fitting into basic_format_arg must grow
+  // without relocation because items in data_ refer to it.
+  detail::dynamic_arg_list dynamic_args_;
+
+  friend class basic_format_args<Context>;
+
+  unsigned long long get_types() const {
+    return detail::is_unpacked_bit | data_.size() |
+           (named_info_.empty()
+                ? 0ULL
+                : static_cast<unsigned long long>(detail::has_named_args_bit));
+  }
+
+  const basic_format_arg<Context>* data() const {
+    return named_info_.empty() ? data_.data() : data_.data() + 1;
+  }
+
+  template <typename T> void emplace_arg(const T& arg) {
+    data_.emplace_back(detail::make_arg<Context>(arg));
+  }
+
+  template <typename T>
+  void emplace_arg(const detail::named_arg<char_type, T>& arg) {
+    if (named_info_.empty()) {
+      constexpr const detail::named_arg_info<char_type>* zero_ptr{nullptr};
+      data_.insert(data_.begin(), {zero_ptr, 0});
+    }
+    data_.emplace_back(detail::make_arg<Context>(detail::unwrap(arg.value)));
+    auto pop_one = [](std::vector<basic_format_arg<Context>>* data) {
+      data->pop_back();
+    };
+    std::unique_ptr<std::vector<basic_format_arg<Context>>, decltype(pop_one)>
+        guard{&data_, pop_one};
+    named_info_.push_back({arg.name, static_cast<int>(data_.size() - 2u)});
+    data_[0].value_.named_args = {named_info_.data(), named_info_.size()};
+    guard.release();
+  }
+
+ public:
+  /**
+    \rst
+    Adds an argument into the dynamic store for later passing to a formatting
+    function.
+
+    Note that custom types and string types (but not string views) are copied
+    into the store dynamically allocating memory if necessary.
+
+    **Example**::
+
+      fmt::dynamic_format_arg_store<fmt::format_context> store;
+      store.push_back(42);
+      store.push_back("abc");
+      store.push_back(1.5f);
+      std::string result = fmt::vformat("{} and {} and {}", store);
+    \endrst
+  */
+  template <typename T> void push_back(const T& arg) {
+    if (detail::const_check(need_copy<T>::value))
+      emplace_arg(dynamic_args_.push<stored_type<T>>(arg));
+    else
+      emplace_arg(detail::unwrap(arg));
+  }
+
+  /**
+    \rst
+    Adds a reference to the argument into the dynamic store for later passing to
+    a formatting function. Supports named arguments wrapped in
+    ``std::reference_wrapper`` via ``std::ref()``/``std::cref()``.
+
+    **Example**::
+
+      fmt::dynamic_format_arg_store<fmt::format_context> store;
+      char str[] = "1234567890";
+      store.push_back(std::cref(str));
+      int a1_val{42};
+      auto a1 = fmt::arg("a1_", a1_val);
+      store.push_back(std::cref(a1));
+
+      // Changing str affects the output but only for string and custom types.
+      str[0] = 'X';
+
+      std::string result = fmt::vformat("{} and {a1_}");
+      assert(result == "X234567890 and 42");
+    \endrst
+  */
+  template <typename T> void push_back(std::reference_wrapper<T> arg) {
+    static_assert(
+        detail::is_named_arg<typename std::remove_cv<T>::type>::value ||
+            need_copy<T>::value,
+        "objects of built-in types and string views are always copied");
+    emplace_arg(arg.get());
+  }
+
+  /**
+    Adds named argument into the dynamic store for later passing to a formatting
+    function. ``std::reference_wrapper`` is supported to avoid copying of the
+    argument.
+  */
+  template <typename T>
+  void push_back(const detail::named_arg<char_type, T>& arg) {
+    const char_type* arg_name =
+        dynamic_args_.push<std::basic_string<char_type>>(arg.name).c_str();
+    if (detail::const_check(need_copy<T>::value)) {
+      emplace_arg(
+          fmt::arg(arg_name, dynamic_args_.push<stored_type<T>>(arg.value)));
+    } else {
+      emplace_arg(fmt::arg(arg_name, arg.value));
+    }
+  }
+
+  /** Erase all elements from the store */
+  void clear() {
+    data_.clear();
+    named_info_.clear();
+    dynamic_args_ = detail::dynamic_arg_list();
+  }
+
+  /**
+    \rst
+    Reserves space to store at least *new_cap* arguments including
+    *new_cap_named* named arguments.
+    \endrst
+  */
+  void reserve(size_t new_cap, size_t new_cap_named) {
+    FMT_ASSERT(new_cap >= new_cap_named,
+               "Set of arguments includes set of named arguments");
+    data_.reserve(new_cap);
+    named_info_.reserve(new_cap_named);
+  }
+};
+
+/**
+  \rst
+  A view of a collection of formatting arguments. To avoid lifetime issues it
+  should only be used as a parameter type in type-erased functions such as
+  ``vformat``::
+
+    void vlog(string_view format_str, format_args args);  // OK
+    format_args args = make_format_args(42);  // Error: dangling reference
+  \endrst
+ */
+template <typename Context> class basic_format_args {
+ public:
+  using size_type = int;
+  using format_arg = basic_format_arg<Context>;
+
+ private:
+  // A descriptor that contains information about formatting arguments.
+  // If the number of arguments is less or equal to max_packed_args then
+  // argument types are passed in the descriptor. This reduces binary code size
+  // per formatting function call.
+  unsigned long long desc_;
+  union {
+    // If is_packed() returns true then argument values are stored in values_;
+    // otherwise they are stored in args_. This is done to improve cache
+    // locality and reduce compiled code size since storing larger objects
+    // may require more code (at least on x86-64) even if the same amount of
+    // data is actually copied to stack. It saves ~10% on the bloat test.
+    const detail::value<Context>* values_;
+    const format_arg* args_;
+  };
+
+  bool is_packed() const { return (desc_ & detail::is_unpacked_bit) == 0; }
+  bool has_named_args() const {
+    return (desc_ & detail::has_named_args_bit) != 0;
+  }
+
+  detail::type type(int index) const {
+    int shift = index * detail::packed_arg_bits;
+    unsigned int mask = (1 << detail::packed_arg_bits) - 1;
+    return static_cast<detail::type>((desc_ >> shift) & mask);
+  }
+
+  basic_format_args(unsigned long long desc,
+                    const detail::value<Context>* values)
+      : desc_(desc), values_(values) {}
+  basic_format_args(unsigned long long desc, const format_arg* args)
+      : desc_(desc), args_(args) {}
+
+ public:
+  basic_format_args() : desc_(0) {}
+
+  /**
+   \rst
+   Constructs a `basic_format_args` object from `~fmt::format_arg_store`.
+   \endrst
+   */
+  template <typename... Args>
+  FMT_INLINE basic_format_args(const format_arg_store<Context, Args...>& store)
+      : basic_format_args(store.desc, store.data_.args()) {}
+
+  /**
+   \rst
+   Constructs a `basic_format_args` object from
+   `~fmt::dynamic_format_arg_store`.
+   \endrst
+   */
+  FMT_INLINE basic_format_args(const dynamic_format_arg_store<Context>& store)
+      : basic_format_args(store.get_types(), store.data()) {}
+
+  /**
+   \rst
+   Constructs a `basic_format_args` object from a dynamic set of arguments.
+   \endrst
+   */
+  basic_format_args(const format_arg* args, int count)
+      : basic_format_args(detail::is_unpacked_bit | detail::to_unsigned(count),
+                          args) {}
+
+  /** Returns the argument with the specified id. */
+  format_arg get(int id) const {
+    format_arg arg;
+    if (!is_packed()) {
+      if (id < max_size()) arg = args_[id];
+      return arg;
+    }
+    if (id >= detail::max_packed_args) return arg;
+    arg.type_ = type(id);
+    if (arg.type_ == detail::type::none_type) return arg;
+    arg.value_ = values_[id];
+    return arg;
+  }
+
+  template <typename Char> format_arg get(basic_string_view<Char> name) const {
+    int id = get_id(name);
+    return id >= 0 ? get(id) : format_arg();
+  }
+
+  template <typename Char> int get_id(basic_string_view<Char> name) const {
+    if (!has_named_args()) return -1;
+    const auto& named_args =
+        (is_packed() ? values_[-1] : args_[-1].value_).named_args;
+    for (size_t i = 0; i < named_args.size; ++i) {
+      if (named_args.data[i].name == name) return named_args.data[i].id;
+    }
+    return -1;
+  }
+
+  int max_size() const {
+    unsigned long long max_packed = detail::max_packed_args;
+    return static_cast<int>(is_packed() ? max_packed
+                                        : desc_ & ~detail::is_unpacked_bit);
+  }
+};
+
+#ifdef FMT_ARM_ABI_COMPATIBILITY
+/** An alias to ``basic_format_args<format_context>``. */
+// Separate types would result in shorter symbols but break ABI compatibility
+// between clang and gcc on ARM (#1919).
+using format_args = basic_format_args<format_context>;
+using wformat_args = basic_format_args<wformat_context>;
+#else
+// DEPRECATED! These are kept for ABI compatibility.
+// It is a separate type rather than an alias to make symbols readable.
+struct format_args : basic_format_args<format_context> {
+  template <typename... Args>
+  FMT_INLINE format_args(const Args&... args) : basic_format_args(args...) {}
+};
+struct wformat_args : basic_format_args<wformat_context> {
+  using basic_format_args::basic_format_args;
+};
+#endif
+
+namespace detail {
+
+template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+std::basic_string<Char> vformat(
+    basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args);
+
+FMT_API std::string vformat(string_view format_str, format_args args);
+
+template <typename Char>
+void vformat_to(
+    buffer<Char>& buf, basic_string_view<Char> format_str,
+    basic_format_args<FMT_BUFFER_CONTEXT(type_identity_t<Char>)> args,
+    detail::locale_ref loc = {});
+
+template <typename Char, typename Args,
+          FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+inline void vprint_mojibake(std::FILE*, basic_string_view<Char>, const Args&) {}
+
+FMT_API void vprint_mojibake(std::FILE*, string_view, format_args);
+#ifndef _WIN32
+inline void vprint_mojibake(std::FILE*, string_view, format_args) {}
+#endif
+}  // namespace detail
+
+/** Formats a string and writes the output to ``out``. */
+// GCC 8 and earlier cannot handle std::back_insert_iterator<Container> with
+// vformat_to<ArgFormatter>(...) overload, so SFINAE on iterator type instead.
+template <typename OutputIt, typename S, typename Char = char_t<S>,
+          bool enable = detail::is_output_iterator<OutputIt, Char>::value>
+auto vformat_to(OutputIt out, const S& format_str,
+                basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> typename std::enable_if<enable, OutputIt>::type {
+  decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
+  detail::vformat_to(buf, to_string_view(format_str), args);
+  return detail::get_iterator(buf);
+}
+
+/**
+ \rst
+ Formats arguments, writes the result to the output iterator ``out`` and returns
+ the iterator past the end of the output range.
+
+ **Example**::
+
+   std::vector<char> out;
+   fmt::format_to(std::back_inserter(out), "{}", 42);
+ \endrst
+ */
+// We cannot use FMT_ENABLE_IF because of a bug in gcc 8.3.
+template <typename OutputIt, typename S, typename... Args,
+          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
+inline auto format_to(OutputIt out, const S& format_str, Args&&... args) ->
+    typename std::enable_if<enable, OutputIt>::type {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return vformat_to(out, to_string_view(format_str), vargs);
+}
+
+template <typename OutputIt> struct format_to_n_result {
+  /** Iterator past the end of the output range. */
+  OutputIt out;
+  /** Total (not truncated) output size. */
+  size_t size;
+};
+
+template <typename OutputIt, typename Char, typename... Args,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
+inline format_to_n_result<OutputIt> vformat_to_n(
+    OutputIt out, size_t n, basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  detail::iterator_buffer<OutputIt, Char, detail::fixed_buffer_traits> buf(out,
+                                                                           n);
+  detail::vformat_to(buf, format_str, args);
+  return {buf.out(), buf.count()};
+}
+
+/**
+ \rst
+ Formats arguments, writes up to ``n`` characters of the result to the output
+ iterator ``out`` and returns the total output size and the iterator past the
+ end of the output range.
+ \endrst
+ */
+template <typename OutputIt, typename S, typename... Args,
+          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
+inline auto format_to_n(OutputIt out, size_t n, const S& format_str,
+                        const Args&... args) ->
+    typename std::enable_if<enable, format_to_n_result<OutputIt>>::type {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return vformat_to_n(out, n, to_string_view(format_str), vargs);
+}
+
+/**
+  Returns the number of characters in the output of
+  ``format(format_str, args...)``.
+ */
+template <typename... Args>
+inline size_t formatted_size(string_view format_str, Args&&... args) {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  detail::counting_buffer<> buf;
+  detail::vformat_to(buf, format_str, vargs);
+  return buf.count();
+}
+
+template <typename S, typename Char = char_t<S>>
+FMT_INLINE std::basic_string<Char> vformat(
+    const S& format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  return detail::vformat(to_string_view(format_str), args);
+}
+
+/**
+  \rst
+  Formats arguments and returns the result as a string.
+
+  **Example**::
+
+    #include <fmt/core.h>
+    std::string message = fmt::format("The answer is {}", 42);
+  \endrst
+*/
+// Pass char_t as a default template parameter instead of using
+// std::basic_string<char_t<S>> to reduce the symbol size.
+template <typename S, typename... Args, typename Char = char_t<S>>
+FMT_INLINE std::basic_string<Char> format(const S& format_str, Args&&... args) {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return detail::vformat(to_string_view(format_str), vargs);
+}
+
+FMT_API void vprint(string_view, format_args);
+FMT_API void vprint(std::FILE*, string_view, format_args);
+
+/**
+  \rst
+  Formats ``args`` according to specifications in ``format_str`` and writes the
+  output to the file ``f``. Strings are assumed to be Unicode-encoded unless the
+  ``FMT_UNICODE`` macro is set to 0.
+
+  **Example**::
+
+    fmt::print(stderr, "Don't {}!", "panic");
+  \endrst
+ */
+template <typename S, typename... Args, typename Char = char_t<S>>
+inline void print(std::FILE* f, const S& format_str, Args&&... args) {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return detail::is_unicode<Char>()
+             ? vprint(f, to_string_view(format_str), vargs)
+             : detail::vprint_mojibake(f, to_string_view(format_str), vargs);
+}
+
+/**
+  \rst
+  Formats ``args`` according to specifications in ``format_str`` and writes
+  the output to ``stdout``. Strings are assumed to be Unicode-encoded unless
+  the ``FMT_UNICODE`` macro is set to 0.
+
+  **Example**::
+
+    fmt::print("Elapsed time: {0:.2f} seconds", 1.23);
+  \endrst
+ */
+template <typename S, typename... Args, typename Char = char_t<S>>
+inline void print(const S& format_str, Args&&... args) {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return detail::is_unicode<Char>()
+             ? vprint(to_string_view(format_str), vargs)
+             : detail::vprint_mojibake(stdout, to_string_view(format_str),
+                                       vargs);
+}
+FMT_END_NAMESPACE
+
+#endif  // FMT_CORE_H_
diff --git a/include/vtkdiy2/thirdparty/fmt/format-inl.h b/include/vtkdiy2/thirdparty/fmt/format-inl.h
new file mode 100644
index 00000000000..8f2fe7354a9
--- /dev/null
+++ b/include/vtkdiy2/thirdparty/fmt/format-inl.h
@@ -0,0 +1,2801 @@
+// Formatting library for C++ - implementation
+//
+// Copyright (c) 2012 - 2016, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_FORMAT_INL_H_
+#define FMT_FORMAT_INL_H_
+
+#include <cassert>
+#include <cctype>
+#include <climits>
+#include <cmath>
+#include <cstdarg>
+#include <cstring>  // std::memmove
+#include <cwchar>
+#include <exception>
+
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
+#  include <locale>
+#endif
+
+#ifdef _WIN32
+#  include <io.h>  // _isatty
+#endif
+
+#include "format.h"
+
+// Dummy implementations of strerror_r and strerror_s called if corresponding
+// system functions are not available.
+inline fmt::detail::null<> strerror_r(int, char*, ...) { return {}; }
+inline fmt::detail::null<> strerror_s(char*, size_t, ...) { return {}; }
+
+FMT_BEGIN_NAMESPACE
+namespace detail {
+
+FMT_FUNC void assert_fail(const char* file, int line, const char* message) {
+  // Use unchecked std::fprintf to avoid triggering another assertion when
+  // writing to stderr fails
+  std::fprintf(stderr, "%s:%d: assertion failed: %s", file, line, message);
+  // Chosen instead of std::abort to satisfy Clang in CUDA mode during device
+  // code pass.
+  std::terminate();
+}
+
+#ifndef _MSC_VER
+#  define FMT_SNPRINTF snprintf
+#else  // _MSC_VER
+inline int fmt_snprintf(char* buffer, size_t size, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  int result = vsnprintf_s(buffer, size, _TRUNCATE, format, args);
+  va_end(args);
+  return result;
+}
+#  define FMT_SNPRINTF fmt_snprintf
+#endif  // _MSC_VER
+
+// A portable thread-safe version of strerror.
+// Sets buffer to point to a string describing the error code.
+// This can be either a pointer to a string stored in buffer,
+// or a pointer to some static immutable string.
+// Returns one of the following values:
+//   0      - success
+//   ERANGE - buffer is not large enough to store the error message
+//   other  - failure
+// Buffer should be at least of size 1.
+inline int safe_strerror(int error_code, char*& buffer,
+                         size_t buffer_size) FMT_NOEXCEPT {
+  FMT_ASSERT(buffer != nullptr && buffer_size != 0, "invalid buffer");
+
+  class dispatcher {
+   private:
+    int error_code_;
+    char*& buffer_;
+    size_t buffer_size_;
+
+    // A noop assignment operator to avoid bogus warnings.
+    void operator=(const dispatcher&) {}
+
+    // Handle the result of XSI-compliant version of strerror_r.
+    int handle(int result) {
+      // glibc versions before 2.13 return result in errno.
+      return result == -1 ? errno : result;
+    }
+
+    // Handle the result of GNU-specific version of strerror_r.
+    FMT_MAYBE_UNUSED
+    int handle(char* message) {
+      // If the buffer is full then the message is probably truncated.
+      if (message == buffer_ && strlen(buffer_) == buffer_size_ - 1)
+        return ERANGE;
+      buffer_ = message;
+      return 0;
+    }
+
+    // Handle the case when strerror_r is not available.
+    FMT_MAYBE_UNUSED
+    int handle(detail::null<>) {
+      return fallback(strerror_s(buffer_, buffer_size_, error_code_));
+    }
+
+    // Fallback to strerror_s when strerror_r is not available.
+    FMT_MAYBE_UNUSED
+    int fallback(int result) {
+      // If the buffer is full then the message is probably truncated.
+      return result == 0 && strlen(buffer_) == buffer_size_ - 1 ? ERANGE
+                                                                : result;
+    }
+
+#if !FMT_MSC_VER
+    // Fallback to strerror if strerror_r and strerror_s are not available.
+    int fallback(detail::null<>) {
+      errno = 0;
+      buffer_ = strerror(error_code_);
+      return errno;
+    }
+#endif
+
+   public:
+    dispatcher(int err_code, char*& buf, size_t buf_size)
+        : error_code_(err_code), buffer_(buf), buffer_size_(buf_size) {}
+
+    int run() { return handle(strerror_r(error_code_, buffer_, buffer_size_)); }
+  };
+  return dispatcher(error_code, buffer, buffer_size).run();
+}
+
+FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
+                                string_view message) FMT_NOEXCEPT {
+  // Report error code making sure that the output fits into
+  // inline_buffer_size to avoid dynamic memory allocation and potential
+  // bad_alloc.
+  out.try_resize(0);
+  static const char SEP[] = ": ";
+  static const char ERROR_STR[] = "error ";
+  // Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
+  size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2;
+  auto abs_value = static_cast<uint32_or_64_or_128_t<int>>(error_code);
+  if (detail::is_negative(error_code)) {
+    abs_value = 0 - abs_value;
+    ++error_code_size;
+  }
+  error_code_size += detail::to_unsigned(detail::count_digits(abs_value));
+  auto it = buffer_appender<char>(out);
+  if (message.size() <= inline_buffer_size - error_code_size)
+    format_to(it, "{}{}", message, SEP);
+  format_to(it, "{}{}", ERROR_STR, error_code);
+  assert(out.size() <= inline_buffer_size);
+}
+
+FMT_FUNC void report_error(format_func func, int error_code,
+                           string_view message) FMT_NOEXCEPT {
+  memory_buffer full_message;
+  func(full_message, error_code, message);
+  // Don't use fwrite_fully because the latter may throw.
+  (void)std::fwrite(full_message.data(), full_message.size(), 1, stderr);
+  std::fputc('\n', stderr);
+}
+
+// A wrapper around fwrite that throws on error.
+inline void fwrite_fully(const void* ptr, size_t size, size_t count,
+                         FILE* stream) {
+  size_t written = std::fwrite(ptr, size, count, stream);
+  if (written < count) FMT_THROW(system_error(errno, "cannot write to file"));
+}
+}  // namespace detail
+
+#if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
+namespace detail {
+
+template <typename Locale>
+locale_ref::locale_ref(const Locale& loc) : locale_(&loc) {
+  static_assert(std::is_same<Locale, std::locale>::value, "");
+}
+
+template <typename Locale> Locale locale_ref::get() const {
+  static_assert(std::is_same<Locale, std::locale>::value, "");
+  return locale_ ? *static_cast<const std::locale*>(locale_) : std::locale();
+}
+
+template <typename Char> FMT_FUNC std::string grouping_impl(locale_ref loc) {
+  return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>()).grouping();
+}
+template <typename Char> FMT_FUNC Char thousands_sep_impl(locale_ref loc) {
+  return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
+      .thousands_sep();
+}
+template <typename Char> FMT_FUNC Char decimal_point_impl(locale_ref loc) {
+  return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
+      .decimal_point();
+}
+}  // namespace detail
+#else
+template <typename Char>
+FMT_FUNC std::string detail::grouping_impl(locale_ref) {
+  return "\03";
+}
+template <typename Char> FMT_FUNC Char detail::thousands_sep_impl(locale_ref) {
+  return FMT_STATIC_THOUSANDS_SEPARATOR;
+}
+template <typename Char> FMT_FUNC Char detail::decimal_point_impl(locale_ref) {
+  return '.';
+}
+#endif
+
+FMT_API FMT_FUNC format_error::~format_error() FMT_NOEXCEPT = default;
+FMT_API FMT_FUNC system_error::~system_error() FMT_NOEXCEPT = default;
+
+FMT_FUNC void system_error::init(int err_code, string_view format_str,
+                                 format_args args) {
+  error_code_ = err_code;
+  memory_buffer buffer;
+  format_system_error(buffer, err_code, vformat(format_str, args));
+  std::runtime_error& base = *this;
+  base = std::runtime_error(to_string(buffer));
+}
+
+namespace detail {
+
+template <> FMT_FUNC int count_digits<4>(detail::fallback_uintptr n) {
+  // fallback_uintptr is always stored in little endian.
+  int i = static_cast<int>(sizeof(void*)) - 1;
+  while (i > 0 && n.value[i] == 0) --i;
+  auto char_digits = std::numeric_limits<unsigned char>::digits / 4;
+  return i >= 0 ? i * char_digits + count_digits<4, unsigned>(n.value[i]) : 1;
+}
+
+template <typename T>
+const typename basic_data<T>::digit_pair basic_data<T>::digits[] = {
+    {'0', '0'}, {'0', '1'}, {'0', '2'}, {'0', '3'}, {'0', '4'}, {'0', '5'},
+    {'0', '6'}, {'0', '7'}, {'0', '8'}, {'0', '9'}, {'1', '0'}, {'1', '1'},
+    {'1', '2'}, {'1', '3'}, {'1', '4'}, {'1', '5'}, {'1', '6'}, {'1', '7'},
+    {'1', '8'}, {'1', '9'}, {'2', '0'}, {'2', '1'}, {'2', '2'}, {'2', '3'},
+    {'2', '4'}, {'2', '5'}, {'2', '6'}, {'2', '7'}, {'2', '8'}, {'2', '9'},
+    {'3', '0'}, {'3', '1'}, {'3', '2'}, {'3', '3'}, {'3', '4'}, {'3', '5'},
+    {'3', '6'}, {'3', '7'}, {'3', '8'}, {'3', '9'}, {'4', '0'}, {'4', '1'},
+    {'4', '2'}, {'4', '3'}, {'4', '4'}, {'4', '5'}, {'4', '6'}, {'4', '7'},
+    {'4', '8'}, {'4', '9'}, {'5', '0'}, {'5', '1'}, {'5', '2'}, {'5', '3'},
+    {'5', '4'}, {'5', '5'}, {'5', '6'}, {'5', '7'}, {'5', '8'}, {'5', '9'},
+    {'6', '0'}, {'6', '1'}, {'6', '2'}, {'6', '3'}, {'6', '4'}, {'6', '5'},
+    {'6', '6'}, {'6', '7'}, {'6', '8'}, {'6', '9'}, {'7', '0'}, {'7', '1'},
+    {'7', '2'}, {'7', '3'}, {'7', '4'}, {'7', '5'}, {'7', '6'}, {'7', '7'},
+    {'7', '8'}, {'7', '9'}, {'8', '0'}, {'8', '1'}, {'8', '2'}, {'8', '3'},
+    {'8', '4'}, {'8', '5'}, {'8', '6'}, {'8', '7'}, {'8', '8'}, {'8', '9'},
+    {'9', '0'}, {'9', '1'}, {'9', '2'}, {'9', '3'}, {'9', '4'}, {'9', '5'},
+    {'9', '6'}, {'9', '7'}, {'9', '8'}, {'9', '9'}};
+
+template <typename T>
+const char basic_data<T>::hex_digits[] = "0123456789abcdef";
+
+#define FMT_POWERS_OF_10(factor)                                             \
+  factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \
+      (factor)*1000000, (factor)*10000000, (factor)*100000000,               \
+      (factor)*1000000000
+
+template <typename T>
+const uint64_t basic_data<T>::powers_of_10_64[] = {
+    1, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
+    10000000000000000000ULL};
+
+template <typename T>
+const uint32_t basic_data<T>::zero_or_powers_of_10_32[] = {0,
+                                                           FMT_POWERS_OF_10(1)};
+template <typename T>
+const uint64_t basic_data<T>::zero_or_powers_of_10_64[] = {
+    0, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
+    10000000000000000000ULL};
+
+template <typename T>
+const uint32_t basic_data<T>::zero_or_powers_of_10_32_new[] = {
+    0, 0, FMT_POWERS_OF_10(1)};
+
+template <typename T>
+const uint64_t basic_data<T>::zero_or_powers_of_10_64_new[] = {
+    0, 0, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
+    10000000000000000000ULL};
+
+// Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340.
+// These are generated by support/compute-powers.py.
+template <typename T>
+const uint64_t basic_data<T>::grisu_pow10_significands[] = {
+    0xfa8fd5a0081c0288, 0xbaaee17fa23ebf76, 0x8b16fb203055ac76,
+    0xcf42894a5dce35ea, 0x9a6bb0aa55653b2d, 0xe61acf033d1a45df,
+    0xab70fe17c79ac6ca, 0xff77b1fcbebcdc4f, 0xbe5691ef416bd60c,
+    0x8dd01fad907ffc3c, 0xd3515c2831559a83, 0x9d71ac8fada6c9b5,
+    0xea9c227723ee8bcb, 0xaecc49914078536d, 0x823c12795db6ce57,
+    0xc21094364dfb5637, 0x9096ea6f3848984f, 0xd77485cb25823ac7,
+    0xa086cfcd97bf97f4, 0xef340a98172aace5, 0xb23867fb2a35b28e,
+    0x84c8d4dfd2c63f3b, 0xc5dd44271ad3cdba, 0x936b9fcebb25c996,
+    0xdbac6c247d62a584, 0xa3ab66580d5fdaf6, 0xf3e2f893dec3f126,
+    0xb5b5ada8aaff80b8, 0x87625f056c7c4a8b, 0xc9bcff6034c13053,
+    0x964e858c91ba2655, 0xdff9772470297ebd, 0xa6dfbd9fb8e5b88f,
+    0xf8a95fcf88747d94, 0xb94470938fa89bcf, 0x8a08f0f8bf0f156b,
+    0xcdb02555653131b6, 0x993fe2c6d07b7fac, 0xe45c10c42a2b3b06,
+    0xaa242499697392d3, 0xfd87b5f28300ca0e, 0xbce5086492111aeb,
+    0x8cbccc096f5088cc, 0xd1b71758e219652c, 0x9c40000000000000,
+    0xe8d4a51000000000, 0xad78ebc5ac620000, 0x813f3978f8940984,
+    0xc097ce7bc90715b3, 0x8f7e32ce7bea5c70, 0xd5d238a4abe98068,
+    0x9f4f2726179a2245, 0xed63a231d4c4fb27, 0xb0de65388cc8ada8,
+    0x83c7088e1aab65db, 0xc45d1df942711d9a, 0x924d692ca61be758,
+    0xda01ee641a708dea, 0xa26da3999aef774a, 0xf209787bb47d6b85,
+    0xb454e4a179dd1877, 0x865b86925b9bc5c2, 0xc83553c5c8965d3d,
+    0x952ab45cfa97a0b3, 0xde469fbd99a05fe3, 0xa59bc234db398c25,
+    0xf6c69a72a3989f5c, 0xb7dcbf5354e9bece, 0x88fcf317f22241e2,
+    0xcc20ce9bd35c78a5, 0x98165af37b2153df, 0xe2a0b5dc971f303a,
+    0xa8d9d1535ce3b396, 0xfb9b7cd9a4a7443c, 0xbb764c4ca7a44410,
+    0x8bab8eefb6409c1a, 0xd01fef10a657842c, 0x9b10a4e5e9913129,
+    0xe7109bfba19c0c9d, 0xac2820d9623bf429, 0x80444b5e7aa7cf85,
+    0xbf21e44003acdd2d, 0x8e679c2f5e44ff8f, 0xd433179d9c8cb841,
+    0x9e19db92b4e31ba9, 0xeb96bf6ebadf77d9, 0xaf87023b9bf0ee6b,
+};
+
+// Binary exponents of pow(10, k), for k = -348, -340, ..., 340, corresponding
+// to significands above.
+template <typename T>
+const int16_t basic_data<T>::grisu_pow10_exponents[] = {
+    -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980, -954,
+    -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,  -688, -661,
+    -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,  -422,  -396, -369,
+    -343,  -316,  -289,  -263,  -236,  -210,  -183,  -157,  -130,  -103, -77,
+    -50,   -24,   3,     30,    56,    83,    109,   136,   162,   189,  216,
+    242,   269,   295,   322,   348,   375,   402,   428,   455,   481,  508,
+    534,   561,   588,   614,   641,   667,   694,   720,   747,   774,  800,
+    827,   853,   880,   907,   933,   960,   986,   1013,  1039,  1066};
+
+template <typename T>
+const divtest_table_entry<uint32_t> basic_data<T>::divtest_table_for_pow5_32[] =
+    {{0x00000001, 0xffffffff}, {0xcccccccd, 0x33333333},
+     {0xc28f5c29, 0x0a3d70a3}, {0x26e978d5, 0x020c49ba},
+     {0x3afb7e91, 0x0068db8b}, {0x0bcbe61d, 0x0014f8b5},
+     {0x68c26139, 0x000431bd}, {0xae8d46a5, 0x0000d6bf},
+     {0x22e90e21, 0x00002af3}, {0x3a2e9c6d, 0x00000897},
+     {0x3ed61f49, 0x000001b7}};
+
+template <typename T>
+const divtest_table_entry<uint64_t> basic_data<T>::divtest_table_for_pow5_64[] =
+    {{0x0000000000000001, 0xffffffffffffffff},
+     {0xcccccccccccccccd, 0x3333333333333333},
+     {0x8f5c28f5c28f5c29, 0x0a3d70a3d70a3d70},
+     {0x1cac083126e978d5, 0x020c49ba5e353f7c},
+     {0xd288ce703afb7e91, 0x0068db8bac710cb2},
+     {0x5d4e8fb00bcbe61d, 0x0014f8b588e368f0},
+     {0x790fb65668c26139, 0x000431bde82d7b63},
+     {0xe5032477ae8d46a5, 0x0000d6bf94d5e57a},
+     {0xc767074b22e90e21, 0x00002af31dc46118},
+     {0x8e47ce423a2e9c6d, 0x0000089705f4136b},
+     {0x4fa7f60d3ed61f49, 0x000001b7cdfd9d7b},
+     {0x0fee64690c913975, 0x00000057f5ff85e5},
+     {0x3662e0e1cf503eb1, 0x000000119799812d},
+     {0xa47a2cf9f6433fbd, 0x0000000384b84d09},
+     {0x54186f653140a659, 0x00000000b424dc35},
+     {0x7738164770402145, 0x0000000024075f3d},
+     {0xe4a4d1417cd9a041, 0x000000000734aca5},
+     {0xc75429d9e5c5200d, 0x000000000170ef54},
+     {0xc1773b91fac10669, 0x000000000049c977},
+     {0x26b172506559ce15, 0x00000000000ec1e4},
+     {0xd489e3a9addec2d1, 0x000000000002f394},
+     {0x90e860bb892c8d5d, 0x000000000000971d},
+     {0x502e79bf1b6f4f79, 0x0000000000001e39},
+     {0xdcd618596be30fe5, 0x000000000000060b}};
+
+template <typename T>
+const uint64_t basic_data<T>::dragonbox_pow10_significands_64[] = {
+    0x81ceb32c4b43fcf5, 0xa2425ff75e14fc32, 0xcad2f7f5359a3b3f,
+    0xfd87b5f28300ca0e, 0x9e74d1b791e07e49, 0xc612062576589ddb,
+    0xf79687aed3eec552, 0x9abe14cd44753b53, 0xc16d9a0095928a28,
+    0xf1c90080baf72cb2, 0x971da05074da7bef, 0xbce5086492111aeb,
+    0xec1e4a7db69561a6, 0x9392ee8e921d5d08, 0xb877aa3236a4b44a,
+    0xe69594bec44de15c, 0x901d7cf73ab0acda, 0xb424dc35095cd810,
+    0xe12e13424bb40e14, 0x8cbccc096f5088cc, 0xafebff0bcb24aaff,
+    0xdbe6fecebdedd5bf, 0x89705f4136b4a598, 0xabcc77118461cefd,
+    0xd6bf94d5e57a42bd, 0x8637bd05af6c69b6, 0xa7c5ac471b478424,
+    0xd1b71758e219652c, 0x83126e978d4fdf3c, 0xa3d70a3d70a3d70b,
+    0xcccccccccccccccd, 0x8000000000000000, 0xa000000000000000,
+    0xc800000000000000, 0xfa00000000000000, 0x9c40000000000000,
+    0xc350000000000000, 0xf424000000000000, 0x9896800000000000,
+    0xbebc200000000000, 0xee6b280000000000, 0x9502f90000000000,
+    0xba43b74000000000, 0xe8d4a51000000000, 0x9184e72a00000000,
+    0xb5e620f480000000, 0xe35fa931a0000000, 0x8e1bc9bf04000000,
+    0xb1a2bc2ec5000000, 0xde0b6b3a76400000, 0x8ac7230489e80000,
+    0xad78ebc5ac620000, 0xd8d726b7177a8000, 0x878678326eac9000,
+    0xa968163f0a57b400, 0xd3c21bcecceda100, 0x84595161401484a0,
+    0xa56fa5b99019a5c8, 0xcecb8f27f4200f3a, 0x813f3978f8940984,
+    0xa18f07d736b90be5, 0xc9f2c9cd04674ede, 0xfc6f7c4045812296,
+    0x9dc5ada82b70b59d, 0xc5371912364ce305, 0xf684df56c3e01bc6,
+    0x9a130b963a6c115c, 0xc097ce7bc90715b3, 0xf0bdc21abb48db20,
+    0x96769950b50d88f4, 0xbc143fa4e250eb31, 0xeb194f8e1ae525fd,
+    0x92efd1b8d0cf37be, 0xb7abc627050305ad, 0xe596b7b0c643c719,
+    0x8f7e32ce7bea5c6f, 0xb35dbf821ae4f38b, 0xe0352f62a19e306e};
+
+template <typename T>
+const uint128_wrapper basic_data<T>::dragonbox_pow10_significands_128[] = {
+#if FMT_USE_FULL_CACHE_DRAGONBOX
+    {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
+    {0x9faacf3df73609b1, 0x77b191618c54e9ad},
+    {0xc795830d75038c1d, 0xd59df5b9ef6a2418},
+    {0xf97ae3d0d2446f25, 0x4b0573286b44ad1e},
+    {0x9becce62836ac577, 0x4ee367f9430aec33},
+    {0xc2e801fb244576d5, 0x229c41f793cda740},
+    {0xf3a20279ed56d48a, 0x6b43527578c11110},
+    {0x9845418c345644d6, 0x830a13896b78aaaa},
+    {0xbe5691ef416bd60c, 0x23cc986bc656d554},
+    {0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa9},
+    {0x94b3a202eb1c3f39, 0x7bf7d71432f3d6aa},
+    {0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc54},
+    {0xe858ad248f5c22c9, 0xd1b3400f8f9cff69},
+    {0x91376c36d99995be, 0x23100809b9c21fa2},
+    {0xb58547448ffffb2d, 0xabd40a0c2832a78b},
+    {0xe2e69915b3fff9f9, 0x16c90c8f323f516d},
+    {0x8dd01fad907ffc3b, 0xae3da7d97f6792e4},
+    {0xb1442798f49ffb4a, 0x99cd11cfdf41779d},
+    {0xdd95317f31c7fa1d, 0x40405643d711d584},
+    {0x8a7d3eef7f1cfc52, 0x482835ea666b2573},
+    {0xad1c8eab5ee43b66, 0xda3243650005eed0},
+    {0xd863b256369d4a40, 0x90bed43e40076a83},
+    {0x873e4f75e2224e68, 0x5a7744a6e804a292},
+    {0xa90de3535aaae202, 0x711515d0a205cb37},
+    {0xd3515c2831559a83, 0x0d5a5b44ca873e04},
+    {0x8412d9991ed58091, 0xe858790afe9486c3},
+    {0xa5178fff668ae0b6, 0x626e974dbe39a873},
+    {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
+    {0x80fa687f881c7f8e, 0x7ce66634bc9d0b9a},
+    {0xa139029f6a239f72, 0x1c1fffc1ebc44e81},
+    {0xc987434744ac874e, 0xa327ffb266b56221},
+    {0xfbe9141915d7a922, 0x4bf1ff9f0062baa9},
+    {0x9d71ac8fada6c9b5, 0x6f773fc3603db4aa},
+    {0xc4ce17b399107c22, 0xcb550fb4384d21d4},
+    {0xf6019da07f549b2b, 0x7e2a53a146606a49},
+    {0x99c102844f94e0fb, 0x2eda7444cbfc426e},
+    {0xc0314325637a1939, 0xfa911155fefb5309},
+    {0xf03d93eebc589f88, 0x793555ab7eba27cb},
+    {0x96267c7535b763b5, 0x4bc1558b2f3458df},
+    {0xbbb01b9283253ca2, 0x9eb1aaedfb016f17},
+    {0xea9c227723ee8bcb, 0x465e15a979c1cadd},
+    {0x92a1958a7675175f, 0x0bfacd89ec191eca},
+    {0xb749faed14125d36, 0xcef980ec671f667c},
+    {0xe51c79a85916f484, 0x82b7e12780e7401b},
+    {0x8f31cc0937ae58d2, 0xd1b2ecb8b0908811},
+    {0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa16},
+    {0xdfbdcece67006ac9, 0x67a791e093e1d49b},
+    {0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e1},
+    {0xaecc49914078536d, 0x58fae9f773886e19},
+    {0xda7f5bf590966848, 0xaf39a475506a899f},
+    {0x888f99797a5e012d, 0x6d8406c952429604},
+    {0xaab37fd7d8f58178, 0xc8e5087ba6d33b84},
+    {0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a65},
+    {0x855c3be0a17fcd26, 0x5cf2eea09a550680},
+    {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
+    {0xd0601d8efc57b08b, 0xf13b94daf124da27},
+    {0x823c12795db6ce57, 0x76c53d08d6b70859},
+    {0xa2cb1717b52481ed, 0x54768c4b0c64ca6f},
+    {0xcb7ddcdda26da268, 0xa9942f5dcf7dfd0a},
+    {0xfe5d54150b090b02, 0xd3f93b35435d7c4d},
+    {0x9efa548d26e5a6e1, 0xc47bc5014a1a6db0},
+    {0xc6b8e9b0709f109a, 0x359ab6419ca1091c},
+    {0xf867241c8cc6d4c0, 0xc30163d203c94b63},
+    {0x9b407691d7fc44f8, 0x79e0de63425dcf1e},
+    {0xc21094364dfb5636, 0x985915fc12f542e5},
+    {0xf294b943e17a2bc4, 0x3e6f5b7b17b2939e},
+    {0x979cf3ca6cec5b5a, 0xa705992ceecf9c43},
+    {0xbd8430bd08277231, 0x50c6ff782a838354},
+    {0xece53cec4a314ebd, 0xa4f8bf5635246429},
+    {0x940f4613ae5ed136, 0x871b7795e136be9a},
+    {0xb913179899f68584, 0x28e2557b59846e40},
+    {0xe757dd7ec07426e5, 0x331aeada2fe589d0},
+    {0x9096ea6f3848984f, 0x3ff0d2c85def7622},
+    {0xb4bca50b065abe63, 0x0fed077a756b53aa},
+    {0xe1ebce4dc7f16dfb, 0xd3e8495912c62895},
+    {0x8d3360f09cf6e4bd, 0x64712dd7abbbd95d},
+    {0xb080392cc4349dec, 0xbd8d794d96aacfb4},
+    {0xdca04777f541c567, 0xecf0d7a0fc5583a1},
+    {0x89e42caaf9491b60, 0xf41686c49db57245},
+    {0xac5d37d5b79b6239, 0x311c2875c522ced6},
+    {0xd77485cb25823ac7, 0x7d633293366b828c},
+    {0x86a8d39ef77164bc, 0xae5dff9c02033198},
+    {0xa8530886b54dbdeb, 0xd9f57f830283fdfd},
+    {0xd267caa862a12d66, 0xd072df63c324fd7c},
+    {0x8380dea93da4bc60, 0x4247cb9e59f71e6e},
+    {0xa46116538d0deb78, 0x52d9be85f074e609},
+    {0xcd795be870516656, 0x67902e276c921f8c},
+    {0x806bd9714632dff6, 0x00ba1cd8a3db53b7},
+    {0xa086cfcd97bf97f3, 0x80e8a40eccd228a5},
+    {0xc8a883c0fdaf7df0, 0x6122cd128006b2ce},
+    {0xfad2a4b13d1b5d6c, 0x796b805720085f82},
+    {0x9cc3a6eec6311a63, 0xcbe3303674053bb1},
+    {0xc3f490aa77bd60fc, 0xbedbfc4411068a9d},
+    {0xf4f1b4d515acb93b, 0xee92fb5515482d45},
+    {0x991711052d8bf3c5, 0x751bdd152d4d1c4b},
+    {0xbf5cd54678eef0b6, 0xd262d45a78a0635e},
+    {0xef340a98172aace4, 0x86fb897116c87c35},
+    {0x9580869f0e7aac0e, 0xd45d35e6ae3d4da1},
+    {0xbae0a846d2195712, 0x8974836059cca10a},
+    {0xe998d258869facd7, 0x2bd1a438703fc94c},
+    {0x91ff83775423cc06, 0x7b6306a34627ddd0},
+    {0xb67f6455292cbf08, 0x1a3bc84c17b1d543},
+    {0xe41f3d6a7377eeca, 0x20caba5f1d9e4a94},
+    {0x8e938662882af53e, 0x547eb47b7282ee9d},
+    {0xb23867fb2a35b28d, 0xe99e619a4f23aa44},
+    {0xdec681f9f4c31f31, 0x6405fa00e2ec94d5},
+    {0x8b3c113c38f9f37e, 0xde83bc408dd3dd05},
+    {0xae0b158b4738705e, 0x9624ab50b148d446},
+    {0xd98ddaee19068c76, 0x3badd624dd9b0958},
+    {0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d7},
+    {0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4d},
+    {0xd47487cc8470652b, 0x7647c32000696720},
+    {0x84c8d4dfd2c63f3b, 0x29ecd9f40041e074},
+    {0xa5fb0a17c777cf09, 0xf468107100525891},
+    {0xcf79cc9db955c2cc, 0x7182148d4066eeb5},
+    {0x81ac1fe293d599bf, 0xc6f14cd848405531},
+    {0xa21727db38cb002f, 0xb8ada00e5a506a7d},
+    {0xca9cf1d206fdc03b, 0xa6d90811f0e4851d},
+    {0xfd442e4688bd304a, 0x908f4a166d1da664},
+    {0x9e4a9cec15763e2e, 0x9a598e4e043287ff},
+    {0xc5dd44271ad3cdba, 0x40eff1e1853f29fe},
+    {0xf7549530e188c128, 0xd12bee59e68ef47d},
+    {0x9a94dd3e8cf578b9, 0x82bb74f8301958cf},
+    {0xc13a148e3032d6e7, 0xe36a52363c1faf02},
+    {0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac2},
+    {0x96f5600f15a7b7e5, 0x29ab103a5ef8c0ba},
+    {0xbcb2b812db11a5de, 0x7415d448f6b6f0e8},
+    {0xebdf661791d60f56, 0x111b495b3464ad22},
+    {0x936b9fcebb25c995, 0xcab10dd900beec35},
+    {0xb84687c269ef3bfb, 0x3d5d514f40eea743},
+    {0xe65829b3046b0afa, 0x0cb4a5a3112a5113},
+    {0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ac},
+    {0xb3f4e093db73a093, 0x59ed216765690f57},
+    {0xe0f218b8d25088b8, 0x306869c13ec3532d},
+    {0x8c974f7383725573, 0x1e414218c73a13fc},
+    {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
+    {0xdbac6c247d62a583, 0xdf45f746b74abf3a},
+    {0x894bc396ce5da772, 0x6b8bba8c328eb784},
+    {0xab9eb47c81f5114f, 0x066ea92f3f326565},
+    {0xd686619ba27255a2, 0xc80a537b0efefebe},
+    {0x8613fd0145877585, 0xbd06742ce95f5f37},
+    {0xa798fc4196e952e7, 0x2c48113823b73705},
+    {0xd17f3b51fca3a7a0, 0xf75a15862ca504c6},
+    {0x82ef85133de648c4, 0x9a984d73dbe722fc},
+    {0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebbb},
+    {0xcc963fee10b7d1b3, 0x318df905079926a9},
+    {0xffbbcfe994e5c61f, 0xfdf17746497f7053},
+    {0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa634},
+    {0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc1},
+    {0xf9bd690a1b68637b, 0x3dfdce7aa3c673b1},
+    {0x9c1661a651213e2d, 0x06bea10ca65c084f},
+    {0xc31bfa0fe5698db8, 0x486e494fcff30a63},
+    {0xf3e2f893dec3f126, 0x5a89dba3c3efccfb},
+    {0x986ddb5c6b3a76b7, 0xf89629465a75e01d},
+    {0xbe89523386091465, 0xf6bbb397f1135824},
+    {0xee2ba6c0678b597f, 0x746aa07ded582e2d},
+    {0x94db483840b717ef, 0xa8c2a44eb4571cdd},
+    {0xba121a4650e4ddeb, 0x92f34d62616ce414},
+    {0xe896a0d7e51e1566, 0x77b020baf9c81d18},
+    {0x915e2486ef32cd60, 0x0ace1474dc1d122f},
+    {0xb5b5ada8aaff80b8, 0x0d819992132456bb},
+    {0xe3231912d5bf60e6, 0x10e1fff697ed6c6a},
+    {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
+    {0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb3},
+    {0xddd0467c64bce4a0, 0xac7cb3f6d05ddbdf},
+    {0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96c},
+    {0xad4ab7112eb3929d, 0x86c16c98d2c953c7},
+    {0xd89d64d57a607744, 0xe871c7bf077ba8b8},
+    {0x87625f056c7c4a8b, 0x11471cd764ad4973},
+    {0xa93af6c6c79b5d2d, 0xd598e40d3dd89bd0},
+    {0xd389b47879823479, 0x4aff1d108d4ec2c4},
+    {0x843610cb4bf160cb, 0xcedf722a585139bb},
+    {0xa54394fe1eedb8fe, 0xc2974eb4ee658829},
+    {0xce947a3da6a9273e, 0x733d226229feea33},
+    {0x811ccc668829b887, 0x0806357d5a3f5260},
+    {0xa163ff802a3426a8, 0xca07c2dcb0cf26f8},
+    {0xc9bcff6034c13052, 0xfc89b393dd02f0b6},
+    {0xfc2c3f3841f17c67, 0xbbac2078d443ace3},
+    {0x9d9ba7832936edc0, 0xd54b944b84aa4c0e},
+    {0xc5029163f384a931, 0x0a9e795e65d4df12},
+    {0xf64335bcf065d37d, 0x4d4617b5ff4a16d6},
+    {0x99ea0196163fa42e, 0x504bced1bf8e4e46},
+    {0xc06481fb9bcf8d39, 0xe45ec2862f71e1d7},
+    {0xf07da27a82c37088, 0x5d767327bb4e5a4d},
+    {0x964e858c91ba2655, 0x3a6a07f8d510f870},
+    {0xbbe226efb628afea, 0x890489f70a55368c},
+    {0xeadab0aba3b2dbe5, 0x2b45ac74ccea842f},
+    {0x92c8ae6b464fc96f, 0x3b0b8bc90012929e},
+    {0xb77ada0617e3bbcb, 0x09ce6ebb40173745},
+    {0xe55990879ddcaabd, 0xcc420a6a101d0516},
+    {0x8f57fa54c2a9eab6, 0x9fa946824a12232e},
+    {0xb32df8e9f3546564, 0x47939822dc96abfa},
+    {0xdff9772470297ebd, 0x59787e2b93bc56f8},
+    {0x8bfbea76c619ef36, 0x57eb4edb3c55b65b},
+    {0xaefae51477a06b03, 0xede622920b6b23f2},
+    {0xdab99e59958885c4, 0xe95fab368e45ecee},
+    {0x88b402f7fd75539b, 0x11dbcb0218ebb415},
+    {0xaae103b5fcd2a881, 0xd652bdc29f26a11a},
+    {0xd59944a37c0752a2, 0x4be76d3346f04960},
+    {0x857fcae62d8493a5, 0x6f70a4400c562ddc},
+    {0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb953},
+    {0xd097ad07a71f26b2, 0x7e2000a41346a7a8},
+    {0x825ecc24c873782f, 0x8ed400668c0c28c9},
+    {0xa2f67f2dfa90563b, 0x728900802f0f32fb},
+    {0xcbb41ef979346bca, 0x4f2b40a03ad2ffba},
+    {0xfea126b7d78186bc, 0xe2f610c84987bfa9},
+    {0x9f24b832e6b0f436, 0x0dd9ca7d2df4d7ca},
+    {0xc6ede63fa05d3143, 0x91503d1c79720dbc},
+    {0xf8a95fcf88747d94, 0x75a44c6397ce912b},
+    {0x9b69dbe1b548ce7c, 0xc986afbe3ee11abb},
+    {0xc24452da229b021b, 0xfbe85badce996169},
+    {0xf2d56790ab41c2a2, 0xfae27299423fb9c4},
+    {0x97c560ba6b0919a5, 0xdccd879fc967d41b},
+    {0xbdb6b8e905cb600f, 0x5400e987bbc1c921},
+    {0xed246723473e3813, 0x290123e9aab23b69},
+    {0x9436c0760c86e30b, 0xf9a0b6720aaf6522},
+    {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
+    {0xe7958cb87392c2c2, 0xb60b1d1230b20e05},
+    {0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c3},
+    {0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af4},
+    {0xe2280b6c20dd5232, 0x25c6da63c38de1b1},
+    {0x8d590723948a535f, 0x579c487e5a38ad0f},
+    {0xb0af48ec79ace837, 0x2d835a9df0c6d852},
+    {0xdcdb1b2798182244, 0xf8e431456cf88e66},
+    {0x8a08f0f8bf0f156b, 0x1b8e9ecb641b5900},
+    {0xac8b2d36eed2dac5, 0xe272467e3d222f40},
+    {0xd7adf884aa879177, 0x5b0ed81dcc6abb10},
+    {0x86ccbb52ea94baea, 0x98e947129fc2b4ea},
+    {0xa87fea27a539e9a5, 0x3f2398d747b36225},
+    {0xd29fe4b18e88640e, 0x8eec7f0d19a03aae},
+    {0x83a3eeeef9153e89, 0x1953cf68300424ad},
+    {0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd8},
+    {0xcdb02555653131b6, 0x3792f412cb06794e},
+    {0x808e17555f3ebf11, 0xe2bbd88bbee40bd1},
+    {0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec5},
+    {0xc8de047564d20a8b, 0xf245825a5a445276},
+    {0xfb158592be068d2e, 0xeed6e2f0f0d56713},
+    {0x9ced737bb6c4183d, 0x55464dd69685606c},
+    {0xc428d05aa4751e4c, 0xaa97e14c3c26b887},
+    {0xf53304714d9265df, 0xd53dd99f4b3066a9},
+    {0x993fe2c6d07b7fab, 0xe546a8038efe402a},
+    {0xbf8fdb78849a5f96, 0xde98520472bdd034},
+    {0xef73d256a5c0f77c, 0x963e66858f6d4441},
+    {0x95a8637627989aad, 0xdde7001379a44aa9},
+    {0xbb127c53b17ec159, 0x5560c018580d5d53},
+    {0xe9d71b689dde71af, 0xaab8f01e6e10b4a7},
+    {0x9226712162ab070d, 0xcab3961304ca70e9},
+    {0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d23},
+    {0xe45c10c42a2b3b05, 0x8cb89a7db77c506b},
+    {0x8eb98a7a9a5b04e3, 0x77f3608e92adb243},
+    {0xb267ed1940f1c61c, 0x55f038b237591ed4},
+    {0xdf01e85f912e37a3, 0x6b6c46dec52f6689},
+    {0x8b61313bbabce2c6, 0x2323ac4b3b3da016},
+    {0xae397d8aa96c1b77, 0xabec975e0a0d081b},
+    {0xd9c7dced53c72255, 0x96e7bd358c904a22},
+    {0x881cea14545c7575, 0x7e50d64177da2e55},
+    {0xaa242499697392d2, 0xdde50bd1d5d0b9ea},
+    {0xd4ad2dbfc3d07787, 0x955e4ec64b44e865},
+    {0x84ec3c97da624ab4, 0xbd5af13bef0b113f},
+    {0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58f},
+    {0xcfb11ead453994ba, 0x67de18eda5814af3},
+    {0x81ceb32c4b43fcf4, 0x80eacf948770ced8},
+    {0xa2425ff75e14fc31, 0xa1258379a94d028e},
+    {0xcad2f7f5359a3b3e, 0x096ee45813a04331},
+    {0xfd87b5f28300ca0d, 0x8bca9d6e188853fd},
+    {0x9e74d1b791e07e48, 0x775ea264cf55347e},
+    {0xc612062576589dda, 0x95364afe032a819e},
+    {0xf79687aed3eec551, 0x3a83ddbd83f52205},
+    {0x9abe14cd44753b52, 0xc4926a9672793543},
+    {0xc16d9a0095928a27, 0x75b7053c0f178294},
+    {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
+    {0x971da05074da7bee, 0xd3f6fc16ebca5e04},
+    {0xbce5086492111aea, 0x88f4bb1ca6bcf585},
+    {0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6},
+    {0x9392ee8e921d5d07, 0x3aff322e62439fd0},
+    {0xb877aa3236a4b449, 0x09befeb9fad487c3},
+    {0xe69594bec44de15b, 0x4c2ebe687989a9b4},
+    {0x901d7cf73ab0acd9, 0x0f9d37014bf60a11},
+    {0xb424dc35095cd80f, 0x538484c19ef38c95},
+    {0xe12e13424bb40e13, 0x2865a5f206b06fba},
+    {0x8cbccc096f5088cb, 0xf93f87b7442e45d4},
+    {0xafebff0bcb24aafe, 0xf78f69a51539d749},
+    {0xdbe6fecebdedd5be, 0xb573440e5a884d1c},
+    {0x89705f4136b4a597, 0x31680a88f8953031},
+    {0xabcc77118461cefc, 0xfdc20d2b36ba7c3e},
+    {0xd6bf94d5e57a42bc, 0x3d32907604691b4d},
+    {0x8637bd05af6c69b5, 0xa63f9a49c2c1b110},
+    {0xa7c5ac471b478423, 0x0fcf80dc33721d54},
+    {0xd1b71758e219652b, 0xd3c36113404ea4a9},
+    {0x83126e978d4fdf3b, 0x645a1cac083126ea},
+    {0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4},
+    {0xcccccccccccccccc, 0xcccccccccccccccd},
+    {0x8000000000000000, 0x0000000000000000},
+    {0xa000000000000000, 0x0000000000000000},
+    {0xc800000000000000, 0x0000000000000000},
+    {0xfa00000000000000, 0x0000000000000000},
+    {0x9c40000000000000, 0x0000000000000000},
+    {0xc350000000000000, 0x0000000000000000},
+    {0xf424000000000000, 0x0000000000000000},
+    {0x9896800000000000, 0x0000000000000000},
+    {0xbebc200000000000, 0x0000000000000000},
+    {0xee6b280000000000, 0x0000000000000000},
+    {0x9502f90000000000, 0x0000000000000000},
+    {0xba43b74000000000, 0x0000000000000000},
+    {0xe8d4a51000000000, 0x0000000000000000},
+    {0x9184e72a00000000, 0x0000000000000000},
+    {0xb5e620f480000000, 0x0000000000000000},
+    {0xe35fa931a0000000, 0x0000000000000000},
+    {0x8e1bc9bf04000000, 0x0000000000000000},
+    {0xb1a2bc2ec5000000, 0x0000000000000000},
+    {0xde0b6b3a76400000, 0x0000000000000000},
+    {0x8ac7230489e80000, 0x0000000000000000},
+    {0xad78ebc5ac620000, 0x0000000000000000},
+    {0xd8d726b7177a8000, 0x0000000000000000},
+    {0x878678326eac9000, 0x0000000000000000},
+    {0xa968163f0a57b400, 0x0000000000000000},
+    {0xd3c21bcecceda100, 0x0000000000000000},
+    {0x84595161401484a0, 0x0000000000000000},
+    {0xa56fa5b99019a5c8, 0x0000000000000000},
+    {0xcecb8f27f4200f3a, 0x0000000000000000},
+    {0x813f3978f8940984, 0x4000000000000000},
+    {0xa18f07d736b90be5, 0x5000000000000000},
+    {0xc9f2c9cd04674ede, 0xa400000000000000},
+    {0xfc6f7c4045812296, 0x4d00000000000000},
+    {0x9dc5ada82b70b59d, 0xf020000000000000},
+    {0xc5371912364ce305, 0x6c28000000000000},
+    {0xf684df56c3e01bc6, 0xc732000000000000},
+    {0x9a130b963a6c115c, 0x3c7f400000000000},
+    {0xc097ce7bc90715b3, 0x4b9f100000000000},
+    {0xf0bdc21abb48db20, 0x1e86d40000000000},
+    {0x96769950b50d88f4, 0x1314448000000000},
+    {0xbc143fa4e250eb31, 0x17d955a000000000},
+    {0xeb194f8e1ae525fd, 0x5dcfab0800000000},
+    {0x92efd1b8d0cf37be, 0x5aa1cae500000000},
+    {0xb7abc627050305ad, 0xf14a3d9e40000000},
+    {0xe596b7b0c643c719, 0x6d9ccd05d0000000},
+    {0x8f7e32ce7bea5c6f, 0xe4820023a2000000},
+    {0xb35dbf821ae4f38b, 0xdda2802c8a800000},
+    {0xe0352f62a19e306e, 0xd50b2037ad200000},
+    {0x8c213d9da502de45, 0x4526f422cc340000},
+    {0xaf298d050e4395d6, 0x9670b12b7f410000},
+    {0xdaf3f04651d47b4c, 0x3c0cdd765f114000},
+    {0x88d8762bf324cd0f, 0xa5880a69fb6ac800},
+    {0xab0e93b6efee0053, 0x8eea0d047a457a00},
+    {0xd5d238a4abe98068, 0x72a4904598d6d880},
+    {0x85a36366eb71f041, 0x47a6da2b7f864750},
+    {0xa70c3c40a64e6c51, 0x999090b65f67d924},
+    {0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d},
+    {0x82818f1281ed449f, 0xbff8f10e7a8921a4},
+    {0xa321f2d7226895c7, 0xaff72d52192b6a0d},
+    {0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490},
+    {0xfee50b7025c36a08, 0x02f236d04753d5b4},
+    {0x9f4f2726179a2245, 0x01d762422c946590},
+    {0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5},
+    {0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2},
+    {0x9b934c3b330c8577, 0x63cc55f49f88eb2f},
+    {0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb},
+    {0xf316271c7fc3908a, 0x8bef464e3945ef7a},
+    {0x97edd871cfda3a56, 0x97758bf0e3cbb5ac},
+    {0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317},
+    {0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd},
+    {0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a},
+    {0xb975d6b6ee39e436, 0xb3e2fd538e122b44},
+    {0xe7d34c64a9c85d44, 0x60dbbca87196b616},
+    {0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd},
+    {0xb51d13aea4a488dd, 0x6babab6398bdbe41},
+    {0xe264589a4dcdab14, 0xc696963c7eed2dd1},
+    {0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2},
+    {0xb0de65388cc8ada8, 0x3b25a55f43294bcb},
+    {0xdd15fe86affad912, 0x49ef0eb713f39ebe},
+    {0x8a2dbf142dfcc7ab, 0x6e3569326c784337},
+    {0xacb92ed9397bf996, 0x49c2c37f07965404},
+    {0xd7e77a8f87daf7fb, 0xdc33745ec97be906},
+    {0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3},
+    {0xa8acd7c0222311bc, 0xc40832ea0d68ce0c},
+    {0xd2d80db02aabd62b, 0xf50a3fa490c30190},
+    {0x83c7088e1aab65db, 0x792667c6da79e0fa},
+    {0xa4b8cab1a1563f52, 0x577001b891185938},
+    {0xcde6fd5e09abcf26, 0xed4c0226b55e6f86},
+    {0x80b05e5ac60b6178, 0x544f8158315b05b4},
+    {0xa0dc75f1778e39d6, 0x696361ae3db1c721},
+    {0xc913936dd571c84c, 0x03bc3a19cd1e38e9},
+    {0xfb5878494ace3a5f, 0x04ab48a04065c723},
+    {0x9d174b2dcec0e47b, 0x62eb0d64283f9c76},
+    {0xc45d1df942711d9a, 0x3ba5d0bd324f8394},
+    {0xf5746577930d6500, 0xca8f44ec7ee36479},
+    {0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb},
+    {0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e},
+    {0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e},
+    {0x95d04aee3b80ece5, 0xbba1f1d158724a12},
+    {0xbb445da9ca61281f, 0x2a8a6e45ae8edc97},
+    {0xea1575143cf97226, 0xf52d09d71a3293bd},
+    {0x924d692ca61be758, 0x593c2626705f9c56},
+    {0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c},
+    {0xe498f455c38b997a, 0x0b6dfb9c0f956447},
+    {0x8edf98b59a373fec, 0x4724bd4189bd5eac},
+    {0xb2977ee300c50fe7, 0x58edec91ec2cb657},
+    {0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed},
+    {0x8b865b215899f46c, 0xbd79e0d20082ee74},
+    {0xae67f1e9aec07187, 0xecd8590680a3aa11},
+    {0xda01ee641a708de9, 0xe80e6f4820cc9495},
+    {0x884134fe908658b2, 0x3109058d147fdcdd},
+    {0xaa51823e34a7eede, 0xbd4b46f0599fd415},
+    {0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a},
+    {0x850fadc09923329e, 0x03e2cf6bc604ddb0},
+    {0xa6539930bf6bff45, 0x84db8346b786151c},
+    {0xcfe87f7cef46ff16, 0xe612641865679a63},
+    {0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e},
+    {0xa26da3999aef7749, 0xe3be5e330f38f09d},
+    {0xcb090c8001ab551c, 0x5cadf5bfd3072cc5},
+    {0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6},
+    {0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa},
+    {0xc646d63501a1511d, 0xb281e1fd541501b8},
+    {0xf7d88bc24209a565, 0x1f225a7ca91a4226},
+    {0x9ae757596946075f, 0x3375788de9b06958},
+    {0xc1a12d2fc3978937, 0x0052d6b1641c83ae},
+    {0xf209787bb47d6b84, 0xc0678c5dbd23a49a},
+    {0x9745eb4d50ce6332, 0xf840b7ba963646e0},
+    {0xbd176620a501fbff, 0xb650e5a93bc3d898},
+    {0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe},
+    {0x93ba47c980e98cdf, 0xc66f336c36b10137},
+    {0xb8a8d9bbe123f017, 0xb80b0047445d4184},
+    {0xe6d3102ad96cec1d, 0xa60dc059157491e5},
+    {0x9043ea1ac7e41392, 0x87c89837ad68db2f},
+    {0xb454e4a179dd1877, 0x29babe4598c311fb},
+    {0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a},
+    {0x8ce2529e2734bb1d, 0x1899e4a65f58660c},
+    {0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f},
+    {0xdc21a1171d42645d, 0x76707543f4fa1f73},
+    {0x899504ae72497eba, 0x6a06494a791c53a8},
+    {0xabfa45da0edbde69, 0x0487db9d17636892},
+    {0xd6f8d7509292d603, 0x45a9d2845d3c42b6},
+    {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b2},
+    {0xa7f26836f282b732, 0x8e6cac7768d7141e},
+    {0xd1ef0244af2364ff, 0x3207d795430cd926},
+    {0x8335616aed761f1f, 0x7f44e6bd49e807b8},
+    {0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6},
+    {0xcd036837130890a1, 0x36dba887c37a8c0f},
+    {0x802221226be55a64, 0xc2494954da2c9789},
+    {0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c},
+    {0xc83553c5c8965d3d, 0x6f92829494e5acc7},
+    {0xfa42a8b73abbf48c, 0xcb772339ba1f17f9},
+    {0x9c69a97284b578d7, 0xff2a760414536efb},
+    {0xc38413cf25e2d70d, 0xfef5138519684aba},
+    {0xf46518c2ef5b8cd1, 0x7eb258665fc25d69},
+    {0x98bf2f79d5993802, 0xef2f773ffbd97a61},
+    {0xbeeefb584aff8603, 0xaafb550ffacfd8fa},
+    {0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38},
+    {0x952ab45cfa97a0b2, 0xdd945a747bf26183},
+    {0xba756174393d88df, 0x94f971119aeef9e4},
+    {0xe912b9d1478ceb17, 0x7a37cd5601aab85d},
+    {0x91abb422ccb812ee, 0xac62e055c10ab33a},
+    {0xb616a12b7fe617aa, 0x577b986b314d6009},
+    {0xe39c49765fdf9d94, 0xed5a7e85fda0b80b},
+    {0x8e41ade9fbebc27d, 0x14588f13be847307},
+    {0xb1d219647ae6b31c, 0x596eb2d8ae258fc8},
+    {0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb},
+    {0x8aec23d680043bee, 0x25de7bb9480d5854},
+    {0xada72ccc20054ae9, 0xaf561aa79a10ae6a},
+    {0xd910f7ff28069da4, 0x1b2ba1518094da04},
+    {0x87aa9aff79042286, 0x90fb44d2f05d0842},
+    {0xa99541bf57452b28, 0x353a1607ac744a53},
+    {0xd3fa922f2d1675f2, 0x42889b8997915ce8},
+    {0x847c9b5d7c2e09b7, 0x69956135febada11},
+    {0xa59bc234db398c25, 0x43fab9837e699095},
+    {0xcf02b2c21207ef2e, 0x94f967e45e03f4bb},
+    {0x8161afb94b44f57d, 0x1d1be0eebac278f5},
+    {0xa1ba1ba79e1632dc, 0x6462d92a69731732},
+    {0xca28a291859bbf93, 0x7d7b8f7503cfdcfe},
+    {0xfcb2cb35e702af78, 0x5cda735244c3d43e},
+    {0x9defbf01b061adab, 0x3a0888136afa64a7},
+    {0xc56baec21c7a1916, 0x088aaa1845b8fdd0},
+    {0xf6c69a72a3989f5b, 0x8aad549e57273d45},
+    {0x9a3c2087a63f6399, 0x36ac54e2f678864b},
+    {0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd},
+    {0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5},
+    {0x969eb7c47859e743, 0x9f644ae5a4b1b325},
+    {0xbc4665b596706114, 0x873d5d9f0dde1fee},
+    {0xeb57ff22fc0c7959, 0xa90cb506d155a7ea},
+    {0x9316ff75dd87cbd8, 0x09a7f12442d588f2},
+    {0xb7dcbf5354e9bece, 0x0c11ed6d538aeb2f},
+    {0xe5d3ef282a242e81, 0x8f1668c8a86da5fa},
+    {0x8fa475791a569d10, 0xf96e017d694487bc},
+    {0xb38d92d760ec4455, 0x37c981dcc395a9ac},
+    {0xe070f78d3927556a, 0x85bbe253f47b1417},
+    {0x8c469ab843b89562, 0x93956d7478ccec8e},
+    {0xaf58416654a6babb, 0x387ac8d1970027b2},
+    {0xdb2e51bfe9d0696a, 0x06997b05fcc0319e},
+    {0x88fcf317f22241e2, 0x441fece3bdf81f03},
+    {0xab3c2fddeeaad25a, 0xd527e81cad7626c3},
+    {0xd60b3bd56a5586f1, 0x8a71e223d8d3b074},
+    {0x85c7056562757456, 0xf6872d5667844e49},
+    {0xa738c6bebb12d16c, 0xb428f8ac016561db},
+    {0xd106f86e69d785c7, 0xe13336d701beba52},
+    {0x82a45b450226b39c, 0xecc0024661173473},
+    {0xa34d721642b06084, 0x27f002d7f95d0190},
+    {0xcc20ce9bd35c78a5, 0x31ec038df7b441f4},
+    {0xff290242c83396ce, 0x7e67047175a15271},
+    {0x9f79a169bd203e41, 0x0f0062c6e984d386},
+    {0xc75809c42c684dd1, 0x52c07b78a3e60868},
+    {0xf92e0c3537826145, 0xa7709a56ccdf8a82},
+    {0x9bbcc7a142b17ccb, 0x88a66076400bb691},
+    {0xc2abf989935ddbfe, 0x6acff893d00ea435},
+    {0xf356f7ebf83552fe, 0x0583f6b8c4124d43},
+    {0x98165af37b2153de, 0xc3727a337a8b704a},
+    {0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c},
+    {0xeda2ee1c7064130c, 0x1162def06f79df73},
+    {0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8},
+    {0xb9a74a0637ce2ee1, 0x6d953e2bd7173692},
+    {0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437},
+    {0x910ab1d4db9914a0, 0x1d9c9892400a22a2},
+    {0xb54d5e4a127f59c8, 0x2503beb6d00cab4b},
+    {0xe2a0b5dc971f303a, 0x2e44ae64840fd61d},
+    {0x8da471a9de737e24, 0x5ceaecfed289e5d2},
+    {0xb10d8e1456105dad, 0x7425a83e872c5f47},
+    {0xdd50f1996b947518, 0xd12f124e28f77719},
+    {0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f},
+    {0xace73cbfdc0bfb7b, 0x636cc64d1001550b},
+    {0xd8210befd30efa5a, 0x3c47f7e05401aa4e},
+    {0x8714a775e3e95c78, 0x65acfaec34810a71},
+    {0xa8d9d1535ce3b396, 0x7f1839a741a14d0d},
+    {0xd31045a8341ca07c, 0x1ede48111209a050},
+    {0x83ea2b892091e44d, 0x934aed0aab460432},
+    {0xa4e4b66b68b65d60, 0xf81da84d5617853f},
+    {0xce1de40642e3f4b9, 0x36251260ab9d668e},
+    {0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019},
+    {0xa1075a24e4421730, 0xb24cf65b8612f81f},
+    {0xc94930ae1d529cfc, 0xdee033f26797b627},
+    {0xfb9b7cd9a4a7443c, 0x169840ef017da3b1},
+    {0x9d412e0806e88aa5, 0x8e1f289560ee864e},
+    {0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2},
+    {0xf5b5d7ec8acb58a2, 0xae10af696774b1db},
+    {0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29},
+    {0xbff610b0cc6edd3f, 0x17fd090a58d32af3},
+    {0xeff394dcff8a948e, 0xddfc4b4cef07f5b0},
+    {0x95f83d0a1fb69cd9, 0x4abdaf101564f98e},
+    {0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1},
+    {0xea53df5fd18d5513, 0x84c86189216dc5ed},
+    {0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4},
+    {0xb7118682dbb66a77, 0x3fbc8c33221dc2a1},
+    {0xe4d5e82392a40515, 0x0fabaf3feaa5334a},
+    {0x8f05b1163ba6832d, 0x29cb4d87f2a7400e},
+    {0xb2c71d5bca9023f8, 0x743e20e9ef511012},
+    {0xdf78e4b2bd342cf6, 0x914da9246b255416},
+    {0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e},
+    {0xae9672aba3d0c320, 0xa184ac2473b529b1},
+    {0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e},
+    {0x8865899617fb1871, 0x7e2fa67c7a658892},
+    {0xaa7eebfb9df9de8d, 0xddbb901b98feeab7},
+    {0xd51ea6fa85785631, 0x552a74227f3ea565},
+    {0x8533285c936b35de, 0xd53a88958f87275f},
+    {0xa67ff273b8460356, 0x8a892abaf368f137},
+    {0xd01fef10a657842c, 0x2d2b7569b0432d85},
+    {0x8213f56a67f6b29b, 0x9c3b29620e29fc73},
+    {0xa298f2c501f45f42, 0x8349f3ba91b47b8f},
+    {0xcb3f2f7642717713, 0x241c70a936219a73},
+    {0xfe0efb53d30dd4d7, 0xed238cd383aa0110},
+    {0x9ec95d1463e8a506, 0xf4363804324a40aa},
+    {0xc67bb4597ce2ce48, 0xb143c6053edcd0d5},
+    {0xf81aa16fdc1b81da, 0xdd94b7868e94050a},
+    {0x9b10a4e5e9913128, 0xca7cf2b4191c8326},
+    {0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0},
+    {0xf24a01a73cf2dccf, 0xbc633b39673c8cec},
+    {0x976e41088617ca01, 0xd5be0503e085d813},
+    {0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18},
+    {0xec9c459d51852ba2, 0xddf8e7d60ed1219e},
+    {0x93e1ab8252f33b45, 0xcabb90e5c942b503},
+    {0xb8da1662e7b00a17, 0x3d6a751f3b936243},
+    {0xe7109bfba19c0c9d, 0x0cc512670a783ad4},
+    {0x906a617d450187e2, 0x27fb2b80668b24c5},
+    {0xb484f9dc9641e9da, 0xb1f9f660802dedf6},
+    {0xe1a63853bbd26451, 0x5e7873f8a0396973},
+    {0x8d07e33455637eb2, 0xdb0b487b6423e1e8},
+    {0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62},
+    {0xdc5c5301c56b75f7, 0x7641a140cc7810fb},
+    {0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d},
+    {0xac2820d9623bf429, 0x546345fa9fbdcd44},
+    {0xd732290fbacaf133, 0xa97c177947ad4095},
+    {0x867f59a9d4bed6c0, 0x49ed8eabcccc485d},
+    {0xa81f301449ee8c70, 0x5c68f256bfff5a74},
+    {0xd226fc195c6a2f8c, 0x73832eec6fff3111},
+    {0x83585d8fd9c25db7, 0xc831fd53c5ff7eab},
+    {0xa42e74f3d032f525, 0xba3e7ca8b77f5e55},
+    {0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb},
+    {0x80444b5e7aa7cf85, 0x7980d163cf5b81b3},
+    {0xa0555e361951c366, 0xd7e105bcc332621f},
+    {0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7},
+    {0xfa856334878fc150, 0xb14f98f6f0feb951},
+    {0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3},
+    {0xc3b8358109e84f07, 0x0a862f80ec4700c8},
+    {0xf4a642e14c6262c8, 0xcd27bb612758c0fa},
+    {0x98e7e9cccfbd7dbd, 0x8038d51cb897789c},
+    {0xbf21e44003acdd2c, 0xe0470a63e6bd56c3},
+    {0xeeea5d5004981478, 0x1858ccfce06cac74},
+    {0x95527a5202df0ccb, 0x0f37801e0c43ebc8},
+    {0xbaa718e68396cffd, 0xd30560258f54e6ba},
+    {0xe950df20247c83fd, 0x47c6b82ef32a2069},
+    {0x91d28b7416cdd27e, 0x4cdc331d57fa5441},
+    {0xb6472e511c81471d, 0xe0133fe4adf8e952},
+    {0xe3d8f9e563a198e5, 0x58180fddd97723a6},
+    {0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648},
+    {0xb201833b35d63f73, 0x2cd2cc6551e513da},
+    {0xde81e40a034bcf4f, 0xf8077f7ea65e58d1},
+    {0x8b112e86420f6191, 0xfb04afaf27faf782},
+    {0xadd57a27d29339f6, 0x79c5db9af1f9b563},
+    {0xd94ad8b1c7380874, 0x18375281ae7822bc},
+    {0x87cec76f1c830548, 0x8f2293910d0b15b5},
+    {0xa9c2794ae3a3c69a, 0xb2eb3875504ddb22},
+    {0xd433179d9c8cb841, 0x5fa60692a46151eb},
+    {0x849feec281d7f328, 0xdbc7c41ba6bcd333},
+    {0xa5c7ea73224deff3, 0x12b9b522906c0800},
+    {0xcf39e50feae16bef, 0xd768226b34870a00},
+    {0x81842f29f2cce375, 0xe6a1158300d46640},
+    {0xa1e53af46f801c53, 0x60495ae3c1097fd0},
+    {0xca5e89b18b602368, 0x385bb19cb14bdfc4},
+    {0xfcf62c1dee382c42, 0x46729e03dd9ed7b5},
+    {0x9e19db92b4e31ba9, 0x6c07a2c26a8346d1},
+    {0xc5a05277621be293, 0xc7098b7305241885},
+    {0xf70867153aa2db38, 0xb8cbee4fc66d1ea7}
+#else
+    {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
+    {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
+    {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
+    {0x86a8d39ef77164bc, 0xae5dff9c02033198},
+    {0xd98ddaee19068c76, 0x3badd624dd9b0958},
+    {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
+    {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
+    {0xe55990879ddcaabd, 0xcc420a6a101d0516},
+    {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
+    {0x95a8637627989aad, 0xdde7001379a44aa9},
+    {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
+    {0xc350000000000000, 0x0000000000000000},
+    {0x9dc5ada82b70b59d, 0xf020000000000000},
+    {0xfee50b7025c36a08, 0x02f236d04753d5b4},
+    {0xcde6fd5e09abcf26, 0xed4c0226b55e6f86},
+    {0xa6539930bf6bff45, 0x84db8346b786151c},
+    {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b2},
+    {0xd910f7ff28069da4, 0x1b2ba1518094da04},
+    {0xaf58416654a6babb, 0x387ac8d1970027b2},
+    {0x8da471a9de737e24, 0x5ceaecfed289e5d2},
+    {0xe4d5e82392a40515, 0x0fabaf3feaa5334a},
+    {0xb8da1662e7b00a17, 0x3d6a751f3b936243},
+    {0x95527a5202df0ccb, 0x0f37801e0c43ebc8}
+#endif
+};
+
+#if !FMT_USE_FULL_CACHE_DRAGONBOX
+template <typename T>
+const uint64_t basic_data<T>::powers_of_5_64[] = {
+    0x0000000000000001, 0x0000000000000005, 0x0000000000000019,
+    0x000000000000007d, 0x0000000000000271, 0x0000000000000c35,
+    0x0000000000003d09, 0x000000000001312d, 0x000000000005f5e1,
+    0x00000000001dcd65, 0x00000000009502f9, 0x0000000002e90edd,
+    0x000000000e8d4a51, 0x0000000048c27395, 0x000000016bcc41e9,
+    0x000000071afd498d, 0x0000002386f26fc1, 0x000000b1a2bc2ec5,
+    0x000003782dace9d9, 0x00001158e460913d, 0x000056bc75e2d631,
+    0x0001b1ae4d6e2ef5, 0x000878678326eac9, 0x002a5a058fc295ed,
+    0x00d3c21bcecceda1, 0x0422ca8b0a00a425, 0x14adf4b7320334b9};
+
+template <typename T>
+const uint32_t basic_data<T>::dragonbox_pow10_recovery_errors[] = {
+    0x50001400, 0x54044100, 0x54014555, 0x55954415, 0x54115555, 0x00000001,
+    0x50000000, 0x00104000, 0x54010004, 0x05004001, 0x55555544, 0x41545555,
+    0x54040551, 0x15445545, 0x51555514, 0x10000015, 0x00101100, 0x01100015,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x04450514, 0x45414110,
+    0x55555145, 0x50544050, 0x15040155, 0x11054140, 0x50111514, 0x11451454,
+    0x00400541, 0x00000000, 0x55555450, 0x10056551, 0x10054011, 0x55551014,
+    0x69514555, 0x05151109, 0x00155555};
+#endif
+
+template <typename T>
+const char basic_data<T>::foreground_color[] = "\x1b[38;2;";
+template <typename T>
+const char basic_data<T>::background_color[] = "\x1b[48;2;";
+template <typename T> const char basic_data<T>::reset_color[] = "\x1b[0m";
+template <typename T> const wchar_t basic_data<T>::wreset_color[] = L"\x1b[0m";
+template <typename T> const char basic_data<T>::signs[] = {0, '-', '+', ' '};
+template <typename T>
+const char basic_data<T>::left_padding_shifts[] = {31, 31, 0, 1, 0};
+template <typename T>
+const char basic_data<T>::right_padding_shifts[] = {0, 31, 0, 1, 0};
+
+template <typename T> struct bits {
+  static FMT_CONSTEXPR_DECL const int value =
+      static_cast<int>(sizeof(T) * std::numeric_limits<unsigned char>::digits);
+};
+
+class fp;
+template <int SHIFT = 0> fp normalize(fp value);
+
+// Lower (upper) boundary is a value half way between a floating-point value
+// and its predecessor (successor). Boundaries have the same exponent as the
+// value so only significands are stored.
+struct boundaries {
+  uint64_t lower;
+  uint64_t upper;
+};
+
+// A handmade floating-point number f * pow(2, e).
+class fp {
+ private:
+  using significand_type = uint64_t;
+
+  template <typename Float>
+  using is_supported_float = bool_constant<sizeof(Float) == sizeof(uint64_t) ||
+                                           sizeof(Float) == sizeof(uint32_t)>;
+
+ public:
+  significand_type f;
+  int e;
+
+  // All sizes are in bits.
+  // Subtract 1 to account for an implicit most significant bit in the
+  // normalized form.
+  static FMT_CONSTEXPR_DECL const int double_significand_size =
+      std::numeric_limits<double>::digits - 1;
+  static FMT_CONSTEXPR_DECL const uint64_t implicit_bit =
+      1ULL << double_significand_size;
+  static FMT_CONSTEXPR_DECL const int significand_size =
+      bits<significand_type>::value;
+
+  fp() : f(0), e(0) {}
+  fp(uint64_t f_val, int e_val) : f(f_val), e(e_val) {}
+
+  // Constructs fp from an IEEE754 double. It is a template to prevent compile
+  // errors on platforms where double is not IEEE754.
+  template <typename Double> explicit fp(Double d) { assign(d); }
+
+  // Assigns d to this and return true iff predecessor is closer than successor.
+  template <typename Float, FMT_ENABLE_IF(is_supported_float<Float>::value)>
+  bool assign(Float d) {
+    // Assume float is in the format [sign][exponent][significand].
+    using limits = std::numeric_limits<Float>;
+    const int float_significand_size = limits::digits - 1;
+    const int exponent_size =
+        bits<Float>::value - float_significand_size - 1;  // -1 for sign
+    const uint64_t float_implicit_bit = 1ULL << float_significand_size;
+    const uint64_t significand_mask = float_implicit_bit - 1;
+    const uint64_t exponent_mask = (~0ULL >> 1) & ~significand_mask;
+    const int exponent_bias = (1 << exponent_size) - limits::max_exponent - 1;
+    constexpr bool is_double = sizeof(Float) == sizeof(uint64_t);
+    auto u = bit_cast<conditional_t<is_double, uint64_t, uint32_t>>(d);
+    f = u & significand_mask;
+    int biased_e =
+        static_cast<int>((u & exponent_mask) >> float_significand_size);
+    // Predecessor is closer if d is a normalized power of 2 (f == 0) other than
+    // the smallest normalized number (biased_e > 1).
+    bool is_predecessor_closer = f == 0 && biased_e > 1;
+    if (biased_e != 0)
+      f += float_implicit_bit;
+    else
+      biased_e = 1;  // Subnormals use biased exponent 1 (min exponent).
+    e = biased_e - exponent_bias - float_significand_size;
+    return is_predecessor_closer;
+  }
+
+  template <typename Float, FMT_ENABLE_IF(!is_supported_float<Float>::value)>
+  bool assign(Float) {
+    *this = fp();
+    return false;
+  }
+};
+
+// Normalizes the value converted from double and multiplied by (1 << SHIFT).
+template <int SHIFT> fp normalize(fp value) {
+  // Handle subnormals.
+  const auto shifted_implicit_bit = fp::implicit_bit << SHIFT;
+  while ((value.f & shifted_implicit_bit) == 0) {
+    value.f <<= 1;
+    --value.e;
+  }
+  // Subtract 1 to account for hidden bit.
+  const auto offset =
+      fp::significand_size - fp::double_significand_size - SHIFT - 1;
+  value.f <<= offset;
+  value.e -= offset;
+  return value;
+}
+
+inline bool operator==(fp x, fp y) { return x.f == y.f && x.e == y.e; }
+
+// Computes lhs * rhs / pow(2, 64) rounded to nearest with half-up tie breaking.
+inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
+#if FMT_USE_INT128
+  auto product = static_cast<__uint128_t>(lhs) * rhs;
+  auto f = static_cast<uint64_t>(product >> 64);
+  return (static_cast<uint64_t>(product) & (1ULL << 63)) != 0 ? f + 1 : f;
+#else
+  // Multiply 32-bit parts of significands.
+  uint64_t mask = (1ULL << 32) - 1;
+  uint64_t a = lhs >> 32, b = lhs & mask;
+  uint64_t c = rhs >> 32, d = rhs & mask;
+  uint64_t ac = a * c, bc = b * c, ad = a * d, bd = b * d;
+  // Compute mid 64-bit of result and round.
+  uint64_t mid = (bd >> 32) + (ad & mask) + (bc & mask) + (1U << 31);
+  return ac + (ad >> 32) + (bc >> 32) + (mid >> 32);
+#endif
+}
+
+inline fp operator*(fp x, fp y) { return {multiply(x.f, y.f), x.e + y.e + 64}; }
+
+// Returns a cached power of 10 `c_k = c_k.f * pow(2, c_k.e)` such that its
+// (binary) exponent satisfies `min_exponent <= c_k.e <= min_exponent + 28`.
+inline fp get_cached_power(int min_exponent, int& pow10_exponent) {
+  const int shift = 32;
+  const auto significand = static_cast<int64_t>(data::log10_2_significand);
+  int index = static_cast<int>(
+      ((min_exponent + fp::significand_size - 1) * (significand >> shift) +
+       ((int64_t(1) << shift) - 1))  // ceil
+      >> 32                          // arithmetic shift
+  );
+  // Decimal exponent of the first (smallest) cached power of 10.
+  const int first_dec_exp = -348;
+  // Difference between 2 consecutive decimal exponents in cached powers of 10.
+  const int dec_exp_step = 8;
+  index = (index - first_dec_exp - 1) / dec_exp_step + 1;
+  pow10_exponent = first_dec_exp + index * dec_exp_step;
+  return {data::grisu_pow10_significands[index],
+          data::grisu_pow10_exponents[index]};
+}
+
+// A simple accumulator to hold the sums of terms in bigint::square if uint128_t
+// is not available.
+struct accumulator {
+  uint64_t lower;
+  uint64_t upper;
+
+  accumulator() : lower(0), upper(0) {}
+  explicit operator uint32_t() const { return static_cast<uint32_t>(lower); }
+
+  void operator+=(uint64_t n) {
+    lower += n;
+    if (lower < n) ++upper;
+  }
+  void operator>>=(int shift) {
+    assert(shift == 32);
+    (void)shift;
+    lower = (upper << 32) | (lower >> 32);
+    upper >>= 32;
+  }
+};
+
+class bigint {
+ private:
+  // A bigint is stored as an array of bigits (big digits), with bigit at index
+  // 0 being the least significant one.
+  using bigit = uint32_t;
+  using double_bigit = uint64_t;
+  enum { bigits_capacity = 32 };
+  basic_memory_buffer<bigit, bigits_capacity> bigits_;
+  int exp_;
+
+  bigit operator[](int index) const { return bigits_[to_unsigned(index)]; }
+  bigit& operator[](int index) { return bigits_[to_unsigned(index)]; }
+
+  static FMT_CONSTEXPR_DECL const int bigit_bits = bits<bigit>::value;
+
+  friend struct formatter<bigint>;
+
+  void subtract_bigits(int index, bigit other, bigit& borrow) {
+    auto result = static_cast<double_bigit>((*this)[index]) - other - borrow;
+    (*this)[index] = static_cast<bigit>(result);
+    borrow = static_cast<bigit>(result >> (bigit_bits * 2 - 1));
+  }
+
+  void remove_leading_zeros() {
+    int num_bigits = static_cast<int>(bigits_.size()) - 1;
+    while (num_bigits > 0 && (*this)[num_bigits] == 0) --num_bigits;
+    bigits_.resize(to_unsigned(num_bigits + 1));
+  }
+
+  // Computes *this -= other assuming aligned bigints and *this >= other.
+  void subtract_aligned(const bigint& other) {
+    FMT_ASSERT(other.exp_ >= exp_, "unaligned bigints");
+    FMT_ASSERT(compare(*this, other) >= 0, "");
+    bigit borrow = 0;
+    int i = other.exp_ - exp_;
+    for (size_t j = 0, n = other.bigits_.size(); j != n; ++i, ++j)
+      subtract_bigits(i, other.bigits_[j], borrow);
+    while (borrow > 0) subtract_bigits(i, 0, borrow);
+    remove_leading_zeros();
+  }
+
+  void multiply(uint32_t value) {
+    const double_bigit wide_value = value;
+    bigit carry = 0;
+    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
+      double_bigit result = bigits_[i] * wide_value + carry;
+      bigits_[i] = static_cast<bigit>(result);
+      carry = static_cast<bigit>(result >> bigit_bits);
+    }
+    if (carry != 0) bigits_.push_back(carry);
+  }
+
+  void multiply(uint64_t value) {
+    const bigit mask = ~bigit(0);
+    const double_bigit lower = value & mask;
+    const double_bigit upper = value >> bigit_bits;
+    double_bigit carry = 0;
+    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
+      double_bigit result = bigits_[i] * lower + (carry & mask);
+      carry =
+          bigits_[i] * upper + (result >> bigit_bits) + (carry >> bigit_bits);
+      bigits_[i] = static_cast<bigit>(result);
+    }
+    while (carry != 0) {
+      bigits_.push_back(carry & mask);
+      carry >>= bigit_bits;
+    }
+  }
+
+ public:
+  bigint() : exp_(0) {}
+  explicit bigint(uint64_t n) { assign(n); }
+  ~bigint() { assert(bigits_.capacity() <= bigits_capacity); }
+
+  bigint(const bigint&) = delete;
+  void operator=(const bigint&) = delete;
+
+  void assign(const bigint& other) {
+    auto size = other.bigits_.size();
+    bigits_.resize(size);
+    auto data = other.bigits_.data();
+    std::copy(data, data + size, make_checked(bigits_.data(), size));
+    exp_ = other.exp_;
+  }
+
+  void assign(uint64_t n) {
+    size_t num_bigits = 0;
+    do {
+      bigits_[num_bigits++] = n & ~bigit(0);
+      n >>= bigit_bits;
+    } while (n != 0);
+    bigits_.resize(num_bigits);
+    exp_ = 0;
+  }
+
+  int num_bigits() const { return static_cast<int>(bigits_.size()) + exp_; }
+
+  FMT_NOINLINE bigint& operator<<=(int shift) {
+    assert(shift >= 0);
+    exp_ += shift / bigit_bits;
+    shift %= bigit_bits;
+    if (shift == 0) return *this;
+    bigit carry = 0;
+    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
+      bigit c = bigits_[i] >> (bigit_bits - shift);
+      bigits_[i] = (bigits_[i] << shift) + carry;
+      carry = c;
+    }
+    if (carry != 0) bigits_.push_back(carry);
+    return *this;
+  }
+
+  template <typename Int> bigint& operator*=(Int value) {
+    FMT_ASSERT(value > 0, "");
+    multiply(uint32_or_64_or_128_t<Int>(value));
+    return *this;
+  }
+
+  friend int compare(const bigint& lhs, const bigint& rhs) {
+    int num_lhs_bigits = lhs.num_bigits(), num_rhs_bigits = rhs.num_bigits();
+    if (num_lhs_bigits != num_rhs_bigits)
+      return num_lhs_bigits > num_rhs_bigits ? 1 : -1;
+    int i = static_cast<int>(lhs.bigits_.size()) - 1;
+    int j = static_cast<int>(rhs.bigits_.size()) - 1;
+    int end = i - j;
+    if (end < 0) end = 0;
+    for (; i >= end; --i, --j) {
+      bigit lhs_bigit = lhs[i], rhs_bigit = rhs[j];
+      if (lhs_bigit != rhs_bigit) return lhs_bigit > rhs_bigit ? 1 : -1;
+    }
+    if (i != j) return i > j ? 1 : -1;
+    return 0;
+  }
+
+  // Returns compare(lhs1 + lhs2, rhs).
+  friend int add_compare(const bigint& lhs1, const bigint& lhs2,
+                         const bigint& rhs) {
+    int max_lhs_bigits = (std::max)(lhs1.num_bigits(), lhs2.num_bigits());
+    int num_rhs_bigits = rhs.num_bigits();
+    if (max_lhs_bigits + 1 < num_rhs_bigits) return -1;
+    if (max_lhs_bigits > num_rhs_bigits) return 1;
+    auto get_bigit = [](const bigint& n, int i) -> bigit {
+      return i >= n.exp_ && i < n.num_bigits() ? n[i - n.exp_] : 0;
+    };
+    double_bigit borrow = 0;
+    int min_exp = (std::min)((std::min)(lhs1.exp_, lhs2.exp_), rhs.exp_);
+    for (int i = num_rhs_bigits - 1; i >= min_exp; --i) {
+      double_bigit sum =
+          static_cast<double_bigit>(get_bigit(lhs1, i)) + get_bigit(lhs2, i);
+      bigit rhs_bigit = get_bigit(rhs, i);
+      if (sum > rhs_bigit + borrow) return 1;
+      borrow = rhs_bigit + borrow - sum;
+      if (borrow > 1) return -1;
+      borrow <<= bigit_bits;
+    }
+    return borrow != 0 ? -1 : 0;
+  }
+
+  // Assigns pow(10, exp) to this bigint.
+  void assign_pow10(int exp) {
+    assert(exp >= 0);
+    if (exp == 0) return assign(1);
+    // Find the top bit.
+    int bitmask = 1;
+    while (exp >= bitmask) bitmask <<= 1;
+    bitmask >>= 1;
+    // pow(10, exp) = pow(5, exp) * pow(2, exp). First compute pow(5, exp) by
+    // repeated squaring and multiplication.
+    assign(5);
+    bitmask >>= 1;
+    while (bitmask != 0) {
+      square();
+      if ((exp & bitmask) != 0) *this *= 5;
+      bitmask >>= 1;
+    }
+    *this <<= exp;  // Multiply by pow(2, exp) by shifting.
+  }
+
+  void square() {
+    basic_memory_buffer<bigit, bigits_capacity> n(std::move(bigits_));
+    int num_bigits = static_cast<int>(bigits_.size());
+    int num_result_bigits = 2 * num_bigits;
+    bigits_.resize(to_unsigned(num_result_bigits));
+    using accumulator_t = conditional_t<FMT_USE_INT128, uint128_t, accumulator>;
+    auto sum = accumulator_t();
+    for (int bigit_index = 0; bigit_index < num_bigits; ++bigit_index) {
+      // Compute bigit at position bigit_index of the result by adding
+      // cross-product terms n[i] * n[j] such that i + j == bigit_index.
+      for (int i = 0, j = bigit_index; j >= 0; ++i, --j) {
+        // Most terms are multiplied twice which can be optimized in the future.
+        sum += static_cast<double_bigit>(n[i]) * n[j];
+      }
+      (*this)[bigit_index] = static_cast<bigit>(sum);
+      sum >>= bits<bigit>::value;  // Compute the carry.
+    }
+    // Do the same for the top half.
+    for (int bigit_index = num_bigits; bigit_index < num_result_bigits;
+         ++bigit_index) {
+      for (int j = num_bigits - 1, i = bigit_index - j; i < num_bigits;)
+        sum += static_cast<double_bigit>(n[i++]) * n[j--];
+      (*this)[bigit_index] = static_cast<bigit>(sum);
+      sum >>= bits<bigit>::value;
+    }
+    --num_result_bigits;
+    remove_leading_zeros();
+    exp_ *= 2;
+  }
+
+  // If this bigint has a bigger exponent than other, adds trailing zero to make
+  // exponents equal. This simplifies some operations such as subtraction.
+  void align(const bigint& other) {
+    int exp_difference = exp_ - other.exp_;
+    if (exp_difference <= 0) return;
+    int num_bigits = static_cast<int>(bigits_.size());
+    bigits_.resize(to_unsigned(num_bigits + exp_difference));
+    for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j)
+      bigits_[j] = bigits_[i];
+    std::uninitialized_fill_n(bigits_.data(), exp_difference, 0);
+    exp_ -= exp_difference;
+  }
+
+  // Divides this bignum by divisor, assigning the remainder to this and
+  // returning the quotient.
+  int divmod_assign(const bigint& divisor) {
+    FMT_ASSERT(this != &divisor, "");
+    if (compare(*this, divisor) < 0) return 0;
+    FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, "");
+    align(divisor);
+    int quotient = 0;
+    do {
+      subtract_aligned(divisor);
+      ++quotient;
+    } while (compare(*this, divisor) >= 0);
+    return quotient;
+  }
+};
+
+enum class round_direction { unknown, up, down };
+
+// Given the divisor (normally a power of 10), the remainder = v % divisor for
+// some number v and the error, returns whether v should be rounded up, down, or
+// whether the rounding direction can't be determined due to error.
+// error should be less than divisor / 2.
+inline round_direction get_round_direction(uint64_t divisor, uint64_t remainder,
+                                           uint64_t error) {
+  FMT_ASSERT(remainder < divisor, "");  // divisor - remainder won't overflow.
+  FMT_ASSERT(error < divisor, "");      // divisor - error won't overflow.
+  FMT_ASSERT(error < divisor - error, "");  // error * 2 won't overflow.
+  // Round down if (remainder + error) * 2 <= divisor.
+  if (remainder <= divisor - remainder && error * 2 <= divisor - remainder * 2)
+    return round_direction::down;
+  // Round up if (remainder - error) * 2 >= divisor.
+  if (remainder >= error &&
+      remainder - error >= divisor - (remainder - error)) {
+    return round_direction::up;
+  }
+  return round_direction::unknown;
+}
+
+namespace digits {
+enum result {
+  more,  // Generate more digits.
+  done,  // Done generating digits.
+  error  // Digit generation cancelled due to an error.
+};
+}
+
+// Generates output using the Grisu digit-gen algorithm.
+// error: the size of the region (lower, upper) outside of which numbers
+// definitely do not round to value (Delta in Grisu3).
+template <typename Handler>
+FMT_ALWAYS_INLINE digits::result grisu_gen_digits(fp value, uint64_t error,
+                                                  int& exp, Handler& handler) {
+  const fp one(1ULL << -value.e, value.e);
+  // The integral part of scaled value (p1 in Grisu) = value / one. It cannot be
+  // zero because it contains a product of two 64-bit numbers with MSB set (due
+  // to normalization) - 1, shifted right by at most 60 bits.
+  auto integral = static_cast<uint32_t>(value.f >> -one.e);
+  FMT_ASSERT(integral != 0, "");
+  FMT_ASSERT(integral == value.f >> -one.e, "");
+  // The fractional part of scaled value (p2 in Grisu) c = value % one.
+  uint64_t fractional = value.f & (one.f - 1);
+  exp = count_digits(integral);  // kappa in Grisu.
+  // Divide by 10 to prevent overflow.
+  auto result = handler.on_start(data::powers_of_10_64[exp - 1] << -one.e,
+                                 value.f / 10, error * 10, exp);
+  if (result != digits::more) return result;
+  // Generate digits for the integral part. This can produce up to 10 digits.
+  do {
+    uint32_t digit = 0;
+    auto divmod_integral = [&](uint32_t divisor) {
+      digit = integral / divisor;
+      integral %= divisor;
+    };
+    // This optimization by Milo Yip reduces the number of integer divisions by
+    // one per iteration.
+    switch (exp) {
+    case 10:
+      divmod_integral(1000000000);
+      break;
+    case 9:
+      divmod_integral(100000000);
+      break;
+    case 8:
+      divmod_integral(10000000);
+      break;
+    case 7:
+      divmod_integral(1000000);
+      break;
+    case 6:
+      divmod_integral(100000);
+      break;
+    case 5:
+      divmod_integral(10000);
+      break;
+    case 4:
+      divmod_integral(1000);
+      break;
+    case 3:
+      divmod_integral(100);
+      break;
+    case 2:
+      divmod_integral(10);
+      break;
+    case 1:
+      digit = integral;
+      integral = 0;
+      break;
+    default:
+      FMT_ASSERT(false, "invalid number of digits");
+    }
+    --exp;
+    auto remainder = (static_cast<uint64_t>(integral) << -one.e) + fractional;
+    result = handler.on_digit(static_cast<char>('0' + digit),
+                              data::powers_of_10_64[exp] << -one.e, remainder,
+                              error, exp, true);
+    if (result != digits::more) return result;
+  } while (exp > 0);
+  // Generate digits for the fractional part.
+  for (;;) {
+    fractional *= 10;
+    error *= 10;
+    char digit = static_cast<char>('0' + (fractional >> -one.e));
+    fractional &= one.f - 1;
+    --exp;
+    result = handler.on_digit(digit, one.f, fractional, error, exp, false);
+    if (result != digits::more) return result;
+  }
+}
+
+// The fixed precision digit handler.
+struct fixed_handler {
+  char* buf;
+  int size;
+  int precision;
+  int exp10;
+  bool fixed;
+
+  digits::result on_start(uint64_t divisor, uint64_t remainder, uint64_t error,
+                          int& exp) {
+    // Non-fixed formats require at least one digit and no precision adjustment.
+    if (!fixed) return digits::more;
+    // Adjust fixed precision by exponent because it is relative to decimal
+    // point.
+    precision += exp + exp10;
+    // Check if precision is satisfied just by leading zeros, e.g.
+    // format("{:.2f}", 0.001) gives "0.00" without generating any digits.
+    if (precision > 0) return digits::more;
+    if (precision < 0) return digits::done;
+    auto dir = get_round_direction(divisor, remainder, error);
+    if (dir == round_direction::unknown) return digits::error;
+    buf[size++] = dir == round_direction::up ? '1' : '0';
+    return digits::done;
+  }
+
+  digits::result on_digit(char digit, uint64_t divisor, uint64_t remainder,
+                          uint64_t error, int, bool integral) {
+    FMT_ASSERT(remainder < divisor, "");
+    buf[size++] = digit;
+    if (!integral && error >= remainder) return digits::error;
+    if (size < precision) return digits::more;
+    if (!integral) {
+      // Check if error * 2 < divisor with overflow prevention.
+      // The check is not needed for the integral part because error = 1
+      // and divisor > (1 << 32) there.
+      if (error >= divisor || error >= divisor - error) return digits::error;
+    } else {
+      FMT_ASSERT(error == 1 && divisor > 2, "");
+    }
+    auto dir = get_round_direction(divisor, remainder, error);
+    if (dir != round_direction::up)
+      return dir == round_direction::down ? digits::done : digits::error;
+    ++buf[size - 1];
+    for (int i = size - 1; i > 0 && buf[i] > '9'; --i) {
+      buf[i] = '0';
+      ++buf[i - 1];
+    }
+    if (buf[0] > '9') {
+      buf[0] = '1';
+      if (fixed)
+        buf[size++] = '0';
+      else
+        ++exp10;
+    }
+    return digits::done;
+  }
+};
+
+// Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox.
+namespace dragonbox {
+// Computes 128-bit result of multiplication of two 64-bit unsigned integers.
+FMT_SAFEBUFFERS inline uint128_wrapper umul128(uint64_t x,
+                                               uint64_t y) FMT_NOEXCEPT {
+#if FMT_USE_INT128
+  return static_cast<uint128_t>(x) * static_cast<uint128_t>(y);
+#elif defined(_MSC_VER) && defined(_M_X64)
+  uint128_wrapper result;
+  result.low_ = _umul128(x, y, &result.high_);
+  return result;
+#else
+  const uint64_t mask = (uint64_t(1) << 32) - uint64_t(1);
+
+  uint64_t a = x >> 32;
+  uint64_t b = x & mask;
+  uint64_t c = y >> 32;
+  uint64_t d = y & mask;
+
+  uint64_t ac = a * c;
+  uint64_t bc = b * c;
+  uint64_t ad = a * d;
+  uint64_t bd = b * d;
+
+  uint64_t intermediate = (bd >> 32) + (ad & mask) + (bc & mask);
+
+  return {ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32),
+          (intermediate << 32) + (bd & mask)};
+#endif
+}
+
+// Computes upper 64 bits of multiplication of two 64-bit unsigned integers.
+FMT_SAFEBUFFERS inline uint64_t umul128_upper64(uint64_t x,
+                                                uint64_t y) FMT_NOEXCEPT {
+#if FMT_USE_INT128
+  auto p = static_cast<uint128_t>(x) * static_cast<uint128_t>(y);
+  return static_cast<uint64_t>(p >> 64);
+#elif defined(_MSC_VER) && defined(_M_X64)
+  return __umulh(x, y);
+#else
+  return umul128(x, y).high();
+#endif
+}
+
+// Computes upper 64 bits of multiplication of a 64-bit unsigned integer and a
+// 128-bit unsigned integer.
+FMT_SAFEBUFFERS inline uint64_t umul192_upper64(uint64_t x, uint128_wrapper y)
+    FMT_NOEXCEPT {
+  uint128_wrapper g0 = umul128(x, y.high());
+  g0 += umul128_upper64(x, y.low());
+  return g0.high();
+}
+
+// Computes upper 32 bits of multiplication of a 32-bit unsigned integer and a
+// 64-bit unsigned integer.
+inline uint32_t umul96_upper32(uint32_t x, uint64_t y) FMT_NOEXCEPT {
+  return static_cast<uint32_t>(umul128_upper64(x, y));
+}
+
+// Computes middle 64 bits of multiplication of a 64-bit unsigned integer and a
+// 128-bit unsigned integer.
+FMT_SAFEBUFFERS inline uint64_t umul192_middle64(uint64_t x, uint128_wrapper y)
+    FMT_NOEXCEPT {
+  uint64_t g01 = x * y.high();
+  uint64_t g10 = umul128_upper64(x, y.low());
+  return g01 + g10;
+}
+
+// Computes lower 64 bits of multiplication of a 32-bit unsigned integer and a
+// 64-bit unsigned integer.
+inline uint64_t umul96_lower64(uint32_t x, uint64_t y) FMT_NOEXCEPT {
+  return x * y;
+}
+
+// Computes floor(log10(pow(2, e))) for e in [-1700, 1700] using the method from
+// https://fmt.dev/papers/Grisu-Exact.pdf#page=5, section 3.4.
+inline int floor_log10_pow2(int e) FMT_NOEXCEPT {
+  FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent");
+  const int shift = 22;
+  return (e * static_cast<int>(data::log10_2_significand >> (64 - shift))) >>
+         shift;
+}
+
+// Various fast log computations.
+inline int floor_log2_pow10(int e) FMT_NOEXCEPT {
+  FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent");
+  const uint64_t log2_10_integer_part = 3;
+  const uint64_t log2_10_fractional_digits = 0x5269e12f346e2bf9;
+  const int shift_amount = 19;
+  return (e * static_cast<int>(
+                  (log2_10_integer_part << shift_amount) |
+                  (log2_10_fractional_digits >> (64 - shift_amount)))) >>
+         shift_amount;
+}
+inline int floor_log10_pow2_minus_log10_4_over_3(int e) FMT_NOEXCEPT {
+  FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent");
+  const uint64_t log10_4_over_3_fractional_digits = 0x1ffbfc2bbc780375;
+  const int shift_amount = 22;
+  return (e * static_cast<int>(data::log10_2_significand >>
+                               (64 - shift_amount)) -
+          static_cast<int>(log10_4_over_3_fractional_digits >>
+                           (64 - shift_amount))) >>
+         shift_amount;
+}
+
+// Returns true iff x is divisible by pow(2, exp).
+inline bool divisible_by_power_of_2(uint32_t x, int exp) FMT_NOEXCEPT {
+  FMT_ASSERT(exp >= 1, "");
+  FMT_ASSERT(x != 0, "");
+#ifdef FMT_BUILTIN_CTZ
+  return FMT_BUILTIN_CTZ(x) >= exp;
+#else
+  return exp < num_bits<uint32_t>() && x == ((x >> exp) << exp);
+#endif
+}
+inline bool divisible_by_power_of_2(uint64_t x, int exp) FMT_NOEXCEPT {
+  FMT_ASSERT(exp >= 1, "");
+  FMT_ASSERT(x != 0, "");
+#ifdef FMT_BUILTIN_CTZLL
+  return FMT_BUILTIN_CTZLL(x) >= exp;
+#else
+  return exp < num_bits<uint64_t>() && x == ((x >> exp) << exp);
+#endif
+}
+
+// Returns true iff x is divisible by pow(5, exp).
+inline bool divisible_by_power_of_5(uint32_t x, int exp) FMT_NOEXCEPT {
+  FMT_ASSERT(exp <= 10, "too large exponent");
+  return x * data::divtest_table_for_pow5_32[exp].mod_inv <=
+         data::divtest_table_for_pow5_32[exp].max_quotient;
+}
+inline bool divisible_by_power_of_5(uint64_t x, int exp) FMT_NOEXCEPT {
+  FMT_ASSERT(exp <= 23, "too large exponent");
+  return x * data::divtest_table_for_pow5_64[exp].mod_inv <=
+         data::divtest_table_for_pow5_64[exp].max_quotient;
+}
+
+// Replaces n by floor(n / pow(5, N)) returning true if and only if n is
+// divisible by pow(5, N).
+// Precondition: n <= 2 * pow(5, N + 1).
+template <int N>
+bool check_divisibility_and_divide_by_pow5(uint32_t& n) FMT_NOEXCEPT {
+  static constexpr struct {
+    uint32_t magic_number;
+    int bits_for_comparison;
+    uint32_t threshold;
+    int shift_amount;
+  } infos[] = {{0xcccd, 16, 0x3333, 18}, {0xa429, 8, 0x0a, 20}};
+  constexpr auto info = infos[N - 1];
+  n *= info.magic_number;
+  const uint32_t comparison_mask = (1u << info.bits_for_comparison) - 1;
+  bool result = (n & comparison_mask) <= info.threshold;
+  n >>= info.shift_amount;
+  return result;
+}
+
+// Computes floor(n / pow(10, N)) for small n and N.
+// Precondition: n <= pow(10, N + 1).
+template <int N> uint32_t small_division_by_pow10(uint32_t n) FMT_NOEXCEPT {
+  static constexpr struct {
+    uint32_t magic_number;
+    int shift_amount;
+    uint32_t divisor_times_10;
+  } infos[] = {{0xcccd, 19, 100}, {0xa3d8, 22, 1000}};
+  constexpr auto info = infos[N - 1];
+  FMT_ASSERT(n <= info.divisor_times_10, "n is too large");
+  return n * info.magic_number >> info.shift_amount;
+}
+
+// Computes floor(n / 10^(kappa + 1)) (float)
+inline uint32_t divide_by_10_to_kappa_plus_1(uint32_t n) FMT_NOEXCEPT {
+  return n / float_info<float>::big_divisor;
+}
+// Computes floor(n / 10^(kappa + 1)) (double)
+inline uint64_t divide_by_10_to_kappa_plus_1(uint64_t n) FMT_NOEXCEPT {
+  return umul128_upper64(n, 0x83126e978d4fdf3c) >> 9;
+}
+
+// Various subroutines using pow10 cache
+template <class T> struct cache_accessor;
+
+template <> struct cache_accessor<float> {
+  using carrier_uint = float_info<float>::carrier_uint;
+  using cache_entry_type = uint64_t;
+
+  static uint64_t get_cached_power(int k) FMT_NOEXCEPT {
+    FMT_ASSERT(k >= float_info<float>::min_k && k <= float_info<float>::max_k,
+               "k is out of range");
+    return data::dragonbox_pow10_significands_64[k - float_info<float>::min_k];
+  }
+
+  static carrier_uint compute_mul(carrier_uint u,
+                                  const cache_entry_type& cache) FMT_NOEXCEPT {
+    return umul96_upper32(u, cache);
+  }
+
+  static uint32_t compute_delta(const cache_entry_type& cache,
+                                int beta_minus_1) FMT_NOEXCEPT {
+    return static_cast<uint32_t>(cache >> (64 - 1 - beta_minus_1));
+  }
+
+  static bool compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta_minus_1) FMT_NOEXCEPT {
+    FMT_ASSERT(beta_minus_1 >= 1, "");
+    FMT_ASSERT(beta_minus_1 < 64, "");
+
+    return ((umul96_lower64(two_f, cache) >> (64 - beta_minus_1)) & 1) != 0;
+  }
+
+  static carrier_uint compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return static_cast<carrier_uint>(
+        (cache - (cache >> (float_info<float>::significand_bits + 2))) >>
+        (64 - float_info<float>::significand_bits - 1 - beta_minus_1));
+  }
+
+  static carrier_uint compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return static_cast<carrier_uint>(
+        (cache + (cache >> (float_info<float>::significand_bits + 1))) >>
+        (64 - float_info<float>::significand_bits - 1 - beta_minus_1));
+  }
+
+  static carrier_uint compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return (static_cast<carrier_uint>(
+                cache >>
+                (64 - float_info<float>::significand_bits - 2 - beta_minus_1)) +
+            1) /
+           2;
+  }
+};
+
+template <> struct cache_accessor<double> {
+  using carrier_uint = float_info<double>::carrier_uint;
+  using cache_entry_type = uint128_wrapper;
+
+  static uint128_wrapper get_cached_power(int k) FMT_NOEXCEPT {
+    FMT_ASSERT(k >= float_info<double>::min_k && k <= float_info<double>::max_k,
+               "k is out of range");
+
+#if FMT_USE_FULL_CACHE_DRAGONBOX
+    return data::dragonbox_pow10_significands_128[k -
+                                                  float_info<double>::min_k];
+#else
+    static const int compression_ratio = 27;
+
+    // Compute base index.
+    int cache_index = (k - float_info<double>::min_k) / compression_ratio;
+    int kb = cache_index * compression_ratio + float_info<double>::min_k;
+    int offset = k - kb;
+
+    // Get base cache.
+    uint128_wrapper base_cache =
+        data::dragonbox_pow10_significands_128[cache_index];
+    if (offset == 0) return base_cache;
+
+    // Compute the required amount of bit-shift.
+    int alpha = floor_log2_pow10(kb + offset) - floor_log2_pow10(kb) - offset;
+    FMT_ASSERT(alpha > 0 && alpha < 64, "shifting error detected");
+
+    // Try to recover the real cache.
+    uint64_t pow5 = data::powers_of_5_64[offset];
+    uint128_wrapper recovered_cache = umul128(base_cache.high(), pow5);
+    uint128_wrapper middle_low =
+        umul128(base_cache.low() - (kb < 0 ? 1u : 0u), pow5);
+
+    recovered_cache += middle_low.high();
+
+    uint64_t high_to_middle = recovered_cache.high() << (64 - alpha);
+    uint64_t middle_to_low = recovered_cache.low() << (64 - alpha);
+
+    recovered_cache =
+        uint128_wrapper{(recovered_cache.low() >> alpha) | high_to_middle,
+                        ((middle_low.low() >> alpha) | middle_to_low)};
+
+    if (kb < 0) recovered_cache += 1;
+
+    // Get error.
+    int error_idx = (k - float_info<double>::min_k) / 16;
+    uint32_t error = (data::dragonbox_pow10_recovery_errors[error_idx] >>
+                      ((k - float_info<double>::min_k) % 16) * 2) &
+                     0x3;
+
+    // Add the error back.
+    FMT_ASSERT(recovered_cache.low() + error >= recovered_cache.low(), "");
+    return {recovered_cache.high(), recovered_cache.low() + error};
+#endif
+  }
+
+  static carrier_uint compute_mul(carrier_uint u,
+                                  const cache_entry_type& cache) FMT_NOEXCEPT {
+    return umul192_upper64(u, cache);
+  }
+
+  static uint32_t compute_delta(cache_entry_type const& cache,
+                                int beta_minus_1) FMT_NOEXCEPT {
+    return static_cast<uint32_t>(cache.high() >> (64 - 1 - beta_minus_1));
+  }
+
+  static bool compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta_minus_1) FMT_NOEXCEPT {
+    FMT_ASSERT(beta_minus_1 >= 1, "");
+    FMT_ASSERT(beta_minus_1 < 64, "");
+
+    return ((umul192_middle64(two_f, cache) >> (64 - beta_minus_1)) & 1) != 0;
+  }
+
+  static carrier_uint compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return (cache.high() -
+            (cache.high() >> (float_info<double>::significand_bits + 2))) >>
+           (64 - float_info<double>::significand_bits - 1 - beta_minus_1);
+  }
+
+  static carrier_uint compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return (cache.high() +
+            (cache.high() >> (float_info<double>::significand_bits + 1))) >>
+           (64 - float_info<double>::significand_bits - 1 - beta_minus_1);
+  }
+
+  static carrier_uint compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return ((cache.high() >>
+             (64 - float_info<double>::significand_bits - 2 - beta_minus_1)) +
+            1) /
+           2;
+  }
+};
+
+// Various integer checks
+template <class T>
+bool is_left_endpoint_integer_shorter_interval(int exponent) FMT_NOEXCEPT {
+  return exponent >=
+             float_info<
+                 T>::case_shorter_interval_left_endpoint_lower_threshold &&
+         exponent <=
+             float_info<T>::case_shorter_interval_left_endpoint_upper_threshold;
+}
+template <class T>
+bool is_endpoint_integer(typename float_info<T>::carrier_uint two_f,
+                         int exponent, int minus_k) FMT_NOEXCEPT {
+  if (exponent < float_info<T>::case_fc_pm_half_lower_threshold) return false;
+  // For k >= 0.
+  if (exponent <= float_info<T>::case_fc_pm_half_upper_threshold) return true;
+  // For k < 0.
+  if (exponent > float_info<T>::divisibility_check_by_5_threshold) return false;
+  return divisible_by_power_of_5(two_f, minus_k);
+}
+
+template <class T>
+bool is_center_integer(typename float_info<T>::carrier_uint two_f, int exponent,
+                       int minus_k) FMT_NOEXCEPT {
+  // Exponent for 5 is negative.
+  if (exponent > float_info<T>::divisibility_check_by_5_threshold) return false;
+  if (exponent > float_info<T>::case_fc_upper_threshold)
+    return divisible_by_power_of_5(two_f, minus_k);
+  // Both exponents are nonnegative.
+  if (exponent >= float_info<T>::case_fc_lower_threshold) return true;
+  // Exponent for 2 is negative.
+  return divisible_by_power_of_2(two_f, minus_k - exponent + 1);
+}
+
+// Remove trailing zeros from n and return the number of zeros removed (float)
+FMT_ALWAYS_INLINE int remove_trailing_zeros(uint32_t& n) FMT_NOEXCEPT {
+#ifdef FMT_BUILTIN_CTZ
+  int t = FMT_BUILTIN_CTZ(n);
+#else
+  int t = ctz(n);
+#endif
+  if (t > float_info<float>::max_trailing_zeros)
+    t = float_info<float>::max_trailing_zeros;
+
+  const uint32_t mod_inv1 = 0xcccccccd;
+  const uint32_t max_quotient1 = 0x33333333;
+  const uint32_t mod_inv2 = 0xc28f5c29;
+  const uint32_t max_quotient2 = 0x0a3d70a3;
+
+  int s = 0;
+  for (; s < t - 1; s += 2) {
+    if (n * mod_inv2 > max_quotient2) break;
+    n *= mod_inv2;
+  }
+  if (s < t && n * mod_inv1 <= max_quotient1) {
+    n *= mod_inv1;
+    ++s;
+  }
+  n >>= s;
+  return s;
+}
+
+// Removes trailing zeros and returns the number of zeros removed (double)
+FMT_ALWAYS_INLINE int remove_trailing_zeros(uint64_t& n) FMT_NOEXCEPT {
+#ifdef FMT_BUILTIN_CTZLL
+  int t = FMT_BUILTIN_CTZLL(n);
+#else
+  int t = ctzll(n);
+#endif
+  if (t > float_info<double>::max_trailing_zeros)
+    t = float_info<double>::max_trailing_zeros;
+  // Divide by 10^8 and reduce to 32-bits
+  // Since ret_value.significand <= (2^64 - 1) / 1000 < 10^17,
+  // both of the quotient and the r should fit in 32-bits
+
+  const uint32_t mod_inv1 = 0xcccccccd;
+  const uint32_t max_quotient1 = 0x33333333;
+  const uint64_t mod_inv8 = 0xc767074b22e90e21;
+  const uint64_t max_quotient8 = 0x00002af31dc46118;
+
+  // If the number is divisible by 1'0000'0000, work with the quotient
+  if (t >= 8) {
+    auto quotient_candidate = n * mod_inv8;
+
+    if (quotient_candidate <= max_quotient8) {
+      auto quotient = static_cast<uint32_t>(quotient_candidate >> 8);
+
+      int s = 8;
+      for (; s < t; ++s) {
+        if (quotient * mod_inv1 > max_quotient1) break;
+        quotient *= mod_inv1;
+      }
+      quotient >>= (s - 8);
+      n = quotient;
+      return s;
+    }
+  }
+
+  // Otherwise, work with the remainder
+  auto quotient = static_cast<uint32_t>(n / 100000000);
+  auto remainder = static_cast<uint32_t>(n - 100000000 * quotient);
+
+  if (t == 0 || remainder * mod_inv1 > max_quotient1) {
+    return 0;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 1 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 1) + quotient * 10000000ull;
+    return 1;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 2 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 2) + quotient * 1000000ull;
+    return 2;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 3 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 3) + quotient * 100000ull;
+    return 3;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 4 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 4) + quotient * 10000ull;
+    return 4;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 5 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 5) + quotient * 1000ull;
+    return 5;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 6 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 6) + quotient * 100ull;
+    return 6;
+  }
+  remainder *= mod_inv1;
+
+  n = (remainder >> 7) + quotient * 10ull;
+  return 7;
+}
+
+// The main algorithm for shorter interval case
+template <class T>
+FMT_ALWAYS_INLINE FMT_SAFEBUFFERS decimal_fp<T> shorter_interval_case(
+    int exponent) FMT_NOEXCEPT {
+  decimal_fp<T> ret_value;
+  // Compute k and beta
+  const int minus_k = floor_log10_pow2_minus_log10_4_over_3(exponent);
+  const int beta_minus_1 = exponent + floor_log2_pow10(-minus_k);
+
+  // Compute xi and zi
+  using cache_entry_type = typename cache_accessor<T>::cache_entry_type;
+  const cache_entry_type cache = cache_accessor<T>::get_cached_power(-minus_k);
+
+  auto xi = cache_accessor<T>::compute_left_endpoint_for_shorter_interval_case(
+      cache, beta_minus_1);
+  auto zi = cache_accessor<T>::compute_right_endpoint_for_shorter_interval_case(
+      cache, beta_minus_1);
+
+  // If the left endpoint is not an integer, increase it
+  if (!is_left_endpoint_integer_shorter_interval<T>(exponent)) ++xi;
+
+  // Try bigger divisor
+  ret_value.significand = zi / 10;
+
+  // If succeed, remove trailing zeros if necessary and return
+  if (ret_value.significand * 10 >= xi) {
+    ret_value.exponent = minus_k + 1;
+    ret_value.exponent += remove_trailing_zeros(ret_value.significand);
+    return ret_value;
+  }
+
+  // Otherwise, compute the round-up of y
+  ret_value.significand =
+      cache_accessor<T>::compute_round_up_for_shorter_interval_case(
+          cache, beta_minus_1);
+  ret_value.exponent = minus_k;
+
+  // When tie occurs, choose one of them according to the rule
+  if (exponent >= float_info<T>::shorter_interval_tie_lower_threshold &&
+      exponent <= float_info<T>::shorter_interval_tie_upper_threshold) {
+    ret_value.significand = ret_value.significand % 2 == 0
+                                ? ret_value.significand
+                                : ret_value.significand - 1;
+  } else if (ret_value.significand < xi) {
+    ++ret_value.significand;
+  }
+  return ret_value;
+}
+
+template <typename T>
+FMT_SAFEBUFFERS decimal_fp<T> to_decimal(T x) FMT_NOEXCEPT {
+  // Step 1: integer promotion & Schubfach multiplier calculation.
+
+  using carrier_uint = typename float_info<T>::carrier_uint;
+  using cache_entry_type = typename cache_accessor<T>::cache_entry_type;
+  auto br = bit_cast<carrier_uint>(x);
+
+  // Extract significand bits and exponent bits.
+  const carrier_uint significand_mask =
+      (static_cast<carrier_uint>(1) << float_info<T>::significand_bits) - 1;
+  carrier_uint significand = (br & significand_mask);
+  int exponent = static_cast<int>((br & exponent_mask<T>()) >>
+                                  float_info<T>::significand_bits);
+
+  if (exponent != 0) {  // Check if normal.
+    exponent += float_info<T>::exponent_bias - float_info<T>::significand_bits;
+
+    // Shorter interval case; proceed like Schubfach.
+    if (significand == 0) return shorter_interval_case<T>(exponent);
+
+    significand |=
+        (static_cast<carrier_uint>(1) << float_info<T>::significand_bits);
+  } else {
+    // Subnormal case; the interval is always regular.
+    if (significand == 0) return {0, 0};
+    exponent = float_info<T>::min_exponent - float_info<T>::significand_bits;
+  }
+
+  const bool include_left_endpoint = (significand % 2 == 0);
+  const bool include_right_endpoint = include_left_endpoint;
+
+  // Compute k and beta.
+  const int minus_k = floor_log10_pow2(exponent) - float_info<T>::kappa;
+  const cache_entry_type cache = cache_accessor<T>::get_cached_power(-minus_k);
+  const int beta_minus_1 = exponent + floor_log2_pow10(-minus_k);
+
+  // Compute zi and deltai
+  // 10^kappa <= deltai < 10^(kappa + 1)
+  const uint32_t deltai = cache_accessor<T>::compute_delta(cache, beta_minus_1);
+  const carrier_uint two_fc = significand << 1;
+  const carrier_uint two_fr = two_fc | 1;
+  const carrier_uint zi =
+      cache_accessor<T>::compute_mul(two_fr << beta_minus_1, cache);
+
+  // Step 2: Try larger divisor; remove trailing zeros if necessary
+
+  // Using an upper bound on zi, we might be able to optimize the division
+  // better than the compiler; we are computing zi / big_divisor here
+  decimal_fp<T> ret_value;
+  ret_value.significand = divide_by_10_to_kappa_plus_1(zi);
+  uint32_t r = static_cast<uint32_t>(zi - float_info<T>::big_divisor *
+                                              ret_value.significand);
+
+  if (r > deltai) {
+    goto small_divisor_case_label;
+  } else if (r < deltai) {
+    // Exclude the right endpoint if necessary
+    if (r == 0 && !include_right_endpoint &&
+        is_endpoint_integer<T>(two_fr, exponent, minus_k)) {
+      --ret_value.significand;
+      r = float_info<T>::big_divisor;
+      goto small_divisor_case_label;
+    }
+  } else {
+    // r == deltai; compare fractional parts
+    // Check conditions in the order different from the paper
+    // to take advantage of short-circuiting
+    const carrier_uint two_fl = two_fc - 1;
+    if ((!include_left_endpoint ||
+         !is_endpoint_integer<T>(two_fl, exponent, minus_k)) &&
+        !cache_accessor<T>::compute_mul_parity(two_fl, cache, beta_minus_1)) {
+      goto small_divisor_case_label;
+    }
+  }
+  ret_value.exponent = minus_k + float_info<T>::kappa + 1;
+
+  // We may need to remove trailing zeros
+  ret_value.exponent += remove_trailing_zeros(ret_value.significand);
+  return ret_value;
+
+  // Step 3: Find the significand with the smaller divisor
+
+small_divisor_case_label:
+  ret_value.significand *= 10;
+  ret_value.exponent = minus_k + float_info<T>::kappa;
+
+  const uint32_t mask = (1u << float_info<T>::kappa) - 1;
+  auto dist = r - (deltai / 2) + (float_info<T>::small_divisor / 2);
+
+  // Is dist divisible by 2^kappa?
+  if ((dist & mask) == 0) {
+    const bool approx_y_parity =
+        ((dist ^ (float_info<T>::small_divisor / 2)) & 1) != 0;
+    dist >>= float_info<T>::kappa;
+
+    // Is dist divisible by 5^kappa?
+    if (check_divisibility_and_divide_by_pow5<float_info<T>::kappa>(dist)) {
+      ret_value.significand += dist;
+
+      // Check z^(f) >= epsilon^(f)
+      // We have either yi == zi - epsiloni or yi == (zi - epsiloni) - 1,
+      // where yi == zi - epsiloni if and only if z^(f) >= epsilon^(f)
+      // Since there are only 2 possibilities, we only need to care about the
+      // parity. Also, zi and r should have the same parity since the divisor
+      // is an even number
+      if (cache_accessor<T>::compute_mul_parity(two_fc, cache, beta_minus_1) !=
+          approx_y_parity) {
+        --ret_value.significand;
+      } else {
+        // If z^(f) >= epsilon^(f), we might have a tie
+        // when z^(f) == epsilon^(f), or equivalently, when y is an integer
+        if (is_center_integer<T>(two_fc, exponent, minus_k)) {
+          ret_value.significand = ret_value.significand % 2 == 0
+                                      ? ret_value.significand
+                                      : ret_value.significand - 1;
+        }
+      }
+    }
+    // Is dist not divisible by 5^kappa?
+    else {
+      ret_value.significand += dist;
+    }
+  }
+  // Is dist not divisible by 2^kappa?
+  else {
+    // Since we know dist is small, we might be able to optimize the division
+    // better than the compiler; we are computing dist / small_divisor here
+    ret_value.significand +=
+        small_division_by_pow10<float_info<T>::kappa>(dist);
+  }
+  return ret_value;
+}
+}  // namespace dragonbox
+
+// Formats value using a variation of the Fixed-Precision Positive
+// Floating-Point Printout ((FPP)^2) algorithm by Steele & White:
+// https://fmt.dev/p372-steele.pdf.
+template <typename Double>
+void fallback_format(Double d, int num_digits, bool binary32, buffer<char>& buf,
+                     int& exp10) {
+  bigint numerator;    // 2 * R in (FPP)^2.
+  bigint denominator;  // 2 * S in (FPP)^2.
+  // lower and upper are differences between value and corresponding boundaries.
+  bigint lower;             // (M^- in (FPP)^2).
+  bigint upper_store;       // upper's value if different from lower.
+  bigint* upper = nullptr;  // (M^+ in (FPP)^2).
+  fp value;
+  // Shift numerator and denominator by an extra bit or two (if lower boundary
+  // is closer) to make lower and upper integers. This eliminates multiplication
+  // by 2 during later computations.
+  const bool is_predecessor_closer =
+      binary32 ? value.assign(static_cast<float>(d)) : value.assign(d);
+  int shift = is_predecessor_closer ? 2 : 1;
+  uint64_t significand = value.f << shift;
+  if (value.e >= 0) {
+    numerator.assign(significand);
+    numerator <<= value.e;
+    lower.assign(1);
+    lower <<= value.e;
+    if (shift != 1) {
+      upper_store.assign(1);
+      upper_store <<= value.e + 1;
+      upper = &upper_store;
+    }
+    denominator.assign_pow10(exp10);
+    denominator <<= shift;
+  } else if (exp10 < 0) {
+    numerator.assign_pow10(-exp10);
+    lower.assign(numerator);
+    if (shift != 1) {
+      upper_store.assign(numerator);
+      upper_store <<= 1;
+      upper = &upper_store;
+    }
+    numerator *= significand;
+    denominator.assign(1);
+    denominator <<= shift - value.e;
+  } else {
+    numerator.assign(significand);
+    denominator.assign_pow10(exp10);
+    denominator <<= shift - value.e;
+    lower.assign(1);
+    if (shift != 1) {
+      upper_store.assign(1ULL << 1);
+      upper = &upper_store;
+    }
+  }
+  // Invariant: value == (numerator / denominator) * pow(10, exp10).
+  if (num_digits < 0) {
+    // Generate the shortest representation.
+    if (!upper) upper = &lower;
+    bool even = (value.f & 1) == 0;
+    num_digits = 0;
+    char* data = buf.data();
+    for (;;) {
+      int digit = numerator.divmod_assign(denominator);
+      bool low = compare(numerator, lower) - even < 0;  // numerator <[=] lower.
+      // numerator + upper >[=] pow10:
+      bool high = add_compare(numerator, *upper, denominator) + even > 0;
+      data[num_digits++] = static_cast<char>('0' + digit);
+      if (low || high) {
+        if (!low) {
+          ++data[num_digits - 1];
+        } else if (high) {
+          int result = add_compare(numerator, numerator, denominator);
+          // Round half to even.
+          if (result > 0 || (result == 0 && (digit % 2) != 0))
+            ++data[num_digits - 1];
+        }
+        buf.try_resize(to_unsigned(num_digits));
+        exp10 -= num_digits - 1;
+        return;
+      }
+      numerator *= 10;
+      lower *= 10;
+      if (upper != &lower) *upper *= 10;
+    }
+  }
+  // Generate the given number of digits.
+  exp10 -= num_digits - 1;
+  if (num_digits == 0) {
+    buf.try_resize(1);
+    denominator *= 10;
+    buf[0] = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
+    return;
+  }
+  buf.try_resize(to_unsigned(num_digits));
+  for (int i = 0; i < num_digits - 1; ++i) {
+    int digit = numerator.divmod_assign(denominator);
+    buf[i] = static_cast<char>('0' + digit);
+    numerator *= 10;
+  }
+  int digit = numerator.divmod_assign(denominator);
+  auto result = add_compare(numerator, numerator, denominator);
+  if (result > 0 || (result == 0 && (digit % 2) != 0)) {
+    if (digit == 9) {
+      const auto overflow = '0' + 10;
+      buf[num_digits - 1] = overflow;
+      // Propagate the carry.
+      for (int i = num_digits - 1; i > 0 && buf[i] == overflow; --i) {
+        buf[i] = '0';
+        ++buf[i - 1];
+      }
+      if (buf[0] == overflow) {
+        buf[0] = '1';
+        ++exp10;
+      }
+      return;
+    }
+    ++digit;
+  }
+  buf[num_digits - 1] = static_cast<char>('0' + digit);
+}
+
+template <typename T>
+int format_float(T value, int precision, float_specs specs, buffer<char>& buf) {
+  static_assert(!std::is_same<T, float>::value, "");
+  FMT_ASSERT(value >= 0, "value is negative");
+
+  const bool fixed = specs.format == float_format::fixed;
+  if (value <= 0) {  // <= instead of == to silence a warning.
+    if (precision <= 0 || !fixed) {
+      buf.push_back('0');
+      return 0;
+    }
+    buf.try_resize(to_unsigned(precision));
+    std::uninitialized_fill_n(buf.data(), precision, '0');
+    return -precision;
+  }
+
+  if (!specs.use_grisu) return snprintf_float(value, precision, specs, buf);
+
+  if (precision < 0) {
+    // Use Dragonbox for the shortest format.
+    if (specs.binary32) {
+      auto dec = dragonbox::to_decimal(static_cast<float>(value));
+      write<char>(buffer_appender<char>(buf), dec.significand);
+      return dec.exponent;
+    }
+    auto dec = dragonbox::to_decimal(static_cast<double>(value));
+    write<char>(buffer_appender<char>(buf), dec.significand);
+    return dec.exponent;
+  }
+
+  // Use Grisu + Dragon4 for the given precision:
+  // https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf.
+  int exp = 0;
+  const int min_exp = -60;  // alpha in Grisu.
+  int cached_exp10 = 0;     // K in Grisu.
+  fp normalized = normalize(fp(value));
+  const auto cached_pow = get_cached_power(
+      min_exp - (normalized.e + fp::significand_size), cached_exp10);
+  normalized = normalized * cached_pow;
+  // Limit precision to the maximum possible number of significant digits in an
+  // IEEE754 double because we don't need to generate zeros.
+  const int max_double_digits = 767;
+  if (precision > max_double_digits) precision = max_double_digits;
+  fixed_handler handler{buf.data(), 0, precision, -cached_exp10, fixed};
+  if (grisu_gen_digits(normalized, 1, exp, handler) == digits::error) {
+    exp += handler.size - cached_exp10 - 1;
+    fallback_format(value, handler.precision, specs.binary32, buf, exp);
+  } else {
+    exp += handler.exp10;
+    buf.try_resize(to_unsigned(handler.size));
+  }
+  if (!fixed && !specs.showpoint) {
+    // Remove trailing zeros.
+    auto num_digits = buf.size();
+    while (num_digits > 0 && buf[num_digits - 1] == '0') {
+      --num_digits;
+      ++exp;
+    }
+    buf.try_resize(num_digits);
+  }
+  return exp;
+}  // namespace detail
+
+template <typename T>
+int snprintf_float(T value, int precision, float_specs specs,
+                   buffer<char>& buf) {
+  // Buffer capacity must be non-zero, otherwise MSVC's vsnprintf_s will fail.
+  FMT_ASSERT(buf.capacity() > buf.size(), "empty buffer");
+  static_assert(!std::is_same<T, float>::value, "");
+
+  // Subtract 1 to account for the difference in precision since we use %e for
+  // both general and exponent format.
+  if (specs.format == float_format::general ||
+      specs.format == float_format::exp)
+    precision = (precision >= 0 ? precision : 6) - 1;
+
+  // Build the format string.
+  enum { max_format_size = 7 };  // The longest format is "%#.*Le".
+  char format[max_format_size];
+  char* format_ptr = format;
+  *format_ptr++ = '%';
+  if (specs.showpoint && specs.format == float_format::hex) *format_ptr++ = '#';
+  if (precision >= 0) {
+    *format_ptr++ = '.';
+    *format_ptr++ = '*';
+  }
+  if (std::is_same<T, long double>()) *format_ptr++ = 'L';
+  *format_ptr++ = specs.format != float_format::hex
+                      ? (specs.format == float_format::fixed ? 'f' : 'e')
+                      : (specs.upper ? 'A' : 'a');
+  *format_ptr = '\0';
+
+  // Format using snprintf.
+  auto offset = buf.size();
+  for (;;) {
+    auto begin = buf.data() + offset;
+    auto capacity = buf.capacity() - offset;
+#ifdef FMT_FUZZ
+    if (precision > 100000)
+      throw std::runtime_error(
+          "fuzz mode - avoid large allocation inside snprintf");
+#endif
+    // Suppress the warning about a nonliteral format string.
+    // Cannot use auto because of a bug in MinGW (#1532).
+    int (*snprintf_ptr)(char*, size_t, const char*, ...) = FMT_SNPRINTF;
+    int result = precision >= 0
+                     ? snprintf_ptr(begin, capacity, format, precision, value)
+                     : snprintf_ptr(begin, capacity, format, value);
+    if (result < 0) {
+      // The buffer will grow exponentially.
+      buf.try_reserve(buf.capacity() + 1);
+      continue;
+    }
+    auto size = to_unsigned(result);
+    // Size equal to capacity means that the last character was truncated.
+    if (size >= capacity) {
+      buf.try_reserve(size + offset + 1);  // Add 1 for the terminating '\0'.
+      continue;
+    }
+    auto is_digit = [](char c) { return c >= '0' && c <= '9'; };
+    if (specs.format == float_format::fixed) {
+      if (precision == 0) {
+        buf.try_resize(size);
+        return 0;
+      }
+      // Find and remove the decimal point.
+      auto end = begin + size, p = end;
+      do {
+        --p;
+      } while (is_digit(*p));
+      int fraction_size = static_cast<int>(end - p - 1);
+      std::memmove(p, p + 1, to_unsigned(fraction_size));
+      buf.try_resize(size - 1);
+      return -fraction_size;
+    }
+    if (specs.format == float_format::hex) {
+      buf.try_resize(size + offset);
+      return 0;
+    }
+    // Find and parse the exponent.
+    auto end = begin + size, exp_pos = end;
+    do {
+      --exp_pos;
+    } while (*exp_pos != 'e');
+    char sign = exp_pos[1];
+    assert(sign == '+' || sign == '-');
+    int exp = 0;
+    auto p = exp_pos + 2;  // Skip 'e' and sign.
+    do {
+      assert(is_digit(*p));
+      exp = exp * 10 + (*p++ - '0');
+    } while (p != end);
+    if (sign == '-') exp = -exp;
+    int fraction_size = 0;
+    if (exp_pos != begin + 1) {
+      // Remove trailing zeros.
+      auto fraction_end = exp_pos - 1;
+      while (*fraction_end == '0') --fraction_end;
+      // Move the fractional part left to get rid of the decimal point.
+      fraction_size = static_cast<int>(fraction_end - begin - 1);
+      std::memmove(begin + 1, begin + 2, to_unsigned(fraction_size));
+    }
+    buf.try_resize(to_unsigned(fraction_size) + offset + 1);
+    return exp - fraction_size;
+  }
+}
+
+// A public domain branchless UTF-8 decoder by Christopher Wellons:
+// https://github.com/skeeto/branchless-utf8
+/* Decode the next character, c, from buf, reporting errors in e.
+ *
+ * Since this is a branchless decoder, four bytes will be read from the
+ * buffer regardless of the actual length of the next character. This
+ * means the buffer _must_ have at least three bytes of zero padding
+ * following the end of the data stream.
+ *
+ * Errors are reported in e, which will be non-zero if the parsed
+ * character was somehow invalid: invalid byte sequence, non-canonical
+ * encoding, or a surrogate half.
+ *
+ * The function returns a pointer to the next character. When an error
+ * occurs, this pointer will be a guess that depends on the particular
+ * error, but it will always advance at least one byte.
+ */
+inline const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
+  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
+  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
+  static const int shiftc[] = {0, 18, 12, 6, 0};
+  static const int shifte[] = {0, 6, 4, 2, 0};
+
+  int len = code_point_length(buf);
+  const char* next = buf + len;
+
+  // Assume a four-byte character and load four bytes. Unused bits are
+  // shifted out.
+  auto s = reinterpret_cast<const unsigned char*>(buf);
+  *c = uint32_t(s[0] & masks[len]) << 18;
+  *c |= uint32_t(s[1] & 0x3f) << 12;
+  *c |= uint32_t(s[2] & 0x3f) << 6;
+  *c |= uint32_t(s[3] & 0x3f) << 0;
+  *c >>= shiftc[len];
+
+  // Accumulate the various error conditions.
+  *e = (*c < mins[len]) << 6;       // non-canonical encoding
+  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
+  *e |= (*c > 0x10FFFF) << 8;       // out of range?
+  *e |= (s[1] & 0xc0) >> 2;
+  *e |= (s[2] & 0xc0) >> 4;
+  *e |= (s[3]) >> 6;
+  *e ^= 0x2a;  // top two bits of each tail byte correct?
+  *e >>= shifte[len];
+
+  return next;
+}
+
+struct stringifier {
+  template <typename T> FMT_INLINE std::string operator()(T value) const {
+    return to_string(value);
+  }
+  std::string operator()(basic_format_arg<format_context>::handle h) const {
+    memory_buffer buf;
+    format_parse_context parse_ctx({});
+    format_context format_ctx(buffer_appender<char>(buf), {}, {});
+    h.format(parse_ctx, format_ctx);
+    return to_string(buf);
+  }
+};
+}  // namespace detail
+
+template <> struct formatter<detail::bigint> {
+  format_parse_context::iterator parse(format_parse_context& ctx) {
+    return ctx.begin();
+  }
+
+  format_context::iterator format(const detail::bigint& n,
+                                  format_context& ctx) {
+    auto out = ctx.out();
+    bool first = true;
+    for (auto i = n.bigits_.size(); i > 0; --i) {
+      auto value = n.bigits_[i - 1u];
+      if (first) {
+        out = format_to(out, "{:x}", value);
+        first = false;
+        continue;
+      }
+      out = format_to(out, "{:08x}", value);
+    }
+    if (n.exp_ > 0)
+      out = format_to(out, "p{}", n.exp_ * detail::bigint::bigit_bits);
+    return out;
+  }
+};
+
+FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
+  auto transcode = [this](const char* p) {
+    auto cp = uint32_t();
+    auto error = 0;
+    p = utf8_decode(p, &cp, &error);
+    if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
+    if (cp <= 0xFFFF) {
+      buffer_.push_back(static_cast<wchar_t>(cp));
+    } else {
+      cp -= 0x10000;
+      buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
+      buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
+    }
+    return p;
+  };
+  auto p = s.data();
+  const size_t block_size = 4;  // utf8_decode always reads blocks of 4 chars.
+  if (s.size() >= block_size) {
+    for (auto end = p + s.size() - block_size + 1; p < end;) p = transcode(p);
+  }
+  if (auto num_chars_left = s.data() + s.size() - p) {
+    char buf[2 * block_size - 1] = {};
+    memcpy(buf, p, to_unsigned(num_chars_left));
+    p = buf;
+    do {
+      p = transcode(p);
+    } while (p - buf < num_chars_left);
+  }
+  buffer_.push_back(0);
+}
+
+FMT_FUNC void format_system_error(detail::buffer<char>& out, int error_code,
+                                  string_view message) FMT_NOEXCEPT {
+  FMT_TRY {
+    memory_buffer buf;
+    buf.resize(inline_buffer_size);
+    for (;;) {
+      char* system_message = &buf[0];
+      int result =
+          detail::safe_strerror(error_code, system_message, buf.size());
+      if (result == 0) {
+        format_to(detail::buffer_appender<char>(out), "{}: {}", message,
+                  system_message);
+        return;
+      }
+      if (result != ERANGE)
+        break;  // Can't get error message, report error code instead.
+      buf.resize(buf.size() * 2);
+    }
+  }
+  FMT_CATCH(...) {}
+  format_error_code(out, error_code, message);
+}
+
+FMT_FUNC void detail::error_handler::on_error(const char* message) {
+  FMT_THROW(format_error(message));
+}
+
+FMT_FUNC void report_system_error(int error_code,
+                                  fmt::string_view message) FMT_NOEXCEPT {
+  report_error(format_system_error, error_code, message);
+}
+
+FMT_FUNC std::string detail::vformat(string_view format_str, format_args args) {
+  if (format_str.size() == 2 && equal2(format_str.data(), "{}")) {
+    auto arg = args.get(0);
+    if (!arg) error_handler().on_error("argument not found");
+    return visit_format_arg(stringifier(), arg);
+  }
+  memory_buffer buffer;
+  detail::vformat_to(buffer, format_str, args);
+  return to_string(buffer);
+}
+
+#ifdef _WIN32
+namespace detail {
+using dword = conditional_t<sizeof(long) == 4, unsigned long, unsigned>;
+extern "C" __declspec(dllimport) int __stdcall WriteConsoleW(  //
+    void*, const void*, dword, dword*, void*);
+}  // namespace detail
+#endif
+
+FMT_FUNC void vprint(std::FILE* f, string_view format_str, format_args args) {
+  memory_buffer buffer;
+  detail::vformat_to(buffer, format_str,
+                     basic_format_args<buffer_context<char>>(args));
+#ifdef _WIN32
+  auto fd = _fileno(f);
+  if (_isatty(fd)) {
+    detail::utf8_to_utf16 u16(string_view(buffer.data(), buffer.size()));
+    auto written = detail::dword();
+    if (!detail::WriteConsoleW(reinterpret_cast<void*>(_get_osfhandle(fd)),
+                               u16.c_str(), static_cast<uint32_t>(u16.size()),
+                               &written, nullptr)) {
+      FMT_THROW(format_error("failed to write to console"));
+    }
+    return;
+  }
+#endif
+  detail::fwrite_fully(buffer.data(), 1, buffer.size(), f);
+}
+
+#ifdef _WIN32
+// Print assuming legacy (non-Unicode) encoding.
+FMT_FUNC void detail::vprint_mojibake(std::FILE* f, string_view format_str,
+                                      format_args args) {
+  memory_buffer buffer;
+  detail::vformat_to(buffer, format_str,
+                     basic_format_args<buffer_context<char>>(args));
+  fwrite_fully(buffer.data(), 1, buffer.size(), f);
+}
+#endif
+
+FMT_FUNC void vprint(string_view format_str, format_args args) {
+  vprint(stdout, format_str, args);
+}
+
+FMT_END_NAMESPACE
+
+#endif  // FMT_FORMAT_INL_H_
diff --git a/include/vtkdiy2/fmt/format.cc b/include/vtkdiy2/thirdparty/fmt/format.cc
similarity index 98%
rename from include/vtkdiy2/fmt/format.cc
rename to include/vtkdiy2/thirdparty/fmt/format.cc
index 41076f1633c..679ac799a43 100644
--- a/include/vtkdiy2/fmt/format.cc
+++ b/include/vtkdiy2/thirdparty/fmt/format.cc
@@ -5,7 +5,7 @@
 //
 // For the license information refer to format.h.
 
-#include "fmt/format-inl.h"
+#include "format-inl.h"
 
 FMT_BEGIN_NAMESPACE
 template struct FMT_API internal::basic_data<void>;
diff --git a/include/vtkdiy2/thirdparty/fmt/format.h b/include/vtkdiy2/thirdparty/fmt/format.h
new file mode 100644
index 00000000000..1de9c387998
--- /dev/null
+++ b/include/vtkdiy2/thirdparty/fmt/format.h
@@ -0,0 +1,3962 @@
+/*
+ Formatting library for C++
+
+ Copyright (c) 2012 - present, Victor Zverovich
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ --- Optional exception to the license ---
+
+ As an exception, if, as a result of your compiling your source code, portions
+ of this Software are embedded into a machine-executable object form of such
+ source code, you may redistribute such embedded portions in such object form
+ without including the above copyright and permission notices.
+ */
+
+#ifndef FMT_FORMAT_H_
+#define FMT_FORMAT_H_
+
+#define FMT_HEADER_ONLY     // Added by diy for header-only usage
+
+#include <algorithm>
+#include <cerrno>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+
+#include "core.h"
+
+#ifdef __INTEL_COMPILER
+#  define FMT_ICC_VERSION __INTEL_COMPILER
+#elif defined(__ICL)
+#  define FMT_ICC_VERSION __ICL
+#else
+#  define FMT_ICC_VERSION 0
+#endif
+
+#ifdef __NVCC__
+#  define FMT_CUDA_VERSION (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__)
+#else
+#  define FMT_CUDA_VERSION 0
+#endif
+
+#ifdef __has_builtin
+#  define FMT_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define FMT_HAS_BUILTIN(x) 0
+#endif
+
+#if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#  define FMT_NOINLINE __attribute__((noinline))
+#else
+#  define FMT_NOINLINE
+#endif
+
+#if __cplusplus == 201103L || __cplusplus == 201402L
+#  if defined(__INTEL_COMPILER) || defined(__PGI)
+#    define FMT_FALLTHROUGH
+#  elif defined(__clang__)
+#    define FMT_FALLTHROUGH [[clang::fallthrough]]
+#  elif FMT_GCC_VERSION >= 700 && \
+      (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520)
+#    define FMT_FALLTHROUGH [[gnu::fallthrough]]
+#  else
+#    define FMT_FALLTHROUGH
+#  endif
+#elif FMT_HAS_CPP17_ATTRIBUTE(fallthrough) || \
+    (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#  define FMT_FALLTHROUGH [[fallthrough]]
+#else
+#  define FMT_FALLTHROUGH
+#endif
+
+#ifndef FMT_MAYBE_UNUSED
+#  if FMT_HAS_CPP17_ATTRIBUTE(maybe_unused)
+#    define FMT_MAYBE_UNUSED [[maybe_unused]]
+#  else
+#    define FMT_MAYBE_UNUSED
+#  endif
+#endif
+
+#ifndef FMT_THROW
+#  if FMT_EXCEPTIONS
+#    if FMT_MSC_VER || FMT_NVCC
+FMT_BEGIN_NAMESPACE
+namespace detail {
+template <typename Exception> inline void do_throw(const Exception& x) {
+  // Silence unreachable code warnings in MSVC and NVCC because these
+  // are nearly impossible to fix in a generic code.
+  volatile bool b = true;
+  if (b) throw x;
+}
+}  // namespace detail
+FMT_END_NAMESPACE
+#      define FMT_THROW(x) detail::do_throw(x)
+#    else
+#      define FMT_THROW(x) throw x
+#    endif
+#  else
+#    define FMT_THROW(x)              \
+      do {                            \
+        static_cast<void>(sizeof(x)); \
+        FMT_ASSERT(false, "");        \
+      } while (false)
+#  endif
+#endif
+
+#if FMT_EXCEPTIONS
+#  define FMT_TRY try
+#  define FMT_CATCH(x) catch (x)
+#else
+#  define FMT_TRY if (true)
+#  define FMT_CATCH(x) if (false)
+#endif
+
+#ifndef FMT_USE_USER_DEFINED_LITERALS
+// EDG based compilers (Intel, NVIDIA, Elbrus, etc), GCC and MSVC support UDLs.
+#  if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 407 || \
+       FMT_MSC_VER >= 1900) &&                                         \
+      (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= /* UDL feature */ 480)
+#    define FMT_USE_USER_DEFINED_LITERALS 1
+#  else
+#    define FMT_USE_USER_DEFINED_LITERALS 0
+#  endif
+#endif
+
+#ifndef FMT_USE_UDL_TEMPLATE
+// EDG frontend based compilers (icc, nvcc, PGI, etc) and GCC < 6.4 do not
+// properly support UDL templates and GCC >= 9 warns about them.
+#  if FMT_USE_USER_DEFINED_LITERALS &&                         \
+      (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 501) && \
+      ((FMT_GCC_VERSION >= 604 && __cplusplus >= 201402L) ||   \
+       FMT_CLANG_VERSION >= 304) &&                            \
+      !defined(__PGI) && !defined(__NVCC__)
+#    define FMT_USE_UDL_TEMPLATE 1
+#  else
+#    define FMT_USE_UDL_TEMPLATE 0
+#  endif
+#endif
+
+#ifndef FMT_USE_FLOAT
+#  define FMT_USE_FLOAT 1
+#endif
+
+#ifndef FMT_USE_DOUBLE
+#  define FMT_USE_DOUBLE 1
+#endif
+
+#ifndef FMT_USE_LONG_DOUBLE
+#  define FMT_USE_LONG_DOUBLE 1
+#endif
+
+// Defining FMT_REDUCE_INT_INSTANTIATIONS to 1, will reduce the number of
+// int_writer template instances to just one by only using the largest integer
+// type. This results in a reduction in binary size but will cause a decrease in
+// integer formatting performance.
+#if !defined(FMT_REDUCE_INT_INSTANTIATIONS)
+#  define FMT_REDUCE_INT_INSTANTIATIONS 0
+#endif
+
+// __builtin_clz is broken in clang with Microsoft CodeGen:
+// https://github.com/fmtlib/fmt/issues/519
+#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_clz)) && !FMT_MSC_VER
+#  define FMT_BUILTIN_CLZ(n) __builtin_clz(n)
+#endif
+#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_clzll)) && !FMT_MSC_VER
+#  define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
+#endif
+#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_ctz))
+#  define FMT_BUILTIN_CTZ(n) __builtin_ctz(n)
+#endif
+#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_ctzll))
+#  define FMT_BUILTIN_CTZLL(n) __builtin_ctzll(n)
+#endif
+
+#if FMT_MSC_VER
+#  include <intrin.h>  // _BitScanReverse[64], _BitScanForward[64], _umul128
+#endif
+
+// Some compilers masquerade as both MSVC and GCC-likes or otherwise support
+// __builtin_clz and __builtin_clzll, so only define FMT_BUILTIN_CLZ using the
+// MSVC intrinsics if the clz and clzll builtins are not available.
+#if FMT_MSC_VER && !defined(FMT_BUILTIN_CLZLL) && \
+    !defined(FMT_BUILTIN_CTZLL) && !defined(_MANAGED)
+FMT_BEGIN_NAMESPACE
+namespace detail {
+// Avoid Clang with Microsoft CodeGen's -Wunknown-pragmas warning.
+#  ifndef __clang__
+#    pragma intrinsic(_BitScanForward)
+#    pragma intrinsic(_BitScanReverse)
+#  endif
+#  if defined(_WIN64) && !defined(__clang__)
+#    pragma intrinsic(_BitScanForward64)
+#    pragma intrinsic(_BitScanReverse64)
+#  endif
+
+inline int clz(uint32_t x) {
+  unsigned long r = 0;
+  _BitScanReverse(&r, x);
+  FMT_ASSERT(x != 0, "");
+  // Static analysis complains about using uninitialized data
+  // "r", but the only way that can happen is if "x" is 0,
+  // which the callers guarantee to not happen.
+  FMT_SUPPRESS_MSC_WARNING(6102)
+  return 31 ^ static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CLZ(n) detail::clz(n)
+
+inline int clzll(uint64_t x) {
+  unsigned long r = 0;
+#  ifdef _WIN64
+  _BitScanReverse64(&r, x);
+#  else
+  // Scan the high 32 bits.
+  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32))) return 63 ^ (r + 32);
+  // Scan the low 32 bits.
+  _BitScanReverse(&r, static_cast<uint32_t>(x));
+#  endif
+  FMT_ASSERT(x != 0, "");
+  FMT_SUPPRESS_MSC_WARNING(6102)  // Suppress a bogus static analysis warning.
+  return 63 ^ static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CLZLL(n) detail::clzll(n)
+
+inline int ctz(uint32_t x) {
+  unsigned long r = 0;
+  _BitScanForward(&r, x);
+  FMT_ASSERT(x != 0, "");
+  FMT_SUPPRESS_MSC_WARNING(6102)  // Suppress a bogus static analysis warning.
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZ(n) detail::ctz(n)
+
+inline int ctzll(uint64_t x) {
+  unsigned long r = 0;
+  FMT_ASSERT(x != 0, "");
+  FMT_SUPPRESS_MSC_WARNING(6102)  // Suppress a bogus static analysis warning.
+#  ifdef _WIN64
+  _BitScanForward64(&r, x);
+#  else
+  // Scan the low 32 bits.
+  if (_BitScanForward(&r, static_cast<uint32_t>(x))) return static_cast<int>(r);
+  // Scan the high 32 bits.
+  _BitScanForward(&r, static_cast<uint32_t>(x >> 32));
+  r += 32;
+#  endif
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZLL(n) detail::ctzll(n)
+}  // namespace detail
+FMT_END_NAMESPACE
+#endif
+
+// Enable the deprecated numeric alignment.
+#ifndef FMT_DEPRECATED_NUMERIC_ALIGN
+#  define FMT_DEPRECATED_NUMERIC_ALIGN 0
+#endif
+
+FMT_BEGIN_NAMESPACE
+namespace detail {
+
+// An equivalent of `*reinterpret_cast<Dest*>(&source)` that doesn't have
+// undefined behavior (e.g. due to type aliasing).
+// Example: uint64_t d = bit_cast<uint64_t>(2.718);
+template <typename Dest, typename Source>
+inline Dest bit_cast(const Source& source) {
+  static_assert(sizeof(Dest) == sizeof(Source), "size mismatch");
+  Dest dest;
+  std::memcpy(&dest, &source, sizeof(dest));
+  return dest;
+}
+
+inline bool is_big_endian() {
+  const auto u = 1u;
+  struct bytes {
+    char data[sizeof(u)];
+  };
+  return bit_cast<bytes>(u).data[0] == 0;
+}
+
+// A fallback implementation of uintptr_t for systems that lack it.
+struct fallback_uintptr {
+  unsigned char value[sizeof(void*)];
+
+  fallback_uintptr() = default;
+  explicit fallback_uintptr(const void* p) {
+    *this = bit_cast<fallback_uintptr>(p);
+    if (is_big_endian()) {
+      for (size_t i = 0, j = sizeof(void*) - 1; i < j; ++i, --j)
+        std::swap(value[i], value[j]);
+    }
+  }
+};
+#ifdef UINTPTR_MAX
+using uintptr_t = ::uintptr_t;
+inline uintptr_t to_uintptr(const void* p) { return bit_cast<uintptr_t>(p); }
+#else
+using uintptr_t = fallback_uintptr;
+inline fallback_uintptr to_uintptr(const void* p) {
+  return fallback_uintptr(p);
+}
+#endif
+
+// Returns the largest possible value for type T. Same as
+// std::numeric_limits<T>::max() but shorter and not affected by the max macro.
+template <typename T> constexpr T max_value() {
+  return (std::numeric_limits<T>::max)();
+}
+template <typename T> constexpr int num_bits() {
+  return std::numeric_limits<T>::digits;
+}
+// std::numeric_limits<T>::digits may return 0 for 128-bit ints.
+template <> constexpr int num_bits<int128_t>() { return 128; }
+template <> constexpr int num_bits<uint128_t>() { return 128; }
+template <> constexpr int num_bits<fallback_uintptr>() {
+  return static_cast<int>(sizeof(void*) *
+                          std::numeric_limits<unsigned char>::digits);
+}
+
+FMT_INLINE void assume(bool condition) {
+  (void)condition;
+#if FMT_HAS_BUILTIN(__builtin_assume)
+  __builtin_assume(condition);
+#endif
+}
+
+// An approximation of iterator_t for pre-C++20 systems.
+template <typename T>
+using iterator_t = decltype(std::begin(std::declval<T&>()));
+template <typename T> using sentinel_t = decltype(std::end(std::declval<T&>()));
+
+// A workaround for std::string not having mutable data() until C++17.
+template <typename Char> inline Char* get_data(std::basic_string<Char>& s) {
+  return &s[0];
+}
+template <typename Container>
+inline typename Container::value_type* get_data(Container& c) {
+  return c.data();
+}
+
+#if defined(_SECURE_SCL) && _SECURE_SCL
+// Make a checked iterator to avoid MSVC warnings.
+template <typename T> using checked_ptr = stdext::checked_array_iterator<T*>;
+template <typename T> checked_ptr<T> make_checked(T* p, size_t size) {
+  return {p, size};
+}
+#else
+template <typename T> using checked_ptr = T*;
+template <typename T> inline T* make_checked(T* p, size_t) { return p; }
+#endif
+
+template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
+#if FMT_CLANG_VERSION
+__attribute__((no_sanitize("undefined")))
+#endif
+inline checked_ptr<typename Container::value_type>
+reserve(std::back_insert_iterator<Container> it, size_t n) {
+  Container& c = get_container(it);
+  size_t size = c.size();
+  c.resize(size + n);
+  return make_checked(get_data(c) + size, n);
+}
+
+template <typename T>
+inline buffer_appender<T> reserve(buffer_appender<T> it, size_t n) {
+  buffer<T>& buf = get_container(it);
+  buf.try_reserve(buf.size() + n);
+  return it;
+}
+
+template <typename Iterator> inline Iterator& reserve(Iterator& it, size_t) {
+  return it;
+}
+
+template <typename T, typename OutputIt>
+constexpr T* to_pointer(OutputIt, size_t) {
+  return nullptr;
+}
+template <typename T> T* to_pointer(buffer_appender<T> it, size_t n) {
+  buffer<T>& buf = get_container(it);
+  auto size = buf.size();
+  if (buf.capacity() < size + n) return nullptr;
+  buf.try_resize(size + n);
+  return buf.data() + size;
+}
+
+template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
+inline std::back_insert_iterator<Container> base_iterator(
+    std::back_insert_iterator<Container>& it,
+    checked_ptr<typename Container::value_type>) {
+  return it;
+}
+
+template <typename Iterator>
+inline Iterator base_iterator(Iterator, Iterator it) {
+  return it;
+}
+
+// An output iterator that counts the number of objects written to it and
+// discards them.
+class counting_iterator {
+ private:
+  size_t count_;
+
+ public:
+  using iterator_category = std::output_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using pointer = void;
+  using reference = void;
+  using _Unchecked_type = counting_iterator;  // Mark iterator as checked.
+
+  struct value_type {
+    template <typename T> void operator=(const T&) {}
+  };
+
+  counting_iterator() : count_(0) {}
+
+  size_t count() const { return count_; }
+
+  counting_iterator& operator++() {
+    ++count_;
+    return *this;
+  }
+  counting_iterator operator++(int) {
+    auto it = *this;
+    ++*this;
+    return it;
+  }
+
+  friend counting_iterator operator+(counting_iterator it, difference_type n) {
+    it.count_ += static_cast<size_t>(n);
+    return it;
+  }
+
+  value_type operator*() const { return {}; }
+};
+
+template <typename OutputIt> class truncating_iterator_base {
+ protected:
+  OutputIt out_;
+  size_t limit_;
+  size_t count_;
+
+  truncating_iterator_base(OutputIt out, size_t limit)
+      : out_(out), limit_(limit), count_(0) {}
+
+ public:
+  using iterator_category = std::output_iterator_tag;
+  using value_type = typename std::iterator_traits<OutputIt>::value_type;
+  using difference_type = void;
+  using pointer = void;
+  using reference = void;
+  using _Unchecked_type =
+      truncating_iterator_base;  // Mark iterator as checked.
+
+  OutputIt base() const { return out_; }
+  size_t count() const { return count_; }
+};
+
+// An output iterator that truncates the output and counts the number of objects
+// written to it.
+template <typename OutputIt,
+          typename Enable = typename std::is_void<
+              typename std::iterator_traits<OutputIt>::value_type>::type>
+class truncating_iterator;
+
+template <typename OutputIt>
+class truncating_iterator<OutputIt, std::false_type>
+    : public truncating_iterator_base<OutputIt> {
+  mutable typename truncating_iterator_base<OutputIt>::value_type blackhole_;
+
+ public:
+  using value_type = typename truncating_iterator_base<OutputIt>::value_type;
+
+  truncating_iterator(OutputIt out, size_t limit)
+      : truncating_iterator_base<OutputIt>(out, limit) {}
+
+  truncating_iterator& operator++() {
+    if (this->count_++ < this->limit_) ++this->out_;
+    return *this;
+  }
+
+  truncating_iterator operator++(int) {
+    auto it = *this;
+    ++*this;
+    return it;
+  }
+
+  value_type& operator*() const {
+    return this->count_ < this->limit_ ? *this->out_ : blackhole_;
+  }
+};
+
+template <typename OutputIt>
+class truncating_iterator<OutputIt, std::true_type>
+    : public truncating_iterator_base<OutputIt> {
+ public:
+  truncating_iterator(OutputIt out, size_t limit)
+      : truncating_iterator_base<OutputIt>(out, limit) {}
+
+  template <typename T> truncating_iterator& operator=(T val) {
+    if (this->count_++ < this->limit_) *this->out_++ = val;
+    return *this;
+  }
+
+  truncating_iterator& operator++() { return *this; }
+  truncating_iterator& operator++(int) { return *this; }
+  truncating_iterator& operator*() { return *this; }
+};
+
+template <typename Char>
+inline size_t count_code_points(basic_string_view<Char> s) {
+  return s.size();
+}
+
+// Counts the number of code points in a UTF-8 string.
+inline size_t count_code_points(basic_string_view<char> s) {
+  const char* data = s.data();
+  size_t num_code_points = 0;
+  for (size_t i = 0, size = s.size(); i != size; ++i) {
+    if ((data[i] & 0xc0) != 0x80) ++num_code_points;
+  }
+  return num_code_points;
+}
+
+inline size_t count_code_points(basic_string_view<char8_type> s) {
+  return count_code_points(basic_string_view<char>(
+      reinterpret_cast<const char*>(s.data()), s.size()));
+}
+
+template <typename Char>
+inline size_t code_point_index(basic_string_view<Char> s, size_t n) {
+  size_t size = s.size();
+  return n < size ? n : size;
+}
+
+// Calculates the index of the nth code point in a UTF-8 string.
+inline size_t code_point_index(basic_string_view<char8_type> s, size_t n) {
+  const char8_type* data = s.data();
+  size_t num_code_points = 0;
+  for (size_t i = 0, size = s.size(); i != size; ++i) {
+    if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) {
+      return i;
+    }
+  }
+  return s.size();
+}
+
+template <typename InputIt, typename OutChar>
+using needs_conversion = bool_constant<
+    std::is_same<typename std::iterator_traits<InputIt>::value_type,
+                 char>::value &&
+    std::is_same<OutChar, char8_type>::value>;
+
+template <typename OutChar, typename InputIt, typename OutputIt,
+          FMT_ENABLE_IF(!needs_conversion<InputIt, OutChar>::value)>
+OutputIt copy_str(InputIt begin, InputIt end, OutputIt it) {
+  return std::copy(begin, end, it);
+}
+
+template <typename OutChar, typename InputIt, typename OutputIt,
+          FMT_ENABLE_IF(needs_conversion<InputIt, OutChar>::value)>
+OutputIt copy_str(InputIt begin, InputIt end, OutputIt it) {
+  return std::transform(begin, end, it,
+                        [](char c) { return static_cast<char8_type>(c); });
+}
+
+template <typename Char, typename InputIt>
+inline counting_iterator copy_str(InputIt begin, InputIt end,
+                                  counting_iterator it) {
+  return it + (end - begin);
+}
+
+template <typename T>
+using is_fast_float = bool_constant<std::numeric_limits<T>::is_iec559 &&
+                                    sizeof(T) <= sizeof(double)>;
+
+#ifndef FMT_USE_FULL_CACHE_DRAGONBOX
+#  define FMT_USE_FULL_CACHE_DRAGONBOX 0
+#endif
+
+template <typename T>
+template <typename U>
+void buffer<T>::append(const U* begin, const U* end) {
+  do {
+    auto count = to_unsigned(end - begin);
+    try_reserve(size_ + count);
+    auto free_cap = capacity_ - size_;
+    if (free_cap < count) count = free_cap;
+    std::uninitialized_copy_n(begin, count, make_checked(ptr_ + size_, count));
+    size_ += count;
+    begin += count;
+  } while (begin != end);
+}
+
+template <typename OutputIt, typename T, typename Traits>
+void iterator_buffer<OutputIt, T, Traits>::flush() {
+  out_ = std::copy_n(data_, this->limit(this->size()), out_);
+  this->clear();
+}
+}  // namespace detail
+
+// The number of characters to store in the basic_memory_buffer object itself
+// to avoid dynamic memory allocation.
+enum { inline_buffer_size = 500 };
+
+/**
+  \rst
+  A dynamically growing memory buffer for trivially copyable/constructible types
+  with the first ``SIZE`` elements stored in the object itself.
+
+  You can use one of the following type aliases for common character types:
+
+  +----------------+------------------------------+
+  | Type           | Definition                   |
+  +================+==============================+
+  | memory_buffer  | basic_memory_buffer<char>    |
+  +----------------+------------------------------+
+  | wmemory_buffer | basic_memory_buffer<wchar_t> |
+  +----------------+------------------------------+
+
+  **Example**::
+
+     fmt::memory_buffer out;
+     format_to(out, "The answer is {}.", 42);
+
+  This will append the following output to the ``out`` object:
+
+  .. code-block:: none
+
+     The answer is 42.
+
+  The output can be converted to an ``std::string`` with ``to_string(out)``.
+  \endrst
+ */
+template <typename T, size_t SIZE = inline_buffer_size,
+          typename Allocator = std::allocator<T>>
+class basic_memory_buffer final : public detail::buffer<T> {
+ private:
+  T store_[SIZE];
+
+  // Don't inherit from Allocator avoid generating type_info for it.
+  Allocator alloc_;
+
+  // Deallocate memory allocated by the buffer.
+  void deallocate() {
+    T* data = this->data();
+    if (data != store_) alloc_.deallocate(data, this->capacity());
+  }
+
+ protected:
+  void grow(size_t size) final FMT_OVERRIDE;
+
+ public:
+  using value_type = T;
+  using const_reference = const T&;
+
+  explicit basic_memory_buffer(const Allocator& alloc = Allocator())
+      : alloc_(alloc) {
+    this->set(store_, SIZE);
+  }
+  ~basic_memory_buffer() { deallocate(); }
+
+ private:
+  // Move data from other to this buffer.
+  void move(basic_memory_buffer& other) {
+    alloc_ = std::move(other.alloc_);
+    T* data = other.data();
+    size_t size = other.size(), capacity = other.capacity();
+    if (data == other.store_) {
+      this->set(store_, capacity);
+      std::uninitialized_copy(other.store_, other.store_ + size,
+                              detail::make_checked(store_, capacity));
+    } else {
+      this->set(data, capacity);
+      // Set pointer to the inline array so that delete is not called
+      // when deallocating.
+      other.set(other.store_, 0);
+    }
+    this->resize(size);
+  }
+
+ public:
+  /**
+    \rst
+    Constructs a :class:`fmt::basic_memory_buffer` object moving the content
+    of the other object to it.
+    \endrst
+   */
+  basic_memory_buffer(basic_memory_buffer&& other) FMT_NOEXCEPT { move(other); }
+
+  /**
+    \rst
+    Moves the content of the other ``basic_memory_buffer`` object to this one.
+    \endrst
+   */
+  basic_memory_buffer& operator=(basic_memory_buffer&& other) FMT_NOEXCEPT {
+    FMT_ASSERT(this != &other, "");
+    deallocate();
+    move(other);
+    return *this;
+  }
+
+  // Returns a copy of the allocator associated with this buffer.
+  Allocator get_allocator() const { return alloc_; }
+
+  /**
+    Resizes the buffer to contain *count* elements. If T is a POD type new
+    elements may not be initialized.
+   */
+  void resize(size_t count) { this->try_resize(count); }
+
+  /** Increases the buffer capacity to *new_capacity*. */
+  void reserve(size_t new_capacity) { this->try_reserve(new_capacity); }
+
+  // Directly append data into the buffer
+  using detail::buffer<T>::append;
+  template <typename ContiguousRange>
+  void append(const ContiguousRange& range) {
+    append(range.data(), range.data() + range.size());
+  }
+};
+
+template <typename T, size_t SIZE, typename Allocator>
+void basic_memory_buffer<T, SIZE, Allocator>::grow(size_t size) {
+#ifdef FMT_FUZZ
+  if (size > 5000) throw std::runtime_error("fuzz mode - won't grow that much");
+#endif
+  size_t old_capacity = this->capacity();
+  size_t new_capacity = old_capacity + old_capacity / 2;
+  if (size > new_capacity) new_capacity = size;
+  T* old_data = this->data();
+  T* new_data =
+      std::allocator_traits<Allocator>::allocate(alloc_, new_capacity);
+  // The following code doesn't throw, so the raw pointer above doesn't leak.
+  std::uninitialized_copy(old_data, old_data + this->size(),
+                          detail::make_checked(new_data, new_capacity));
+  this->set(new_data, new_capacity);
+  // deallocate must not throw according to the standard, but even if it does,
+  // the buffer already uses the new storage and will deallocate it in
+  // destructor.
+  if (old_data != store_) alloc_.deallocate(old_data, old_capacity);
+}
+
+using memory_buffer = basic_memory_buffer<char>;
+using wmemory_buffer = basic_memory_buffer<wchar_t>;
+
+template <typename T, size_t SIZE, typename Allocator>
+struct is_contiguous<basic_memory_buffer<T, SIZE, Allocator>> : std::true_type {
+};
+
+/** A formatting error such as invalid format string. */
+FMT_CLASS_API
+class FMT_API format_error : public std::runtime_error {
+ public:
+  explicit format_error(const char* message) : std::runtime_error(message) {}
+  explicit format_error(const std::string& message)
+      : std::runtime_error(message) {}
+  format_error(const format_error&) = default;
+  format_error& operator=(const format_error&) = default;
+  format_error(format_error&&) = default;
+  format_error& operator=(format_error&&) = default;
+  ~format_error() FMT_NOEXCEPT FMT_OVERRIDE;
+};
+
+namespace detail {
+
+template <typename T>
+using is_signed =
+    std::integral_constant<bool, std::numeric_limits<T>::is_signed ||
+                                     std::is_same<T, int128_t>::value>;
+
+// Returns true if value is negative, false otherwise.
+// Same as `value < 0` but doesn't produce warnings if T is an unsigned type.
+template <typename T, FMT_ENABLE_IF(is_signed<T>::value)>
+FMT_CONSTEXPR bool is_negative(T value) {
+  return value < 0;
+}
+template <typename T, FMT_ENABLE_IF(!is_signed<T>::value)>
+FMT_CONSTEXPR bool is_negative(T) {
+  return false;
+}
+
+template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
+FMT_CONSTEXPR bool is_supported_floating_point(T) {
+  return (std::is_same<T, float>::value && FMT_USE_FLOAT) ||
+         (std::is_same<T, double>::value && FMT_USE_DOUBLE) ||
+         (std::is_same<T, long double>::value && FMT_USE_LONG_DOUBLE);
+}
+
+// Smallest of uint32_t, uint64_t, uint128_t that is large enough to
+// represent all values of an integral type T.
+template <typename T>
+using uint32_or_64_or_128_t =
+    conditional_t<num_bits<T>() <= 32 && !FMT_REDUCE_INT_INSTANTIATIONS,
+                  uint32_t,
+                  conditional_t<num_bits<T>() <= 64, uint64_t, uint128_t>>;
+
+// 128-bit integer type used internally
+struct FMT_EXTERN_TEMPLATE_API uint128_wrapper {
+  uint128_wrapper() = default;
+
+#if FMT_USE_INT128
+  uint128_t internal_;
+
+  uint128_wrapper(uint64_t high, uint64_t low) FMT_NOEXCEPT
+      : internal_{static_cast<uint128_t>(low) |
+                  (static_cast<uint128_t>(high) << 64)} {}
+
+  uint128_wrapper(uint128_t u) : internal_{u} {}
+
+  uint64_t high() const FMT_NOEXCEPT { return uint64_t(internal_ >> 64); }
+  uint64_t low() const FMT_NOEXCEPT { return uint64_t(internal_); }
+
+  uint128_wrapper& operator+=(uint64_t n) FMT_NOEXCEPT {
+    internal_ += n;
+    return *this;
+  }
+#else
+  uint64_t high_;
+  uint64_t low_;
+
+  uint128_wrapper(uint64_t high, uint64_t low) FMT_NOEXCEPT : high_{high},
+                                                              low_{low} {}
+
+  uint64_t high() const FMT_NOEXCEPT { return high_; }
+  uint64_t low() const FMT_NOEXCEPT { return low_; }
+
+  uint128_wrapper& operator+=(uint64_t n) FMT_NOEXCEPT {
+#  if defined(_MSC_VER) && defined(_M_X64)
+    unsigned char carry = _addcarry_u64(0, low_, n, &low_);
+    _addcarry_u64(carry, high_, 0, &high_);
+    return *this;
+#  else
+    uint64_t sum = low_ + n;
+    high_ += (sum < low_ ? 1 : 0);
+    low_ = sum;
+    return *this;
+#  endif
+  }
+#endif
+};
+
+// Table entry type for divisibility test used internally
+template <typename T> struct FMT_EXTERN_TEMPLATE_API divtest_table_entry {
+  T mod_inv;
+  T max_quotient;
+};
+
+// Static data is placed in this class template for the header-only config.
+template <typename T = void> struct FMT_EXTERN_TEMPLATE_API basic_data {
+  static const uint64_t powers_of_10_64[];
+  static const uint32_t zero_or_powers_of_10_32_new[];
+  static const uint64_t zero_or_powers_of_10_64_new[];
+  static const uint64_t grisu_pow10_significands[];
+  static const int16_t grisu_pow10_exponents[];
+  static const divtest_table_entry<uint32_t> divtest_table_for_pow5_32[];
+  static const divtest_table_entry<uint64_t> divtest_table_for_pow5_64[];
+  static const uint64_t dragonbox_pow10_significands_64[];
+  static const uint128_wrapper dragonbox_pow10_significands_128[];
+  // log10(2) = 0x0.4d104d427de7fbcc...
+  static const uint64_t log10_2_significand = 0x4d104d427de7fbcc;
+#if !FMT_USE_FULL_CACHE_DRAGONBOX
+  static const uint64_t powers_of_5_64[];
+  static const uint32_t dragonbox_pow10_recovery_errors[];
+#endif
+  // GCC generates slightly better code for pairs than chars.
+  using digit_pair = char[2];
+  static const digit_pair digits[];
+  static const char hex_digits[];
+  static const char foreground_color[];
+  static const char background_color[];
+  static const char reset_color[5];
+  static const wchar_t wreset_color[5];
+  static const char signs[];
+  static const char left_padding_shifts[5];
+  static const char right_padding_shifts[5];
+
+  // DEPRECATED! These are for ABI compatibility.
+  static const uint32_t zero_or_powers_of_10_32[];
+  static const uint64_t zero_or_powers_of_10_64[];
+};
+
+// Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)).
+// This is a function instead of an array to workaround a bug in GCC10 (#1810).
+FMT_INLINE uint16_t bsr2log10(int bsr) {
+  static constexpr uint16_t data[] = {
+      1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  4,  4,  5,  5,  5,
+      6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  9,  9,  9,  10, 10, 10,
+      10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15,
+      15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20};
+  return data[bsr];
+}
+
+#ifndef FMT_EXPORTED
+FMT_EXTERN template struct basic_data<void>;
+#endif
+
+// This is a struct rather than an alias to avoid shadowing warnings in gcc.
+struct data : basic_data<> {};
+
+#ifdef FMT_BUILTIN_CLZLL
+// Returns the number of decimal digits in n. Leading zeros are not counted
+// except for n == 0 in which case count_digits returns 1.
+inline int count_digits(uint64_t n) {
+  // https://github.com/fmtlib/format-benchmark/blob/master/digits10
+  auto t = bsr2log10(FMT_BUILTIN_CLZLL(n | 1) ^ 63);
+  return t - (n < data::zero_or_powers_of_10_64_new[t]);
+}
+#else
+// Fallback version of count_digits used when __builtin_clz is not available.
+inline int count_digits(uint64_t n) {
+  int count = 1;
+  for (;;) {
+    // Integer division is slow so do it for a group of four digits instead
+    // of for every digit. The idea comes from the talk by Alexandrescu
+    // "Three Optimization Tips for C++". See speed-test for a comparison.
+    if (n < 10) return count;
+    if (n < 100) return count + 1;
+    if (n < 1000) return count + 2;
+    if (n < 10000) return count + 3;
+    n /= 10000u;
+    count += 4;
+  }
+}
+#endif
+
+#if FMT_USE_INT128
+inline int count_digits(uint128_t n) {
+  int count = 1;
+  for (;;) {
+    // Integer division is slow so do it for a group of four digits instead
+    // of for every digit. The idea comes from the talk by Alexandrescu
+    // "Three Optimization Tips for C++". See speed-test for a comparison.
+    if (n < 10) return count;
+    if (n < 100) return count + 1;
+    if (n < 1000) return count + 2;
+    if (n < 10000) return count + 3;
+    n /= 10000U;
+    count += 4;
+  }
+}
+#endif
+
+// Counts the number of digits in n. BITS = log2(radix).
+template <unsigned BITS, typename UInt> inline int count_digits(UInt n) {
+  int num_digits = 0;
+  do {
+    ++num_digits;
+  } while ((n >>= BITS) != 0);
+  return num_digits;
+}
+
+template <> int count_digits<4>(detail::fallback_uintptr n);
+
+#if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#  define FMT_ALWAYS_INLINE inline __attribute__((always_inline))
+#elif FMT_MSC_VER
+#  define FMT_ALWAYS_INLINE __forceinline
+#else
+#  define FMT_ALWAYS_INLINE inline
+#endif
+
+// To suppress unnecessary security cookie checks
+#if FMT_MSC_VER && !FMT_CLANG_VERSION
+#  define FMT_SAFEBUFFERS __declspec(safebuffers)
+#else
+#  define FMT_SAFEBUFFERS
+#endif
+
+#ifdef FMT_BUILTIN_CLZ
+// Optional version of count_digits for better performance on 32-bit platforms.
+inline int count_digits(uint32_t n) {
+  auto t = bsr2log10(FMT_BUILTIN_CLZ(n | 1) ^ 31);
+  return t - (n < data::zero_or_powers_of_10_32_new[t]);
+}
+#endif
+
+template <typename Int> constexpr int digits10() FMT_NOEXCEPT {
+  return std::numeric_limits<Int>::digits10;
+}
+template <> constexpr int digits10<int128_t>() FMT_NOEXCEPT { return 38; }
+template <> constexpr int digits10<uint128_t>() FMT_NOEXCEPT { return 38; }
+
+template <typename Char> FMT_API std::string grouping_impl(locale_ref loc);
+template <typename Char> inline std::string grouping(locale_ref loc) {
+  return grouping_impl<char>(loc);
+}
+template <> inline std::string grouping<wchar_t>(locale_ref loc) {
+  return grouping_impl<wchar_t>(loc);
+}
+
+template <typename Char> FMT_API Char thousands_sep_impl(locale_ref loc);
+template <typename Char> inline Char thousands_sep(locale_ref loc) {
+  return Char(thousands_sep_impl<char>(loc));
+}
+template <> inline wchar_t thousands_sep(locale_ref loc) {
+  return thousands_sep_impl<wchar_t>(loc);
+}
+
+template <typename Char> FMT_API Char decimal_point_impl(locale_ref loc);
+template <typename Char> inline Char decimal_point(locale_ref loc) {
+  return Char(decimal_point_impl<char>(loc));
+}
+template <> inline wchar_t decimal_point(locale_ref loc) {
+  return decimal_point_impl<wchar_t>(loc);
+}
+
+// Compares two characters for equality.
+template <typename Char> bool equal2(const Char* lhs, const char* rhs) {
+  return lhs[0] == rhs[0] && lhs[1] == rhs[1];
+}
+inline bool equal2(const char* lhs, const char* rhs) {
+  return memcmp(lhs, rhs, 2) == 0;
+}
+
+// Copies two characters from src to dst.
+template <typename Char> void copy2(Char* dst, const char* src) {
+  *dst++ = static_cast<Char>(*src++);
+  *dst = static_cast<Char>(*src);
+}
+FMT_INLINE void copy2(char* dst, const char* src) { memcpy(dst, src, 2); }
+
+template <typename Iterator> struct format_decimal_result {
+  Iterator begin;
+  Iterator end;
+};
+
+// Formats a decimal unsigned integer value writing into out pointing to a
+// buffer of specified size. The caller must ensure that the buffer is large
+// enough.
+template <typename Char, typename UInt>
+inline format_decimal_result<Char*> format_decimal(Char* out, UInt value,
+                                                   int size) {
+  FMT_ASSERT(size >= count_digits(value), "invalid digit count");
+  out += size;
+  Char* end = out;
+  while (value >= 100) {
+    // Integer division is slow so do it for a group of two digits instead
+    // of for every digit. The idea comes from the talk by Alexandrescu
+    // "Three Optimization Tips for C++". See speed-test for a comparison.
+    out -= 2;
+    copy2(out, data::digits[value % 100]);
+    value /= 100;
+  }
+  if (value < 10) {
+    *--out = static_cast<Char>('0' + value);
+    return {out, end};
+  }
+  out -= 2;
+  copy2(out, data::digits[value]);
+  return {out, end};
+}
+
+template <typename Char, typename UInt, typename Iterator,
+          FMT_ENABLE_IF(!std::is_pointer<remove_cvref_t<Iterator>>::value)>
+inline format_decimal_result<Iterator> format_decimal(Iterator out, UInt value,
+                                                      int size) {
+  // Buffer is large enough to hold all digits (digits10 + 1).
+  Char buffer[digits10<UInt>() + 1];
+  auto end = format_decimal(buffer, value, size).end;
+  return {out, detail::copy_str<Char>(buffer, end, out)};
+}
+
+template <unsigned BASE_BITS, typename Char, typename UInt>
+inline Char* format_uint(Char* buffer, UInt value, int num_digits,
+                         bool upper = false) {
+  buffer += num_digits;
+  Char* end = buffer;
+  do {
+    const char* digits = upper ? "0123456789ABCDEF" : "0123456789abcdef";
+    unsigned digit = (value & ((1 << BASE_BITS) - 1));
+    *--buffer = static_cast<Char>(BASE_BITS < 4 ? static_cast<char>('0' + digit)
+                                                : digits[digit]);
+  } while ((value >>= BASE_BITS) != 0);
+  return end;
+}
+
+template <unsigned BASE_BITS, typename Char>
+Char* format_uint(Char* buffer, detail::fallback_uintptr n, int num_digits,
+                  bool = false) {
+  auto char_digits = std::numeric_limits<unsigned char>::digits / 4;
+  int start = (num_digits + char_digits - 1) / char_digits - 1;
+  if (int start_digits = num_digits % char_digits) {
+    unsigned value = n.value[start--];
+    buffer = format_uint<BASE_BITS>(buffer, value, start_digits);
+  }
+  for (; start >= 0; --start) {
+    unsigned value = n.value[start];
+    buffer += char_digits;
+    auto p = buffer;
+    for (int i = 0; i < char_digits; ++i) {
+      unsigned digit = (value & ((1 << BASE_BITS) - 1));
+      *--p = static_cast<Char>(data::hex_digits[digit]);
+      value >>= BASE_BITS;
+    }
+  }
+  return buffer;
+}
+
+template <unsigned BASE_BITS, typename Char, typename It, typename UInt>
+inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
+  if (auto ptr = to_pointer<Char>(out, to_unsigned(num_digits))) {
+    format_uint<BASE_BITS>(ptr, value, num_digits, upper);
+    return out;
+  }
+  // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1).
+  char buffer[num_bits<UInt>() / BASE_BITS + 1];
+  format_uint<BASE_BITS>(buffer, value, num_digits, upper);
+  return detail::copy_str<Char>(buffer, buffer + num_digits, out);
+}
+
+// A converter from UTF-8 to UTF-16.
+class utf8_to_utf16 {
+ private:
+  wmemory_buffer buffer_;
+
+ public:
+  FMT_API explicit utf8_to_utf16(string_view s);
+  operator wstring_view() const { return {&buffer_[0], size()}; }
+  size_t size() const { return buffer_.size() - 1; }
+  const wchar_t* c_str() const { return &buffer_[0]; }
+  std::wstring str() const { return {&buffer_[0], size()}; }
+};
+
+template <typename T = void> struct null {};
+
+// Workaround an array initialization issue in gcc 4.8.
+template <typename Char> struct fill_t {
+ private:
+  enum { max_size = 4 };
+  Char data_[max_size] = {Char(' '), Char(0), Char(0), Char(0)};
+  unsigned char size_ = 1;
+
+ public:
+  FMT_CONSTEXPR void operator=(basic_string_view<Char> s) {
+    auto size = s.size();
+    if (size > max_size) {
+      FMT_THROW(format_error("invalid fill"));
+      return;
+    }
+    for (size_t i = 0; i < size; ++i) data_[i] = s[i];
+    size_ = static_cast<unsigned char>(size);
+  }
+
+  size_t size() const { return size_; }
+  const Char* data() const { return data_; }
+
+  FMT_CONSTEXPR Char& operator[](size_t index) { return data_[index]; }
+  FMT_CONSTEXPR const Char& operator[](size_t index) const {
+    return data_[index];
+  }
+};
+}  // namespace detail
+
+// We cannot use enum classes as bit fields because of a gcc bug
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414.
+namespace align {
+enum type { none, left, right, center, numeric };
+}
+using align_t = align::type;
+
+namespace sign {
+enum type { none, minus, plus, space };
+}
+using sign_t = sign::type;
+
+// Format specifiers for built-in and string types.
+template <typename Char> struct basic_format_specs {
+  int width;
+  int precision;
+  char type;
+  align_t align : 4;
+  sign_t sign : 3;
+  bool alt : 1;  // Alternate form ('#').
+  detail::fill_t<Char> fill;
+
+  constexpr basic_format_specs()
+      : width(0),
+        precision(-1),
+        type(0),
+        align(align::none),
+        sign(sign::none),
+        alt(false) {}
+};
+
+using format_specs = basic_format_specs<char>;
+
+namespace detail {
+namespace dragonbox {
+
+// Type-specific information that Dragonbox uses.
+template <class T> struct float_info;
+
+template <> struct float_info<float> {
+  using carrier_uint = uint32_t;
+  static const int significand_bits = 23;
+  static const int exponent_bits = 8;
+  static const int min_exponent = -126;
+  static const int max_exponent = 127;
+  static const int exponent_bias = -127;
+  static const int decimal_digits = 9;
+  static const int kappa = 1;
+  static const int big_divisor = 100;
+  static const int small_divisor = 10;
+  static const int min_k = -31;
+  static const int max_k = 46;
+  static const int cache_bits = 64;
+  static const int divisibility_check_by_5_threshold = 39;
+  static const int case_fc_pm_half_lower_threshold = -1;
+  static const int case_fc_pm_half_upper_threshold = 6;
+  static const int case_fc_lower_threshold = -2;
+  static const int case_fc_upper_threshold = 6;
+  static const int case_shorter_interval_left_endpoint_lower_threshold = 2;
+  static const int case_shorter_interval_left_endpoint_upper_threshold = 3;
+  static const int shorter_interval_tie_lower_threshold = -35;
+  static const int shorter_interval_tie_upper_threshold = -35;
+  static const int max_trailing_zeros = 7;
+};
+
+template <> struct float_info<double> {
+  using carrier_uint = uint64_t;
+  static const int significand_bits = 52;
+  static const int exponent_bits = 11;
+  static const int min_exponent = -1022;
+  static const int max_exponent = 1023;
+  static const int exponent_bias = -1023;
+  static const int decimal_digits = 17;
+  static const int kappa = 2;
+  static const int big_divisor = 1000;
+  static const int small_divisor = 100;
+  static const int min_k = -292;
+  static const int max_k = 326;
+  static const int cache_bits = 128;
+  static const int divisibility_check_by_5_threshold = 86;
+  static const int case_fc_pm_half_lower_threshold = -2;
+  static const int case_fc_pm_half_upper_threshold = 9;
+  static const int case_fc_lower_threshold = -4;
+  static const int case_fc_upper_threshold = 9;
+  static const int case_shorter_interval_left_endpoint_lower_threshold = 2;
+  static const int case_shorter_interval_left_endpoint_upper_threshold = 3;
+  static const int shorter_interval_tie_lower_threshold = -77;
+  static const int shorter_interval_tie_upper_threshold = -77;
+  static const int max_trailing_zeros = 16;
+};
+
+template <typename T> struct decimal_fp {
+  using significand_type = typename float_info<T>::carrier_uint;
+  significand_type significand;
+  int exponent;
+};
+
+template <typename T> FMT_API decimal_fp<T> to_decimal(T x) FMT_NOEXCEPT;
+}  // namespace dragonbox
+
+template <typename T>
+constexpr typename dragonbox::float_info<T>::carrier_uint exponent_mask() {
+  using uint = typename dragonbox::float_info<T>::carrier_uint;
+  return ((uint(1) << dragonbox::float_info<T>::exponent_bits) - 1)
+         << dragonbox::float_info<T>::significand_bits;
+}
+
+// A floating-point presentation format.
+enum class float_format : unsigned char {
+  general,  // General: exponent notation or fixed point based on magnitude.
+  exp,      // Exponent notation with the default precision of 6, e.g. 1.2e-3.
+  fixed,    // Fixed point with the default precision of 6, e.g. 0.0012.
+  hex
+};
+
+struct float_specs {
+  int precision;
+  float_format format : 8;
+  sign_t sign : 8;
+  bool upper : 1;
+  bool locale : 1;
+  bool binary32 : 1;
+  bool use_grisu : 1;
+  bool showpoint : 1;
+};
+
+// Writes the exponent exp in the form "[+-]d{2,3}" to buffer.
+template <typename Char, typename It> It write_exponent(int exp, It it) {
+  FMT_ASSERT(-10000 < exp && exp < 10000, "exponent out of range");
+  if (exp < 0) {
+    *it++ = static_cast<Char>('-');
+    exp = -exp;
+  } else {
+    *it++ = static_cast<Char>('+');
+  }
+  if (exp >= 100) {
+    const char* top = data::digits[exp / 100];
+    if (exp >= 1000) *it++ = static_cast<Char>(top[0]);
+    *it++ = static_cast<Char>(top[1]);
+    exp %= 100;
+  }
+  const char* d = data::digits[exp];
+  *it++ = static_cast<Char>(d[0]);
+  *it++ = static_cast<Char>(d[1]);
+  return it;
+}
+
+template <typename T>
+int format_float(T value, int precision, float_specs specs, buffer<char>& buf);
+
+// Formats a floating-point number with snprintf.
+template <typename T>
+int snprintf_float(T value, int precision, float_specs specs,
+                   buffer<char>& buf);
+
+template <typename T> T promote_float(T value) { return value; }
+inline double promote_float(float value) { return static_cast<double>(value); }
+
+template <typename Handler>
+FMT_CONSTEXPR void handle_int_type_spec(char spec, Handler&& handler) {
+  switch (spec) {
+  case 0:
+  case 'd':
+    handler.on_dec();
+    break;
+  case 'x':
+  case 'X':
+    handler.on_hex();
+    break;
+  case 'b':
+  case 'B':
+    handler.on_bin();
+    break;
+  case 'o':
+    handler.on_oct();
+    break;
+#ifdef FMT_DEPRECATED_N_SPECIFIER
+  case 'n':
+#endif
+  case 'L':
+    handler.on_num();
+    break;
+  case 'c':
+    handler.on_chr();
+    break;
+  default:
+    handler.on_error();
+  }
+}
+
+template <typename ErrorHandler = error_handler, typename Char>
+FMT_CONSTEXPR float_specs parse_float_type_spec(
+    const basic_format_specs<Char>& specs, ErrorHandler&& eh = {}) {
+  auto result = float_specs();
+  result.showpoint = specs.alt;
+  switch (specs.type) {
+  case 0:
+    result.format = float_format::general;
+    result.showpoint |= specs.precision > 0;
+    break;
+  case 'G':
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case 'g':
+    result.format = float_format::general;
+    break;
+  case 'E':
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case 'e':
+    result.format = float_format::exp;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case 'F':
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case 'f':
+    result.format = float_format::fixed;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case 'A':
+    result.upper = true;
+    FMT_FALLTHROUGH;
+  case 'a':
+    result.format = float_format::hex;
+    break;
+#ifdef FMT_DEPRECATED_N_SPECIFIER
+  case 'n':
+#endif
+  case 'L':
+    result.locale = true;
+    break;
+  default:
+    eh.on_error("invalid type specifier");
+    break;
+  }
+  return result;
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR void handle_char_specs(const basic_format_specs<Char>* specs,
+                                     Handler&& handler) {
+  if (!specs) return handler.on_char();
+  if (specs->type && specs->type != 'c') return handler.on_int();
+  if (specs->align == align::numeric || specs->sign != sign::none || specs->alt)
+    handler.on_error("invalid format specifier for char");
+  handler.on_char();
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR void handle_cstring_type_spec(Char spec, Handler&& handler) {
+  if (spec == 0 || spec == 's')
+    handler.on_string();
+  else if (spec == 'p')
+    handler.on_pointer();
+  else
+    handler.on_error("invalid type specifier");
+}
+
+template <typename Char, typename ErrorHandler>
+FMT_CONSTEXPR void check_string_type_spec(Char spec, ErrorHandler&& eh) {
+  if (spec != 0 && spec != 's') eh.on_error("invalid type specifier");
+}
+
+template <typename Char, typename ErrorHandler>
+FMT_CONSTEXPR void check_pointer_type_spec(Char spec, ErrorHandler&& eh) {
+  if (spec != 0 && spec != 'p') eh.on_error("invalid type specifier");
+}
+
+template <typename ErrorHandler> class int_type_checker : private ErrorHandler {
+ public:
+  FMT_CONSTEXPR explicit int_type_checker(ErrorHandler eh) : ErrorHandler(eh) {}
+
+  FMT_CONSTEXPR void on_dec() {}
+  FMT_CONSTEXPR void on_hex() {}
+  FMT_CONSTEXPR void on_bin() {}
+  FMT_CONSTEXPR void on_oct() {}
+  FMT_CONSTEXPR void on_num() {}
+  FMT_CONSTEXPR void on_chr() {}
+
+  FMT_CONSTEXPR void on_error() {
+    ErrorHandler::on_error("invalid type specifier");
+  }
+};
+
+template <typename ErrorHandler>
+class char_specs_checker : public ErrorHandler {
+ private:
+  char type_;
+
+ public:
+  FMT_CONSTEXPR char_specs_checker(char type, ErrorHandler eh)
+      : ErrorHandler(eh), type_(type) {}
+
+  FMT_CONSTEXPR void on_int() {
+    handle_int_type_spec(type_, int_type_checker<ErrorHandler>(*this));
+  }
+  FMT_CONSTEXPR void on_char() {}
+};
+
+template <typename ErrorHandler>
+class cstring_type_checker : public ErrorHandler {
+ public:
+  FMT_CONSTEXPR explicit cstring_type_checker(ErrorHandler eh)
+      : ErrorHandler(eh) {}
+
+  FMT_CONSTEXPR void on_string() {}
+  FMT_CONSTEXPR void on_pointer() {}
+};
+
+template <typename OutputIt, typename Char>
+FMT_NOINLINE OutputIt fill(OutputIt it, size_t n, const fill_t<Char>& fill) {
+  auto fill_size = fill.size();
+  if (fill_size == 1) return std::fill_n(it, n, fill[0]);
+  for (size_t i = 0; i < n; ++i) it = std::copy_n(fill.data(), fill_size, it);
+  return it;
+}
+
+// Writes the output of f, padded according to format specifications in specs.
+// size: output size in code units.
+// width: output display width in (terminal) column positions.
+template <align::type align = align::left, typename OutputIt, typename Char,
+          typename F>
+inline OutputIt write_padded(OutputIt out,
+                             const basic_format_specs<Char>& specs, size_t size,
+                             size_t width, F&& f) {
+  static_assert(align == align::left || align == align::right, "");
+  unsigned spec_width = to_unsigned(specs.width);
+  size_t padding = spec_width > width ? spec_width - width : 0;
+  auto* shifts = align == align::left ? data::left_padding_shifts
+                                      : data::right_padding_shifts;
+  size_t left_padding = padding >> shifts[specs.align];
+  auto it = reserve(out, size + padding * specs.fill.size());
+  it = fill(it, left_padding, specs.fill);
+  it = f(it);
+  it = fill(it, padding - left_padding, specs.fill);
+  return base_iterator(out, it);
+}
+
+template <align::type align = align::left, typename OutputIt, typename Char,
+          typename F>
+inline OutputIt write_padded(OutputIt out,
+                             const basic_format_specs<Char>& specs, size_t size,
+                             F&& f) {
+  return write_padded<align>(out, specs, size, size, f);
+}
+
+template <typename Char, typename OutputIt>
+OutputIt write_bytes(OutputIt out, string_view bytes,
+                     const basic_format_specs<Char>& specs) {
+  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+  return write_padded(out, specs, bytes.size(), [bytes](iterator it) {
+    const char* data = bytes.data();
+    return copy_str<Char>(data, data + bytes.size(), it);
+  });
+}
+
+// Data for write_int that doesn't depend on output iterator type. It is used to
+// avoid template code bloat.
+template <typename Char> struct write_int_data {
+  size_t size;
+  size_t padding;
+
+  write_int_data(int num_digits, string_view prefix,
+                 const basic_format_specs<Char>& specs)
+      : size(prefix.size() + to_unsigned(num_digits)), padding(0) {
+    if (specs.align == align::numeric) {
+      auto width = to_unsigned(specs.width);
+      if (width > size) {
+        padding = width - size;
+        size = width;
+      }
+    } else if (specs.precision > num_digits) {
+      size = prefix.size() + to_unsigned(specs.precision);
+      padding = to_unsigned(specs.precision - num_digits);
+    }
+  }
+};
+
+// Writes an integer in the format
+//   <left-padding><prefix><numeric-padding><digits><right-padding>
+// where <digits> are written by f(it).
+template <typename OutputIt, typename Char, typename F>
+OutputIt write_int(OutputIt out, int num_digits, string_view prefix,
+                   const basic_format_specs<Char>& specs, F f) {
+  auto data = write_int_data<Char>(num_digits, prefix, specs);
+  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+  return write_padded<align::right>(out, specs, data.size, [=](iterator it) {
+    if (prefix.size() != 0)
+      it = copy_str<Char>(prefix.begin(), prefix.end(), it);
+    it = std::fill_n(it, data.padding, static_cast<Char>('0'));
+    return f(it);
+  });
+}
+
+template <typename StrChar, typename Char, typename OutputIt>
+OutputIt write(OutputIt out, basic_string_view<StrChar> s,
+               const basic_format_specs<Char>& specs) {
+  auto data = s.data();
+  auto size = s.size();
+  if (specs.precision >= 0 && to_unsigned(specs.precision) < size)
+    size = code_point_index(s, to_unsigned(specs.precision));
+  auto width = specs.width != 0
+                   ? count_code_points(basic_string_view<StrChar>(data, size))
+                   : 0;
+  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+  return write_padded(out, specs, size, width, [=](iterator it) {
+    return copy_str<Char>(data, data + size, it);
+  });
+}
+
+// The handle_int_type_spec handler that writes an integer.
+template <typename OutputIt, typename Char, typename UInt> struct int_writer {
+  OutputIt out;
+  locale_ref locale;
+  const basic_format_specs<Char>& specs;
+  UInt abs_value;
+  char prefix[4];
+  unsigned prefix_size;
+
+  using iterator =
+      remove_reference_t<decltype(reserve(std::declval<OutputIt&>(), 0))>;
+
+  string_view get_prefix() const { return string_view(prefix, prefix_size); }
+
+  template <typename Int>
+  int_writer(OutputIt output, locale_ref loc, Int value,
+             const basic_format_specs<Char>& s)
+      : out(output),
+        locale(loc),
+        specs(s),
+        abs_value(static_cast<UInt>(value)),
+        prefix_size(0) {
+    static_assert(std::is_same<uint32_or_64_or_128_t<Int>, UInt>::value, "");
+    if (is_negative(value)) {
+      prefix[0] = '-';
+      ++prefix_size;
+      abs_value = 0 - abs_value;
+    } else if (specs.sign != sign::none && specs.sign != sign::minus) {
+      prefix[0] = specs.sign == sign::plus ? '+' : ' ';
+      ++prefix_size;
+    }
+  }
+
+  void on_dec() {
+    auto num_digits = count_digits(abs_value);
+    out = write_int(
+        out, num_digits, get_prefix(), specs, [this, num_digits](iterator it) {
+          return format_decimal<Char>(it, abs_value, num_digits).end;
+        });
+  }
+
+  void on_hex() {
+    if (specs.alt) {
+      prefix[prefix_size++] = '0';
+      prefix[prefix_size++] = specs.type;
+    }
+    int num_digits = count_digits<4>(abs_value);
+    out = write_int(out, num_digits, get_prefix(), specs,
+                    [this, num_digits](iterator it) {
+                      return format_uint<4, Char>(it, abs_value, num_digits,
+                                                  specs.type != 'x');
+                    });
+  }
+
+  void on_bin() {
+    if (specs.alt) {
+      prefix[prefix_size++] = '0';
+      prefix[prefix_size++] = static_cast<char>(specs.type);
+    }
+    int num_digits = count_digits<1>(abs_value);
+    out = write_int(out, num_digits, get_prefix(), specs,
+                    [this, num_digits](iterator it) {
+                      return format_uint<1, Char>(it, abs_value, num_digits);
+                    });
+  }
+
+  void on_oct() {
+    int num_digits = count_digits<3>(abs_value);
+    if (specs.alt && specs.precision <= num_digits && abs_value != 0) {
+      // Octal prefix '0' is counted as a digit, so only add it if precision
+      // is not greater than the number of digits.
+      prefix[prefix_size++] = '0';
+    }
+    out = write_int(out, num_digits, get_prefix(), specs,
+                    [this, num_digits](iterator it) {
+                      return format_uint<3, Char>(it, abs_value, num_digits);
+                    });
+  }
+
+  enum { sep_size = 1 };
+
+  void on_num() {
+    std::string groups = grouping<Char>(locale);
+    if (groups.empty()) return on_dec();
+    auto sep = thousands_sep<Char>(locale);
+    if (!sep) return on_dec();
+    int num_digits = count_digits(abs_value);
+    int size = num_digits, n = num_digits;
+    std::string::const_iterator group = groups.cbegin();
+    while (group != groups.cend() && n > *group && *group > 0 &&
+           *group != max_value<char>()) {
+      size += sep_size;
+      n -= *group;
+      ++group;
+    }
+    if (group == groups.cend()) size += sep_size * ((n - 1) / groups.back());
+    char digits[40];
+    format_decimal(digits, abs_value, num_digits);
+    basic_memory_buffer<Char> buffer;
+    size += static_cast<int>(prefix_size);
+    const auto usize = to_unsigned(size);
+    buffer.resize(usize);
+    basic_string_view<Char> s(&sep, sep_size);
+    // Index of a decimal digit with the least significant digit having index 0.
+    int digit_index = 0;
+    group = groups.cbegin();
+    auto p = buffer.data() + size - 1;
+    for (int i = num_digits - 1; i > 0; --i) {
+      *p-- = static_cast<Char>(digits[i]);
+      if (*group <= 0 || ++digit_index % *group != 0 ||
+          *group == max_value<char>())
+        continue;
+      if (group + 1 != groups.cend()) {
+        digit_index = 0;
+        ++group;
+      }
+      std::uninitialized_copy(s.data(), s.data() + s.size(),
+                              make_checked(p, s.size()));
+      p -= s.size();
+    }
+    *p-- = static_cast<Char>(*digits);
+    if (prefix_size != 0) *p = static_cast<Char>('-');
+    auto data = buffer.data();
+    out = write_padded<align::right>(
+        out, specs, usize, usize,
+        [=](iterator it) { return copy_str<Char>(data, data + size, it); });
+  }
+
+  void on_chr() { *out++ = static_cast<Char>(abs_value); }
+
+  FMT_NORETURN void on_error() {
+    FMT_THROW(format_error("invalid type specifier"));
+  }
+};
+
+template <typename Char, typename OutputIt>
+OutputIt write_nonfinite(OutputIt out, bool isinf,
+                         const basic_format_specs<Char>& specs,
+                         const float_specs& fspecs) {
+  auto str =
+      isinf ? (fspecs.upper ? "INF" : "inf") : (fspecs.upper ? "NAN" : "nan");
+  constexpr size_t str_size = 3;
+  auto sign = fspecs.sign;
+  auto size = str_size + (sign ? 1 : 0);
+  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+  return write_padded(out, specs, size, [=](iterator it) {
+    if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+    return copy_str<Char>(str, str + str_size, it);
+  });
+}
+
+// A decimal floating-point number significand * pow(10, exp).
+struct big_decimal_fp {
+  const char* significand;
+  int significand_size;
+  int exponent;
+};
+
+inline int get_significand_size(const big_decimal_fp& fp) {
+  return fp.significand_size;
+}
+template <typename T>
+inline int get_significand_size(const dragonbox::decimal_fp<T>& fp) {
+  return count_digits(fp.significand);
+}
+
+template <typename Char, typename OutputIt>
+inline OutputIt write_significand(OutputIt out, const char* significand,
+                                  int& significand_size) {
+  return copy_str<Char>(significand, significand + significand_size, out);
+}
+template <typename Char, typename OutputIt, typename UInt>
+inline OutputIt write_significand(OutputIt out, UInt significand,
+                                  int significand_size) {
+  return format_decimal<Char>(out, significand, significand_size).end;
+}
+
+template <typename Char, typename UInt,
+          FMT_ENABLE_IF(std::is_integral<UInt>::value)>
+inline Char* write_significand(Char* out, UInt significand,
+                               int significand_size, int integral_size,
+                               Char decimal_point) {
+  if (!decimal_point)
+    return format_decimal(out, significand, significand_size).end;
+  auto end = format_decimal(out + 1, significand, significand_size).end;
+  if (integral_size == 1)
+    out[0] = out[1];
+  else
+    std::copy_n(out + 1, integral_size, out);
+  out[integral_size] = decimal_point;
+  return end;
+}
+
+template <typename OutputIt, typename UInt, typename Char,
+          FMT_ENABLE_IF(!std::is_pointer<remove_cvref_t<OutputIt>>::value)>
+inline OutputIt write_significand(OutputIt out, UInt significand,
+                                  int significand_size, int integral_size,
+                                  Char decimal_point) {
+  // Buffer is large enough to hold digits (digits10 + 1) and a decimal point.
+  Char buffer[digits10<UInt>() + 2];
+  auto end = write_significand(buffer, significand, significand_size,
+                               integral_size, decimal_point);
+  return detail::copy_str<Char>(buffer, end, out);
+}
+
+template <typename OutputIt, typename Char>
+inline OutputIt write_significand(OutputIt out, const char* significand,
+                                  int significand_size, int integral_size,
+                                  Char decimal_point) {
+  out = detail::copy_str<Char>(significand, significand + integral_size, out);
+  if (!decimal_point) return out;
+  *out++ = decimal_point;
+  return detail::copy_str<Char>(significand + integral_size,
+                                significand + significand_size, out);
+}
+
+template <typename OutputIt, typename DecimalFP, typename Char>
+OutputIt write_float(OutputIt out, const DecimalFP& fp,
+                     const basic_format_specs<Char>& specs, float_specs fspecs,
+                     Char decimal_point) {
+  auto significand = fp.significand;
+  int significand_size = get_significand_size(fp);
+  static const Char zero = static_cast<Char>('0');
+  auto sign = fspecs.sign;
+  size_t size = to_unsigned(significand_size) + (sign ? 1 : 0);
+  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+
+  int output_exp = fp.exponent + significand_size - 1;
+  auto use_exp_format = [=]() {
+    if (fspecs.format == float_format::exp) return true;
+    if (fspecs.format != float_format::general) return false;
+    // Use the fixed notation if the exponent is in [exp_lower, exp_upper),
+    // e.g. 0.0001 instead of 1e-04. Otherwise use the exponent notation.
+    const int exp_lower = -4, exp_upper = 16;
+    return output_exp < exp_lower ||
+           output_exp >= (fspecs.precision > 0 ? fspecs.precision : exp_upper);
+  };
+  if (use_exp_format()) {
+    int num_zeros = 0;
+    if (fspecs.showpoint) {
+      num_zeros = (std::max)(fspecs.precision - significand_size, 0);
+      size += to_unsigned(num_zeros);
+    } else if (significand_size == 1) {
+      decimal_point = Char();
+    }
+    auto abs_output_exp = output_exp >= 0 ? output_exp : -output_exp;
+    int exp_digits = 2;
+    if (abs_output_exp >= 100) exp_digits = abs_output_exp >= 1000 ? 4 : 3;
+
+    size += to_unsigned((decimal_point ? 1 : 0) + 2 + exp_digits);
+    char exp_char = fspecs.upper ? 'E' : 'e';
+    auto write = [=](iterator it) {
+      if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+      // Insert a decimal point after the first digit and add an exponent.
+      it = write_significand(it, significand, significand_size, 1,
+                             decimal_point);
+      if (num_zeros > 0) it = std::fill_n(it, num_zeros, zero);
+      *it++ = static_cast<Char>(exp_char);
+      return write_exponent<Char>(output_exp, it);
+    };
+    return specs.width > 0 ? write_padded<align::right>(out, specs, size, write)
+                           : base_iterator(out, write(reserve(out, size)));
+  }
+
+  int exp = fp.exponent + significand_size;
+  if (fp.exponent >= 0) {
+    // 1234e5 -> 123400000[.0+]
+    size += to_unsigned(fp.exponent);
+    int num_zeros = fspecs.precision - exp;
+#ifdef FMT_FUZZ
+    if (num_zeros > 5000)
+      throw std::runtime_error("fuzz mode - avoiding excessive cpu use");
+#endif
+    if (fspecs.showpoint) {
+      if (num_zeros <= 0 && fspecs.format != float_format::fixed) num_zeros = 1;
+      if (num_zeros > 0) size += to_unsigned(num_zeros);
+    }
+    return write_padded<align::right>(out, specs, size, [&](iterator it) {
+      if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+      it = write_significand<Char>(it, significand, significand_size);
+      it = std::fill_n(it, fp.exponent, zero);
+      if (!fspecs.showpoint) return it;
+      *it++ = decimal_point;
+      return num_zeros > 0 ? std::fill_n(it, num_zeros, zero) : it;
+    });
+  } else if (exp > 0) {
+    // 1234e-2 -> 12.34[0+]
+    int num_zeros = fspecs.showpoint ? fspecs.precision - significand_size : 0;
+    size += 1 + to_unsigned(num_zeros > 0 ? num_zeros : 0);
+    return write_padded<align::right>(out, specs, size, [&](iterator it) {
+      if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+      it = write_significand(it, significand, significand_size, exp,
+                             decimal_point);
+      return num_zeros > 0 ? std::fill_n(it, num_zeros, zero) : it;
+    });
+  }
+  // 1234e-6 -> 0.001234
+  int num_zeros = -exp;
+  if (significand_size == 0 && fspecs.precision >= 0 &&
+      fspecs.precision < num_zeros) {
+    num_zeros = fspecs.precision;
+  }
+  size += 2 + to_unsigned(num_zeros);
+  return write_padded<align::right>(out, specs, size, [&](iterator it) {
+    if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+    *it++ = zero;
+    if (num_zeros == 0 && significand_size == 0 && !fspecs.showpoint) return it;
+    *it++ = decimal_point;
+    it = std::fill_n(it, num_zeros, zero);
+    return write_significand<Char>(it, significand, significand_size);
+  });
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(std::is_floating_point<T>::value)>
+OutputIt write(OutputIt out, T value, basic_format_specs<Char> specs,
+               locale_ref loc = {}) {
+  if (const_check(!is_supported_floating_point(value))) return out;
+  float_specs fspecs = parse_float_type_spec(specs);
+  fspecs.sign = specs.sign;
+  if (std::signbit(value)) {  // value < 0 is false for NaN so use signbit.
+    fspecs.sign = sign::minus;
+    value = -value;
+  } else if (fspecs.sign == sign::minus) {
+    fspecs.sign = sign::none;
+  }
+
+  if (!std::isfinite(value))
+    return write_nonfinite(out, std::isinf(value), specs, fspecs);
+
+  if (specs.align == align::numeric && fspecs.sign) {
+    auto it = reserve(out, 1);
+    *it++ = static_cast<Char>(data::signs[fspecs.sign]);
+    out = base_iterator(out, it);
+    fspecs.sign = sign::none;
+    if (specs.width != 0) --specs.width;
+  }
+
+  memory_buffer buffer;
+  if (fspecs.format == float_format::hex) {
+    if (fspecs.sign) buffer.push_back(data::signs[fspecs.sign]);
+    snprintf_float(promote_float(value), specs.precision, fspecs, buffer);
+    return write_bytes(out, {buffer.data(), buffer.size()}, specs);
+  }
+  int precision = specs.precision >= 0 || !specs.type ? specs.precision : 6;
+  if (fspecs.format == float_format::exp) {
+    if (precision == max_value<int>())
+      FMT_THROW(format_error("number is too big"));
+    else
+      ++precision;
+  }
+  if (const_check(std::is_same<T, float>())) fspecs.binary32 = true;
+  fspecs.use_grisu = is_fast_float<T>();
+  int exp = format_float(promote_float(value), precision, fspecs, buffer);
+  fspecs.precision = precision;
+  Char point =
+      fspecs.locale ? decimal_point<Char>(loc) : static_cast<Char>('.');
+  auto fp = big_decimal_fp{buffer.data(), static_cast<int>(buffer.size()), exp};
+  return write_float(out, fp, specs, fspecs, point);
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(is_fast_float<T>::value)>
+OutputIt write(OutputIt out, T value) {
+  if (const_check(!is_supported_floating_point(value))) return out;
+
+  using floaty = conditional_t<std::is_same<T, long double>::value, double, T>;
+  using uint = typename dragonbox::float_info<floaty>::carrier_uint;
+  auto bits = bit_cast<uint>(value);
+
+  auto fspecs = float_specs();
+  auto sign_bit = bits & (uint(1) << (num_bits<uint>() - 1));
+  if (sign_bit != 0) {
+    fspecs.sign = sign::minus;
+    value = -value;
+  }
+
+  static const auto specs = basic_format_specs<Char>();
+  uint mask = exponent_mask<floaty>();
+  if ((bits & mask) == mask)
+    return write_nonfinite(out, std::isinf(value), specs, fspecs);
+
+  auto dec = dragonbox::to_decimal(static_cast<floaty>(value));
+  return write_float(out, dec, specs, fspecs, static_cast<Char>('.'));
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(std::is_floating_point<T>::value &&
+                        !is_fast_float<T>::value)>
+inline OutputIt write(OutputIt out, T value) {
+  return write(out, value, basic_format_specs<Char>());
+}
+
+template <typename Char, typename OutputIt>
+OutputIt write_char(OutputIt out, Char value,
+                    const basic_format_specs<Char>& specs) {
+  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+  return write_padded(out, specs, 1, [=](iterator it) {
+    *it++ = value;
+    return it;
+  });
+}
+
+template <typename Char, typename OutputIt, typename UIntPtr>
+OutputIt write_ptr(OutputIt out, UIntPtr value,
+                   const basic_format_specs<Char>* specs) {
+  int num_digits = count_digits<4>(value);
+  auto size = to_unsigned(num_digits) + size_t(2);
+  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+  auto write = [=](iterator it) {
+    *it++ = static_cast<Char>('0');
+    *it++ = static_cast<Char>('x');
+    return format_uint<4, Char>(it, value, num_digits);
+  };
+  return specs ? write_padded<align::right>(out, *specs, size, write)
+               : base_iterator(out, write(reserve(out, size)));
+}
+
+template <typename T> struct is_integral : std::is_integral<T> {};
+template <> struct is_integral<int128_t> : std::true_type {};
+template <> struct is_integral<uint128_t> : std::true_type {};
+
+template <typename Char, typename OutputIt>
+OutputIt write(OutputIt out, monostate) {
+  FMT_ASSERT(false, "");
+  return out;
+}
+
+template <typename Char, typename OutputIt,
+          FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+OutputIt write(OutputIt out, string_view value) {
+  auto it = reserve(out, value.size());
+  it = copy_str<Char>(value.begin(), value.end(), it);
+  return base_iterator(out, it);
+}
+
+template <typename Char, typename OutputIt>
+OutputIt write(OutputIt out, basic_string_view<Char> value) {
+  auto it = reserve(out, value.size());
+  it = std::copy(value.begin(), value.end(), it);
+  return base_iterator(out, it);
+}
+
+template <typename Char>
+buffer_appender<Char> write(buffer_appender<Char> out,
+                            basic_string_view<Char> value) {
+  get_container(out).append(value.begin(), value.end());
+  return out;
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(is_integral<T>::value &&
+                        !std::is_same<T, bool>::value &&
+                        !std::is_same<T, Char>::value)>
+OutputIt write(OutputIt out, T value) {
+  auto abs_value = static_cast<uint32_or_64_or_128_t<T>>(value);
+  bool negative = is_negative(value);
+  // Don't do -abs_value since it trips unsigned-integer-overflow sanitizer.
+  if (negative) abs_value = ~abs_value + 1;
+  int num_digits = count_digits(abs_value);
+  auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
+  auto it = reserve(out, size);
+  if (auto ptr = to_pointer<Char>(it, size)) {
+    if (negative) *ptr++ = static_cast<Char>('-');
+    format_decimal<Char>(ptr, abs_value, num_digits);
+    return out;
+  }
+  if (negative) *it++ = static_cast<Char>('-');
+  it = format_decimal<Char>(it, abs_value, num_digits).end;
+  return base_iterator(out, it);
+}
+
+template <typename Char, typename OutputIt>
+OutputIt write(OutputIt out, bool value) {
+  return write<Char>(out, string_view(value ? "true" : "false"));
+}
+
+template <typename Char, typename OutputIt>
+OutputIt write(OutputIt out, Char value) {
+  auto it = reserve(out, 1);
+  *it++ = value;
+  return base_iterator(out, it);
+}
+
+template <typename Char, typename OutputIt>
+OutputIt write(OutputIt out, const Char* value) {
+  if (!value) {
+    FMT_THROW(format_error("string pointer is null"));
+  } else {
+    auto length = std::char_traits<Char>::length(value);
+    out = write(out, basic_string_view<Char>(value, length));
+  }
+  return out;
+}
+
+template <typename Char, typename OutputIt>
+OutputIt write(OutputIt out, const void* value) {
+  return write_ptr<Char>(out, to_uintptr(value), nullptr);
+}
+
+template <typename Char, typename OutputIt, typename T>
+auto write(OutputIt out, const T& value) -> typename std::enable_if<
+    mapped_type_constant<T, basic_format_context<OutputIt, Char>>::value ==
+        type::custom_type,
+    OutputIt>::type {
+  using context_type = basic_format_context<OutputIt, Char>;
+  using formatter_type =
+      conditional_t<has_formatter<T, context_type>::value,
+                    typename context_type::template formatter_type<T>,
+                    fallback_formatter<T, Char>>;
+  context_type ctx(out, {}, {});
+  return formatter_type().format(value, ctx);
+}
+
+// An argument visitor that formats the argument and writes it via the output
+// iterator. It's a class and not a generic lambda for compatibility with C++11.
+template <typename OutputIt, typename Char> struct default_arg_formatter {
+  using context = basic_format_context<OutputIt, Char>;
+
+  OutputIt out;
+  basic_format_args<context> args;
+  locale_ref loc;
+
+  template <typename T> OutputIt operator()(T value) {
+    return write<Char>(out, value);
+  }
+
+  OutputIt operator()(typename basic_format_arg<context>::handle handle) {
+    basic_format_parse_context<Char> parse_ctx({});
+    basic_format_context<OutputIt, Char> format_ctx(out, args, loc);
+    handle.format(parse_ctx, format_ctx);
+    return format_ctx.out();
+  }
+};
+
+template <typename OutputIt, typename Char,
+          typename ErrorHandler = error_handler>
+class arg_formatter_base {
+ public:
+  using iterator = OutputIt;
+  using char_type = Char;
+  using format_specs = basic_format_specs<Char>;
+
+ private:
+  iterator out_;
+  locale_ref locale_;
+  format_specs* specs_;
+
+  // Attempts to reserve space for n extra characters in the output range.
+  // Returns a pointer to the reserved range or a reference to out_.
+  auto reserve(size_t n) -> decltype(detail::reserve(out_, n)) {
+    return detail::reserve(out_, n);
+  }
+
+  using reserve_iterator = remove_reference_t<decltype(
+      detail::reserve(std::declval<iterator&>(), 0))>;
+
+  template <typename T> void write_int(T value, const format_specs& spec) {
+    using uint_type = uint32_or_64_or_128_t<T>;
+    int_writer<iterator, Char, uint_type> w(out_, locale_, value, spec);
+    handle_int_type_spec(spec.type, w);
+    out_ = w.out;
+  }
+
+  void write(char value) {
+    auto&& it = reserve(1);
+    *it++ = value;
+  }
+
+  template <typename Ch, FMT_ENABLE_IF(std::is_same<Ch, Char>::value)>
+  void write(Ch value) {
+    out_ = detail::write<Char>(out_, value);
+  }
+
+  void write(string_view value) {
+    auto&& it = reserve(value.size());
+    it = copy_str<Char>(value.begin(), value.end(), it);
+  }
+  void write(wstring_view value) {
+    static_assert(std::is_same<Char, wchar_t>::value, "");
+    auto&& it = reserve(value.size());
+    it = std::copy(value.begin(), value.end(), it);
+  }
+
+  template <typename Ch>
+  void write(const Ch* s, size_t size, const format_specs& specs) {
+    auto width = specs.width != 0
+                     ? count_code_points(basic_string_view<Ch>(s, size))
+                     : 0;
+    out_ = write_padded(out_, specs, size, width, [=](reserve_iterator it) {
+      return copy_str<Char>(s, s + size, it);
+    });
+  }
+
+  template <typename Ch>
+  void write(basic_string_view<Ch> s, const format_specs& specs = {}) {
+    out_ = detail::write(out_, s, specs);
+  }
+
+  void write_pointer(const void* p) {
+    out_ = write_ptr<char_type>(out_, to_uintptr(p), specs_);
+  }
+
+  struct char_spec_handler : ErrorHandler {
+    arg_formatter_base& formatter;
+    Char value;
+
+    char_spec_handler(arg_formatter_base& f, Char val)
+        : formatter(f), value(val) {}
+
+    void on_int() {
+      // char is only formatted as int if there are specs.
+      formatter.write_int(static_cast<int>(value), *formatter.specs_);
+    }
+    void on_char() {
+      if (formatter.specs_)
+        formatter.out_ = write_char(formatter.out_, value, *formatter.specs_);
+      else
+        formatter.write(value);
+    }
+  };
+
+  struct cstring_spec_handler : error_handler {
+    arg_formatter_base& formatter;
+    const Char* value;
+
+    cstring_spec_handler(arg_formatter_base& f, const Char* val)
+        : formatter(f), value(val) {}
+
+    void on_string() { formatter.write(value); }
+    void on_pointer() { formatter.write_pointer(value); }
+  };
+
+ protected:
+  iterator out() { return out_; }
+  format_specs* specs() { return specs_; }
+
+  void write(bool value) {
+    if (specs_)
+      write(string_view(value ? "true" : "false"), *specs_);
+    else
+      out_ = detail::write<Char>(out_, value);
+  }
+
+  void write(const Char* value) {
+    if (!value) {
+      FMT_THROW(format_error("string pointer is null"));
+    } else {
+      auto length = std::char_traits<char_type>::length(value);
+      basic_string_view<char_type> sv(value, length);
+      specs_ ? write(sv, *specs_) : write(sv);
+    }
+  }
+
+ public:
+  arg_formatter_base(OutputIt out, format_specs* s, locale_ref loc)
+      : out_(out), locale_(loc), specs_(s) {}
+
+  iterator operator()(monostate) {
+    FMT_ASSERT(false, "invalid argument type");
+    return out_;
+  }
+
+  template <typename T, FMT_ENABLE_IF(is_integral<T>::value)>
+  FMT_INLINE iterator operator()(T value) {
+    if (specs_)
+      write_int(value, *specs_);
+    else
+      out_ = detail::write<Char>(out_, value);
+    return out_;
+  }
+
+  iterator operator()(Char value) {
+    handle_char_specs(specs_,
+                      char_spec_handler(*this, static_cast<Char>(value)));
+    return out_;
+  }
+
+  iterator operator()(bool value) {
+    if (specs_ && specs_->type) return (*this)(value ? 1 : 0);
+    write(value != 0);
+    return out_;
+  }
+
+  template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
+  iterator operator()(T value) {
+    auto specs = specs_ ? *specs_ : format_specs();
+    if (const_check(is_supported_floating_point(value)))
+      out_ = detail::write(out_, value, specs, locale_);
+    else
+      FMT_ASSERT(false, "unsupported float argument type");
+    return out_;
+  }
+
+  iterator operator()(const Char* value) {
+    if (!specs_) return write(value), out_;
+    handle_cstring_type_spec(specs_->type, cstring_spec_handler(*this, value));
+    return out_;
+  }
+
+  iterator operator()(basic_string_view<Char> value) {
+    if (specs_) {
+      check_string_type_spec(specs_->type, error_handler());
+      write(value, *specs_);
+    } else {
+      write(value);
+    }
+    return out_;
+  }
+
+  iterator operator()(const void* value) {
+    if (specs_) check_pointer_type_spec(specs_->type, error_handler());
+    write_pointer(value);
+    return out_;
+  }
+};
+
+/** The default argument formatter. */
+template <typename OutputIt, typename Char>
+class arg_formatter : public arg_formatter_base<OutputIt, Char> {
+ private:
+  using char_type = Char;
+  using base = arg_formatter_base<OutputIt, Char>;
+  using context_type = basic_format_context<OutputIt, Char>;
+
+  context_type& ctx_;
+  basic_format_parse_context<char_type>* parse_ctx_;
+  const Char* ptr_;
+
+ public:
+  using iterator = typename base::iterator;
+  using format_specs = typename base::format_specs;
+
+  /**
+    \rst
+    Constructs an argument formatter object.
+    *ctx* is a reference to the formatting context,
+    *specs* contains format specifier information for standard argument types.
+    \endrst
+   */
+  explicit arg_formatter(
+      context_type& ctx,
+      basic_format_parse_context<char_type>* parse_ctx = nullptr,
+      format_specs* specs = nullptr, const Char* ptr = nullptr)
+      : base(ctx.out(), specs, ctx.locale()),
+        ctx_(ctx),
+        parse_ctx_(parse_ctx),
+        ptr_(ptr) {}
+
+  using base::operator();
+
+  /** Formats an argument of a user-defined type. */
+  iterator operator()(typename basic_format_arg<context_type>::handle handle) {
+    if (ptr_) advance_to(*parse_ctx_, ptr_);
+    handle.format(*parse_ctx_, ctx_);
+    return ctx_.out();
+  }
+};
+
+template <typename Char> FMT_CONSTEXPR bool is_name_start(Char c) {
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || '_' == c;
+}
+
+// Parses the range [begin, end) as an unsigned integer. This function assumes
+// that the range is non-empty and the first character is a digit.
+template <typename Char, typename ErrorHandler>
+FMT_CONSTEXPR int parse_nonnegative_int(const Char*& begin, const Char* end,
+                                        ErrorHandler&& eh) {
+  FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', "");
+  unsigned value = 0;
+  // Convert to unsigned to prevent a warning.
+  constexpr unsigned max_int = max_value<int>();
+  unsigned big = max_int / 10;
+  do {
+    // Check for overflow.
+    if (value > big) {
+      value = max_int + 1;
+      break;
+    }
+    value = value * 10 + unsigned(*begin - '0');
+    ++begin;
+  } while (begin != end && '0' <= *begin && *begin <= '9');
+  if (value > max_int) eh.on_error("number is too big");
+  return static_cast<int>(value);
+}
+
+template <typename Context> class custom_formatter {
+ private:
+  using char_type = typename Context::char_type;
+
+  basic_format_parse_context<char_type>& parse_ctx_;
+  Context& ctx_;
+
+ public:
+  explicit custom_formatter(basic_format_parse_context<char_type>& parse_ctx,
+                            Context& ctx)
+      : parse_ctx_(parse_ctx), ctx_(ctx) {}
+
+  void operator()(typename basic_format_arg<Context>::handle h) const {
+    h.format(parse_ctx_, ctx_);
+  }
+
+  template <typename T> void operator()(T) const {}
+};
+
+template <typename T>
+using is_integer =
+    bool_constant<is_integral<T>::value && !std::is_same<T, bool>::value &&
+                  !std::is_same<T, char>::value &&
+                  !std::is_same<T, wchar_t>::value>;
+
+template <typename ErrorHandler> class width_checker {
+ public:
+  explicit FMT_CONSTEXPR width_checker(ErrorHandler& eh) : handler_(eh) {}
+
+  template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
+  FMT_CONSTEXPR unsigned long long operator()(T value) {
+    if (is_negative(value)) handler_.on_error("negative width");
+    return static_cast<unsigned long long>(value);
+  }
+
+  template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
+  FMT_CONSTEXPR unsigned long long operator()(T) {
+    handler_.on_error("width is not integer");
+    return 0;
+  }
+
+ private:
+  ErrorHandler& handler_;
+};
+
+template <typename ErrorHandler> class precision_checker {
+ public:
+  explicit FMT_CONSTEXPR precision_checker(ErrorHandler& eh) : handler_(eh) {}
+
+  template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
+  FMT_CONSTEXPR unsigned long long operator()(T value) {
+    if (is_negative(value)) handler_.on_error("negative precision");
+    return static_cast<unsigned long long>(value);
+  }
+
+  template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
+  FMT_CONSTEXPR unsigned long long operator()(T) {
+    handler_.on_error("precision is not integer");
+    return 0;
+  }
+
+ private:
+  ErrorHandler& handler_;
+};
+
+// A format specifier handler that sets fields in basic_format_specs.
+template <typename Char> class specs_setter {
+ public:
+  explicit FMT_CONSTEXPR specs_setter(basic_format_specs<Char>& specs)
+      : specs_(specs) {}
+
+  FMT_CONSTEXPR specs_setter(const specs_setter& other)
+      : specs_(other.specs_) {}
+
+  FMT_CONSTEXPR void on_align(align_t align) { specs_.align = align; }
+  FMT_CONSTEXPR void on_fill(basic_string_view<Char> fill) {
+    specs_.fill = fill;
+  }
+  FMT_CONSTEXPR void on_plus() { specs_.sign = sign::plus; }
+  FMT_CONSTEXPR void on_minus() { specs_.sign = sign::minus; }
+  FMT_CONSTEXPR void on_space() { specs_.sign = sign::space; }
+  FMT_CONSTEXPR void on_hash() { specs_.alt = true; }
+
+  FMT_CONSTEXPR void on_zero() {
+    specs_.align = align::numeric;
+    specs_.fill[0] = Char('0');
+  }
+
+  FMT_CONSTEXPR void on_width(int width) { specs_.width = width; }
+  FMT_CONSTEXPR void on_precision(int precision) {
+    specs_.precision = precision;
+  }
+  FMT_CONSTEXPR void end_precision() {}
+
+  FMT_CONSTEXPR void on_type(Char type) {
+    specs_.type = static_cast<char>(type);
+  }
+
+ protected:
+  basic_format_specs<Char>& specs_;
+};
+
+template <typename ErrorHandler> class numeric_specs_checker {
+ public:
+  FMT_CONSTEXPR numeric_specs_checker(ErrorHandler& eh, detail::type arg_type)
+      : error_handler_(eh), arg_type_(arg_type) {}
+
+  FMT_CONSTEXPR void require_numeric_argument() {
+    if (!is_arithmetic_type(arg_type_))
+      error_handler_.on_error("format specifier requires numeric argument");
+  }
+
+  FMT_CONSTEXPR void check_sign() {
+    require_numeric_argument();
+    if (is_integral_type(arg_type_) && arg_type_ != type::int_type &&
+        arg_type_ != type::long_long_type && arg_type_ != type::char_type) {
+      error_handler_.on_error("format specifier requires signed argument");
+    }
+  }
+
+  FMT_CONSTEXPR void check_precision() {
+    if (is_integral_type(arg_type_) || arg_type_ == type::pointer_type)
+      error_handler_.on_error("precision not allowed for this argument type");
+  }
+
+ private:
+  ErrorHandler& error_handler_;
+  detail::type arg_type_;
+};
+
+// A format specifier handler that checks if specifiers are consistent with the
+// argument type.
+template <typename Handler> class specs_checker : public Handler {
+ private:
+  numeric_specs_checker<Handler> checker_;
+
+  // Suppress an MSVC warning about using this in initializer list.
+  FMT_CONSTEXPR Handler& error_handler() { return *this; }
+
+ public:
+  FMT_CONSTEXPR specs_checker(const Handler& handler, detail::type arg_type)
+      : Handler(handler), checker_(error_handler(), arg_type) {}
+
+  FMT_CONSTEXPR specs_checker(const specs_checker& other)
+      : Handler(other), checker_(error_handler(), other.arg_type_) {}
+
+  FMT_CONSTEXPR void on_align(align_t align) {
+    if (align == align::numeric) checker_.require_numeric_argument();
+    Handler::on_align(align);
+  }
+
+  FMT_CONSTEXPR void on_plus() {
+    checker_.check_sign();
+    Handler::on_plus();
+  }
+
+  FMT_CONSTEXPR void on_minus() {
+    checker_.check_sign();
+    Handler::on_minus();
+  }
+
+  FMT_CONSTEXPR void on_space() {
+    checker_.check_sign();
+    Handler::on_space();
+  }
+
+  FMT_CONSTEXPR void on_hash() {
+    checker_.require_numeric_argument();
+    Handler::on_hash();
+  }
+
+  FMT_CONSTEXPR void on_zero() {
+    checker_.require_numeric_argument();
+    Handler::on_zero();
+  }
+
+  FMT_CONSTEXPR void end_precision() { checker_.check_precision(); }
+};
+
+template <template <typename> class Handler, typename FormatArg,
+          typename ErrorHandler>
+FMT_CONSTEXPR int get_dynamic_spec(FormatArg arg, ErrorHandler eh) {
+  unsigned long long value = visit_format_arg(Handler<ErrorHandler>(eh), arg);
+  if (value > to_unsigned(max_value<int>())) eh.on_error("number is too big");
+  return static_cast<int>(value);
+}
+
+struct auto_id {};
+
+template <typename Context, typename ID>
+FMT_CONSTEXPR typename Context::format_arg get_arg(Context& ctx, ID id) {
+  auto arg = ctx.arg(id);
+  if (!arg) ctx.on_error("argument not found");
+  return arg;
+}
+
+// The standard format specifier handler with checking.
+template <typename ParseContext, typename Context>
+class specs_handler : public specs_setter<typename Context::char_type> {
+ public:
+  using char_type = typename Context::char_type;
+
+  FMT_CONSTEXPR specs_handler(basic_format_specs<char_type>& specs,
+                              ParseContext& parse_ctx, Context& ctx)
+      : specs_setter<char_type>(specs),
+        parse_context_(parse_ctx),
+        context_(ctx) {}
+
+  template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
+    this->specs_.width = get_dynamic_spec<width_checker>(
+        get_arg(arg_id), context_.error_handler());
+  }
+
+  template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
+    this->specs_.precision = get_dynamic_spec<precision_checker>(
+        get_arg(arg_id), context_.error_handler());
+  }
+
+  void on_error(const char* message) { context_.on_error(message); }
+
+ private:
+  // This is only needed for compatibility with gcc 4.4.
+  using format_arg = typename Context::format_arg;
+
+  FMT_CONSTEXPR format_arg get_arg(auto_id) {
+    return detail::get_arg(context_, parse_context_.next_arg_id());
+  }
+
+  FMT_CONSTEXPR format_arg get_arg(int arg_id) {
+    parse_context_.check_arg_id(arg_id);
+    return detail::get_arg(context_, arg_id);
+  }
+
+  FMT_CONSTEXPR format_arg get_arg(basic_string_view<char_type> arg_id) {
+    parse_context_.check_arg_id(arg_id);
+    return detail::get_arg(context_, arg_id);
+  }
+
+  ParseContext& parse_context_;
+  Context& context_;
+};
+
+enum class arg_id_kind { none, index, name };
+
+// An argument reference.
+template <typename Char> struct arg_ref {
+  FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {}
+
+  FMT_CONSTEXPR explicit arg_ref(int index)
+      : kind(arg_id_kind::index), val(index) {}
+  FMT_CONSTEXPR explicit arg_ref(basic_string_view<Char> name)
+      : kind(arg_id_kind::name), val(name) {}
+
+  FMT_CONSTEXPR arg_ref& operator=(int idx) {
+    kind = arg_id_kind::index;
+    val.index = idx;
+    return *this;
+  }
+
+  arg_id_kind kind;
+  union value {
+    FMT_CONSTEXPR value(int id = 0) : index{id} {}
+    FMT_CONSTEXPR value(basic_string_view<Char> n) : name(n) {}
+
+    int index;
+    basic_string_view<Char> name;
+  } val;
+};
+
+// Format specifiers with width and precision resolved at formatting rather
+// than parsing time to allow re-using the same parsed specifiers with
+// different sets of arguments (precompilation of format strings).
+template <typename Char>
+struct dynamic_format_specs : basic_format_specs<Char> {
+  arg_ref<Char> width_ref;
+  arg_ref<Char> precision_ref;
+};
+
+// Format spec handler that saves references to arguments representing dynamic
+// width and precision to be resolved at formatting time.
+template <typename ParseContext>
+class dynamic_specs_handler
+    : public specs_setter<typename ParseContext::char_type> {
+ public:
+  using char_type = typename ParseContext::char_type;
+
+  FMT_CONSTEXPR dynamic_specs_handler(dynamic_format_specs<char_type>& specs,
+                                      ParseContext& ctx)
+      : specs_setter<char_type>(specs), specs_(specs), context_(ctx) {}
+
+  FMT_CONSTEXPR dynamic_specs_handler(const dynamic_specs_handler& other)
+      : specs_setter<char_type>(other),
+        specs_(other.specs_),
+        context_(other.context_) {}
+
+  template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
+    specs_.width_ref = make_arg_ref(arg_id);
+  }
+
+  template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
+    specs_.precision_ref = make_arg_ref(arg_id);
+  }
+
+  FMT_CONSTEXPR void on_error(const char* message) {
+    context_.on_error(message);
+  }
+
+ private:
+  using arg_ref_type = arg_ref<char_type>;
+
+  FMT_CONSTEXPR arg_ref_type make_arg_ref(int arg_id) {
+    context_.check_arg_id(arg_id);
+    return arg_ref_type(arg_id);
+  }
+
+  FMT_CONSTEXPR arg_ref_type make_arg_ref(auto_id) {
+    return arg_ref_type(context_.next_arg_id());
+  }
+
+  FMT_CONSTEXPR arg_ref_type make_arg_ref(basic_string_view<char_type> arg_id) {
+    context_.check_arg_id(arg_id);
+    basic_string_view<char_type> format_str(
+        context_.begin(), to_unsigned(context_.end() - context_.begin()));
+    return arg_ref_type(arg_id);
+  }
+
+  dynamic_format_specs<char_type>& specs_;
+  ParseContext& context_;
+};
+
+template <typename Char, typename IDHandler>
+FMT_CONSTEXPR const Char* parse_arg_id(const Char* begin, const Char* end,
+                                       IDHandler&& handler) {
+  FMT_ASSERT(begin != end, "");
+  Char c = *begin;
+  if (c == '}' || c == ':') {
+    handler();
+    return begin;
+  }
+  if (c >= '0' && c <= '9') {
+    int index = 0;
+    if (c != '0')
+      index = parse_nonnegative_int(begin, end, handler);
+    else
+      ++begin;
+    if (begin == end || (*begin != '}' && *begin != ':'))
+      handler.on_error("invalid format string");
+    else
+      handler(index);
+    return begin;
+  }
+  if (!is_name_start(c)) {
+    handler.on_error("invalid format string");
+    return begin;
+  }
+  auto it = begin;
+  do {
+    ++it;
+  } while (it != end && (is_name_start(c = *it) || ('0' <= c && c <= '9')));
+  handler(basic_string_view<Char>(begin, to_unsigned(it - begin)));
+  return it;
+}
+
+// Adapts SpecHandler to IDHandler API for dynamic width.
+template <typename SpecHandler, typename Char> struct width_adapter {
+  explicit FMT_CONSTEXPR width_adapter(SpecHandler& h) : handler(h) {}
+
+  FMT_CONSTEXPR void operator()() { handler.on_dynamic_width(auto_id()); }
+  FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_width(id); }
+  FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
+    handler.on_dynamic_width(id);
+  }
+
+  FMT_CONSTEXPR void on_error(const char* message) {
+    handler.on_error(message);
+  }
+
+  SpecHandler& handler;
+};
+
+// Adapts SpecHandler to IDHandler API for dynamic precision.
+template <typename SpecHandler, typename Char> struct precision_adapter {
+  explicit FMT_CONSTEXPR precision_adapter(SpecHandler& h) : handler(h) {}
+
+  FMT_CONSTEXPR void operator()() { handler.on_dynamic_precision(auto_id()); }
+  FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_precision(id); }
+  FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
+    handler.on_dynamic_precision(id);
+  }
+
+  FMT_CONSTEXPR void on_error(const char* message) {
+    handler.on_error(message);
+  }
+
+  SpecHandler& handler;
+};
+
+template <typename Char>
+FMT_CONSTEXPR int code_point_length(const Char* begin) {
+  if (const_check(sizeof(Char) != 1)) return 1;
+  constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                              0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
+  int len = lengths[static_cast<unsigned char>(*begin) >> 3];
+
+  // Compute the pointer to the next character early so that the next
+  // iteration can start working on the next character. Neither Clang
+  // nor GCC figure out this reordering on their own.
+  return len + !len;
+}
+
+template <typename Char> constexpr bool is_ascii_letter(Char c) {
+  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+// Converts a character to ASCII. Returns a number > 127 on conversion failure.
+template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value)>
+constexpr Char to_ascii(Char value) {
+  return value;
+}
+template <typename Char, FMT_ENABLE_IF(std::is_enum<Char>::value)>
+constexpr typename std::underlying_type<Char>::type to_ascii(Char value) {
+  return value;
+}
+
+// Parses fill and alignment.
+template <typename Char, typename Handler>
+FMT_CONSTEXPR const Char* parse_align(const Char* begin, const Char* end,
+                                      Handler&& handler) {
+  FMT_ASSERT(begin != end, "");
+  auto align = align::none;
+  auto p = begin + code_point_length(begin);
+  if (p >= end) p = begin;
+  for (;;) {
+    switch (to_ascii(*p)) {
+    case '<':
+      align = align::left;
+      break;
+    case '>':
+      align = align::right;
+      break;
+#if FMT_DEPRECATED_NUMERIC_ALIGN
+    case '=':
+      align = align::numeric;
+      break;
+#endif
+    case '^':
+      align = align::center;
+      break;
+    }
+    if (align != align::none) {
+      if (p != begin) {
+        auto c = *begin;
+        if (c == '{')
+          return handler.on_error("invalid fill character '{'"), begin;
+        handler.on_fill(basic_string_view<Char>(begin, to_unsigned(p - begin)));
+        begin = p + 1;
+      } else
+        ++begin;
+      handler.on_align(align);
+      break;
+    } else if (p == begin) {
+      break;
+    }
+    p = begin;
+  }
+  return begin;
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR const Char* parse_width(const Char* begin, const Char* end,
+                                      Handler&& handler) {
+  FMT_ASSERT(begin != end, "");
+  if ('0' <= *begin && *begin <= '9') {
+    handler.on_width(parse_nonnegative_int(begin, end, handler));
+  } else if (*begin == '{') {
+    ++begin;
+    if (begin != end)
+      begin = parse_arg_id(begin, end, width_adapter<Handler, Char>(handler));
+    if (begin == end || *begin != '}')
+      return handler.on_error("invalid format string"), begin;
+    ++begin;
+  }
+  return begin;
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR const Char* parse_precision(const Char* begin, const Char* end,
+                                          Handler&& handler) {
+  ++begin;
+  auto c = begin != end ? *begin : Char();
+  if ('0' <= c && c <= '9') {
+    handler.on_precision(parse_nonnegative_int(begin, end, handler));
+  } else if (c == '{') {
+    ++begin;
+    if (begin != end) {
+      begin =
+          parse_arg_id(begin, end, precision_adapter<Handler, Char>(handler));
+    }
+    if (begin == end || *begin++ != '}')
+      return handler.on_error("invalid format string"), begin;
+  } else {
+    return handler.on_error("missing precision specifier"), begin;
+  }
+  handler.end_precision();
+  return begin;
+}
+
+// Parses standard format specifiers and sends notifications about parsed
+// components to handler.
+template <typename Char, typename SpecHandler>
+FMT_CONSTEXPR const Char* parse_format_specs(const Char* begin, const Char* end,
+                                             SpecHandler&& handler) {
+  if (begin == end) return begin;
+
+  begin = parse_align(begin, end, handler);
+  if (begin == end) return begin;
+
+  // Parse sign.
+  switch (to_ascii(*begin)) {
+  case '+':
+    handler.on_plus();
+    ++begin;
+    break;
+  case '-':
+    handler.on_minus();
+    ++begin;
+    break;
+  case ' ':
+    handler.on_space();
+    ++begin;
+    break;
+  }
+  if (begin == end) return begin;
+
+  if (*begin == '#') {
+    handler.on_hash();
+    if (++begin == end) return begin;
+  }
+
+  // Parse zero flag.
+  if (*begin == '0') {
+    handler.on_zero();
+    if (++begin == end) return begin;
+  }
+
+  begin = parse_width(begin, end, handler);
+  if (begin == end) return begin;
+
+  // Parse precision.
+  if (*begin == '.') {
+    begin = parse_precision(begin, end, handler);
+  }
+
+  // Parse type.
+  if (begin != end && *begin != '}') handler.on_type(*begin++);
+  return begin;
+}
+
+// Return the result via the out param to workaround gcc bug 77539.
+template <bool IS_CONSTEXPR, typename T, typename Ptr = const T*>
+FMT_CONSTEXPR bool find(Ptr first, Ptr last, T value, Ptr& out) {
+  for (out = first; out != last; ++out) {
+    if (*out == value) return true;
+  }
+  return false;
+}
+
+template <>
+inline bool find<false, char>(const char* first, const char* last, char value,
+                              const char*& out) {
+  out = static_cast<const char*>(
+      std::memchr(first, value, detail::to_unsigned(last - first)));
+  return out != nullptr;
+}
+
+template <typename Handler, typename Char> struct id_adapter {
+  Handler& handler;
+  int arg_id;
+
+  FMT_CONSTEXPR void operator()() { arg_id = handler.on_arg_id(); }
+  FMT_CONSTEXPR void operator()(int id) { arg_id = handler.on_arg_id(id); }
+  FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
+    arg_id = handler.on_arg_id(id);
+  }
+  FMT_CONSTEXPR void on_error(const char* message) {
+    handler.on_error(message);
+  }
+};
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR const Char* parse_replacement_field(const Char* begin,
+                                                  const Char* end,
+                                                  Handler&& handler) {
+  ++begin;
+  if (begin == end) return handler.on_error("invalid format string"), end;
+  if (*begin == '}') {
+    handler.on_replacement_field(handler.on_arg_id(), begin);
+  } else if (*begin == '{') {
+    handler.on_text(begin, begin + 1);
+  } else {
+    auto adapter = id_adapter<Handler, Char>{handler, 0};
+    begin = parse_arg_id(begin, end, adapter);
+    Char c = begin != end ? *begin : Char();
+    if (c == '}') {
+      handler.on_replacement_field(adapter.arg_id, begin);
+    } else if (c == ':') {
+      begin = handler.on_format_specs(adapter.arg_id, begin + 1, end);
+      if (begin == end || *begin != '}')
+        return handler.on_error("unknown format specifier"), end;
+    } else {
+      return handler.on_error("missing '}' in format string"), end;
+    }
+  }
+  return begin + 1;
+}
+
+template <bool IS_CONSTEXPR, typename Char, typename Handler>
+FMT_CONSTEXPR_DECL FMT_INLINE void parse_format_string(
+    basic_string_view<Char> format_str, Handler&& handler) {
+  auto begin = format_str.data();
+  auto end = begin + format_str.size();
+  if (end - begin < 32) {
+    // Use a simple loop instead of memchr for small strings.
+    const Char* p = begin;
+    while (p != end) {
+      auto c = *p++;
+      if (c == '{') {
+        handler.on_text(begin, p - 1);
+        begin = p = parse_replacement_field(p - 1, end, handler);
+      } else if (c == '}') {
+        if (p == end || *p != '}')
+          return handler.on_error("unmatched '}' in format string");
+        handler.on_text(begin, p);
+        begin = ++p;
+      }
+    }
+    handler.on_text(begin, end);
+    return;
+  }
+  struct pfs_writer {
+    FMT_CONSTEXPR void operator()(const Char* pbegin, const Char* pend) {
+      if (pbegin == pend) return;
+      for (;;) {
+        const Char* p = nullptr;
+        if (!find<IS_CONSTEXPR>(pbegin, pend, '}', p))
+          return handler_.on_text(pbegin, pend);
+        ++p;
+        if (p == pend || *p != '}')
+          return handler_.on_error("unmatched '}' in format string");
+        handler_.on_text(pbegin, p);
+        pbegin = p + 1;
+      }
+    }
+    Handler& handler_;
+  } write{handler};
+  while (begin != end) {
+    // Doing two passes with memchr (one for '{' and another for '}') is up to
+    // 2.5x faster than the naive one-pass implementation on big format strings.
+    const Char* p = begin;
+    if (*begin != '{' && !find<IS_CONSTEXPR>(begin + 1, end, '{', p))
+      return write(begin, end);
+    write(begin, p);
+    begin = parse_replacement_field(p, end, handler);
+  }
+}
+
+template <typename T, typename ParseContext>
+FMT_CONSTEXPR const typename ParseContext::char_type* parse_format_specs(
+    ParseContext& ctx) {
+  using char_type = typename ParseContext::char_type;
+  using context = buffer_context<char_type>;
+  using mapped_type =
+      conditional_t<detail::mapped_type_constant<T, context>::value !=
+                        type::custom_type,
+                    decltype(arg_mapper<context>().map(std::declval<T>())), T>;
+  auto f = conditional_t<has_formatter<mapped_type, context>::value,
+                         formatter<mapped_type, char_type>,
+                         detail::fallback_formatter<T, char_type>>();
+  return f.parse(ctx);
+}
+
+template <typename OutputIt, typename Char, typename Context>
+struct format_handler : detail::error_handler {
+  basic_format_parse_context<Char> parse_context;
+  Context context;
+
+  format_handler(OutputIt out, basic_string_view<Char> str,
+                 basic_format_args<Context> format_args, detail::locale_ref loc)
+      : parse_context(str), context(out, format_args, loc) {}
+
+  void on_text(const Char* begin, const Char* end) {
+    auto size = to_unsigned(end - begin);
+    auto out = context.out();
+    auto&& it = reserve(out, size);
+    it = std::copy_n(begin, size, it);
+    context.advance_to(out);
+  }
+
+  int on_arg_id() { return parse_context.next_arg_id(); }
+  int on_arg_id(int id) { return parse_context.check_arg_id(id), id; }
+  int on_arg_id(basic_string_view<Char> id) {
+    int arg_id = context.arg_id(id);
+    if (arg_id < 0) on_error("argument not found");
+    return arg_id;
+  }
+
+  FMT_INLINE void on_replacement_field(int id, const Char*) {
+    auto arg = get_arg(context, id);
+    context.advance_to(visit_format_arg(
+        default_arg_formatter<OutputIt, Char>{context.out(), context.args(),
+                                              context.locale()},
+        arg));
+  }
+
+  const Char* on_format_specs(int id, const Char* begin, const Char* end) {
+    auto arg = get_arg(context, id);
+    if (arg.type() == type::custom_type) {
+      advance_to(parse_context, begin);
+      visit_format_arg(custom_formatter<Context>(parse_context, context), arg);
+      return parse_context.begin();
+    }
+    auto specs = basic_format_specs<Char>();
+    if (begin + 1 < end && begin[1] == '}' && is_ascii_letter(*begin)) {
+      specs.type = static_cast<char>(*begin++);
+    } else {
+      using parse_context_t = basic_format_parse_context<Char>;
+      specs_checker<specs_handler<parse_context_t, Context>> handler(
+          specs_handler<parse_context_t, Context>(specs, parse_context,
+                                                  context),
+          arg.type());
+      begin = parse_format_specs(begin, end, handler);
+      if (begin == end || *begin != '}')
+        on_error("missing '}' in format string");
+    }
+    context.advance_to(visit_format_arg(
+        arg_formatter<OutputIt, Char>(context, &parse_context, &specs), arg));
+    return begin;
+  }
+};
+
+// A parse context with extra argument id checks. It is only used at compile
+// time because adding checks at runtime would introduce substantial overhead
+// and would be redundant since argument ids are checked when arguments are
+// retrieved anyway.
+template <typename Char, typename ErrorHandler = error_handler>
+class compile_parse_context
+    : public basic_format_parse_context<Char, ErrorHandler> {
+ private:
+  int num_args_;
+  using base = basic_format_parse_context<Char, ErrorHandler>;
+
+ public:
+  explicit FMT_CONSTEXPR compile_parse_context(
+      basic_string_view<Char> format_str, int num_args = max_value<int>(),
+      ErrorHandler eh = {})
+      : base(format_str, eh), num_args_(num_args) {}
+
+  FMT_CONSTEXPR int next_arg_id() {
+    int id = base::next_arg_id();
+    if (id >= num_args_) this->on_error("argument not found");
+    return id;
+  }
+
+  FMT_CONSTEXPR void check_arg_id(int id) {
+    base::check_arg_id(id);
+    if (id >= num_args_) this->on_error("argument not found");
+  }
+  using base::check_arg_id;
+};
+
+template <typename Char, typename ErrorHandler, typename... Args>
+class format_string_checker {
+ public:
+  explicit FMT_CONSTEXPR format_string_checker(
+      basic_string_view<Char> format_str, ErrorHandler eh)
+      : context_(format_str, num_args, eh),
+        parse_funcs_{&parse_format_specs<Args, parse_context_type>...} {}
+
+  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
+
+  FMT_CONSTEXPR int on_arg_id() { return context_.next_arg_id(); }
+  FMT_CONSTEXPR int on_arg_id(int id) { return context_.check_arg_id(id), id; }
+  FMT_CONSTEXPR int on_arg_id(basic_string_view<Char>) {
+    on_error("compile-time checks don't support named arguments");
+    return 0;
+  }
+
+  FMT_CONSTEXPR void on_replacement_field(int, const Char*) {}
+
+  FMT_CONSTEXPR const Char* on_format_specs(int id, const Char* begin,
+                                            const Char*) {
+    advance_to(context_, begin);
+    return id < num_args ? parse_funcs_[id](context_) : begin;
+  }
+
+  FMT_CONSTEXPR void on_error(const char* message) {
+    context_.on_error(message);
+  }
+
+ private:
+  using parse_context_type = compile_parse_context<Char, ErrorHandler>;
+  enum { num_args = sizeof...(Args) };
+
+  // Format specifier parsing function.
+  using parse_func = const Char* (*)(parse_context_type&);
+
+  parse_context_type context_;
+  parse_func parse_funcs_[num_args > 0 ? num_args : 1];
+};
+
+// Converts string literals to basic_string_view.
+template <typename Char, size_t N>
+FMT_CONSTEXPR basic_string_view<Char> compile_string_to_view(
+    const Char (&s)[N]) {
+  // Remove trailing null character if needed. Won't be present if this is used
+  // with raw character array (i.e. not defined as a string).
+  return {s,
+          N - ((std::char_traits<Char>::to_int_type(s[N - 1]) == 0) ? 1 : 0)};
+}
+
+// Converts string_view to basic_string_view.
+template <typename Char>
+FMT_CONSTEXPR basic_string_view<Char> compile_string_to_view(
+    const std_string_view<Char>& s) {
+  return {s.data(), s.size()};
+}
+
+#define FMT_STRING_IMPL(s, base)                                  \
+  [] {                                                            \
+    /* Use a macro-like name to avoid shadowing warnings. */      \
+    struct FMT_COMPILE_STRING : base {                            \
+      using char_type = fmt::remove_cvref_t<decltype(s[0])>;      \
+      FMT_MAYBE_UNUSED FMT_CONSTEXPR                              \
+      operator fmt::basic_string_view<char_type>() const {        \
+        return fmt::detail::compile_string_to_view<char_type>(s); \
+      }                                                           \
+    };                                                            \
+    return FMT_COMPILE_STRING();                                  \
+  }()
+
+/**
+  \rst
+  Constructs a compile-time format string from a string literal *s*.
+
+  **Example**::
+
+    // A compile-time error because 'd' is an invalid specifier for strings.
+    std::string s = fmt::format(FMT_STRING("{:d}"), "foo");
+  \endrst
+ */
+#define FMT_STRING(s) FMT_STRING_IMPL(s, fmt::compile_string)
+
+template <typename... Args, typename S,
+          enable_if_t<(is_compile_string<S>::value), int>>
+void check_format_string(S format_str) {
+  FMT_CONSTEXPR_DECL auto s = to_string_view(format_str);
+  using checker = format_string_checker<typename S::char_type, error_handler,
+                                        remove_cvref_t<Args>...>;
+  FMT_CONSTEXPR_DECL bool invalid_format =
+      (parse_format_string<true>(s, checker(s, {})), true);
+  (void)invalid_format;
+}
+
+template <template <typename> class Handler, typename Context>
+void handle_dynamic_spec(int& value, arg_ref<typename Context::char_type> ref,
+                         Context& ctx) {
+  switch (ref.kind) {
+  case arg_id_kind::none:
+    break;
+  case arg_id_kind::index:
+    value = detail::get_dynamic_spec<Handler>(ctx.arg(ref.val.index),
+                                              ctx.error_handler());
+    break;
+  case arg_id_kind::name:
+    value = detail::get_dynamic_spec<Handler>(ctx.arg(ref.val.name),
+                                              ctx.error_handler());
+    break;
+  }
+}
+
+using format_func = void (*)(detail::buffer<char>&, int, string_view);
+
+FMT_API void format_error_code(buffer<char>& out, int error_code,
+                               string_view message) FMT_NOEXCEPT;
+
+FMT_API void report_error(format_func func, int error_code,
+                          string_view message) FMT_NOEXCEPT;
+}  // namespace detail
+
+template <typename OutputIt, typename Char>
+using arg_formatter FMT_DEPRECATED_ALIAS =
+    detail::arg_formatter<OutputIt, Char>;
+
+/**
+ An error returned by an operating system or a language runtime,
+ for example a file opening error.
+*/
+FMT_CLASS_API
+class FMT_API system_error : public std::runtime_error {
+ private:
+  void init(int err_code, string_view format_str, format_args args);
+
+ protected:
+  int error_code_;
+
+  system_error() : std::runtime_error(""), error_code_(0) {}
+
+ public:
+  /**
+   \rst
+   Constructs a :class:`fmt::system_error` object with a description
+   formatted with `fmt::format_system_error`. *message* and additional
+   arguments passed into the constructor are formatted similarly to
+   `fmt::format`.
+
+   **Example**::
+
+     // This throws a system_error with the description
+     //   cannot open file 'madeup': No such file or directory
+     // or similar (system message may vary).
+     const char *filename = "madeup";
+     std::FILE *file = std::fopen(filename, "r");
+     if (!file)
+       throw fmt::system_error(errno, "cannot open file '{}'", filename);
+   \endrst
+  */
+  template <typename... Args>
+  system_error(int error_code, string_view message, const Args&... args)
+      : std::runtime_error("") {
+    init(error_code, message, make_format_args(args...));
+  }
+  system_error(const system_error&) = default;
+  system_error& operator=(const system_error&) = default;
+  system_error(system_error&&) = default;
+  system_error& operator=(system_error&&) = default;
+  ~system_error() FMT_NOEXCEPT FMT_OVERRIDE;
+
+  int error_code() const { return error_code_; }
+};
+
+/**
+  \rst
+  Formats an error returned by an operating system or a language runtime,
+  for example a file opening error, and writes it to *out* in the following
+  form:
+
+  .. parsed-literal::
+     *<message>*: *<system-message>*
+
+  where *<message>* is the passed message and *<system-message>* is
+  the system message corresponding to the error code.
+  *error_code* is a system error code as given by ``errno``.
+  If *error_code* is not a valid error code such as -1, the system message
+  may look like "Unknown error -1" and is platform-dependent.
+  \endrst
+ */
+FMT_API void format_system_error(detail::buffer<char>& out, int error_code,
+                                 string_view message) FMT_NOEXCEPT;
+
+// Reports a system error without throwing an exception.
+// Can be used to report errors from destructors.
+FMT_API void report_system_error(int error_code,
+                                 string_view message) FMT_NOEXCEPT;
+
+/** Fast integer formatter. */
+class format_int {
+ private:
+  // Buffer should be large enough to hold all digits (digits10 + 1),
+  // a sign and a null character.
+  enum { buffer_size = std::numeric_limits<unsigned long long>::digits10 + 3 };
+  mutable char buffer_[buffer_size];
+  char* str_;
+
+  template <typename UInt> char* format_unsigned(UInt value) {
+    auto n = static_cast<detail::uint32_or_64_or_128_t<UInt>>(value);
+    return detail::format_decimal(buffer_, n, buffer_size - 1).begin;
+  }
+
+  template <typename Int> char* format_signed(Int value) {
+    auto abs_value = static_cast<detail::uint32_or_64_or_128_t<Int>>(value);
+    bool negative = value < 0;
+    if (negative) abs_value = 0 - abs_value;
+    auto begin = format_unsigned(abs_value);
+    if (negative) *--begin = '-';
+    return begin;
+  }
+
+ public:
+  explicit format_int(int value) : str_(format_signed(value)) {}
+  explicit format_int(long value) : str_(format_signed(value)) {}
+  explicit format_int(long long value) : str_(format_signed(value)) {}
+  explicit format_int(unsigned value) : str_(format_unsigned(value)) {}
+  explicit format_int(unsigned long value) : str_(format_unsigned(value)) {}
+  explicit format_int(unsigned long long value)
+      : str_(format_unsigned(value)) {}
+
+  /** Returns the number of characters written to the output buffer. */
+  size_t size() const {
+    return detail::to_unsigned(buffer_ - str_ + buffer_size - 1);
+  }
+
+  /**
+    Returns a pointer to the output buffer content. No terminating null
+    character is appended.
+   */
+  const char* data() const { return str_; }
+
+  /**
+    Returns a pointer to the output buffer content with terminating null
+    character appended.
+   */
+  const char* c_str() const {
+    buffer_[buffer_size - 1] = '\0';
+    return str_;
+  }
+
+  /**
+    \rst
+    Returns the content of the output buffer as an ``std::string``.
+    \endrst
+   */
+  std::string str() const { return std::string(str_, size()); }
+};
+
+// A formatter specialization for the core types corresponding to detail::type
+// constants.
+template <typename T, typename Char>
+struct formatter<T, Char,
+                 enable_if_t<detail::type_constant<T, Char>::value !=
+                             detail::type::custom_type>> {
+  FMT_CONSTEXPR formatter() = default;
+
+  // Parses format specifiers stopping either at the end of the range or at the
+  // terminating '}'.
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    using handler_type = detail::dynamic_specs_handler<ParseContext>;
+    auto type = detail::type_constant<T, Char>::value;
+    detail::specs_checker<handler_type> handler(handler_type(specs_, ctx),
+                                                type);
+    auto it = parse_format_specs(ctx.begin(), ctx.end(), handler);
+    auto eh = ctx.error_handler();
+    switch (type) {
+    case detail::type::none_type:
+      FMT_ASSERT(false, "invalid argument type");
+      break;
+    case detail::type::int_type:
+    case detail::type::uint_type:
+    case detail::type::long_long_type:
+    case detail::type::ulong_long_type:
+    case detail::type::int128_type:
+    case detail::type::uint128_type:
+    case detail::type::bool_type:
+      handle_int_type_spec(specs_.type,
+                           detail::int_type_checker<decltype(eh)>(eh));
+      break;
+    case detail::type::char_type:
+      handle_char_specs(
+          &specs_, detail::char_specs_checker<decltype(eh)>(specs_.type, eh));
+      break;
+    case detail::type::float_type:
+      if (detail::const_check(FMT_USE_FLOAT))
+        detail::parse_float_type_spec(specs_, eh);
+      else
+        FMT_ASSERT(false, "float support disabled");
+      break;
+    case detail::type::double_type:
+      if (detail::const_check(FMT_USE_DOUBLE))
+        detail::parse_float_type_spec(specs_, eh);
+      else
+        FMT_ASSERT(false, "double support disabled");
+      break;
+    case detail::type::long_double_type:
+      if (detail::const_check(FMT_USE_LONG_DOUBLE))
+        detail::parse_float_type_spec(specs_, eh);
+      else
+        FMT_ASSERT(false, "long double support disabled");
+      break;
+    case detail::type::cstring_type:
+      detail::handle_cstring_type_spec(
+          specs_.type, detail::cstring_type_checker<decltype(eh)>(eh));
+      break;
+    case detail::type::string_type:
+      detail::check_string_type_spec(specs_.type, eh);
+      break;
+    case detail::type::pointer_type:
+      detail::check_pointer_type_spec(specs_.type, eh);
+      break;
+    case detail::type::custom_type:
+      // Custom format specifiers should be checked in parse functions of
+      // formatter specializations.
+      break;
+    }
+    return it;
+  }
+
+  template <typename FormatContext>
+  auto format(const T& val, FormatContext& ctx) -> decltype(ctx.out()) {
+    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
+                                                       specs_.width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(
+        specs_.precision, specs_.precision_ref, ctx);
+    using af = detail::arg_formatter<typename FormatContext::iterator,
+                                     typename FormatContext::char_type>;
+    return visit_format_arg(af(ctx, nullptr, &specs_),
+                            detail::make_arg<FormatContext>(val));
+  }
+
+ private:
+  detail::dynamic_format_specs<Char> specs_;
+};
+
+#define FMT_FORMAT_AS(Type, Base)                                             \
+  template <typename Char>                                                    \
+  struct formatter<Type, Char> : formatter<Base, Char> {                      \
+    template <typename FormatContext>                                         \
+    auto format(Type const& val, FormatContext& ctx) -> decltype(ctx.out()) { \
+      return formatter<Base, Char>::format(val, ctx);                         \
+    }                                                                         \
+  }
+
+FMT_FORMAT_AS(signed char, int);
+FMT_FORMAT_AS(unsigned char, unsigned);
+FMT_FORMAT_AS(short, int);
+FMT_FORMAT_AS(unsigned short, unsigned);
+FMT_FORMAT_AS(long, long long);
+FMT_FORMAT_AS(unsigned long, unsigned long long);
+FMT_FORMAT_AS(Char*, const Char*);
+FMT_FORMAT_AS(std::basic_string<Char>, basic_string_view<Char>);
+FMT_FORMAT_AS(std::nullptr_t, const void*);
+FMT_FORMAT_AS(detail::std_string_view<Char>, basic_string_view<Char>);
+
+template <typename Char>
+struct formatter<void*, Char> : formatter<const void*, Char> {
+  template <typename FormatContext>
+  auto format(void* val, FormatContext& ctx) -> decltype(ctx.out()) {
+    return formatter<const void*, Char>::format(val, ctx);
+  }
+};
+
+template <typename Char, size_t N>
+struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {
+  template <typename FormatContext>
+  auto format(const Char* val, FormatContext& ctx) -> decltype(ctx.out()) {
+    return formatter<basic_string_view<Char>, Char>::format(val, ctx);
+  }
+};
+
+// A formatter for types known only at run time such as variant alternatives.
+//
+// Usage:
+//   using variant = std::variant<int, std::string>;
+//   template <>
+//   struct formatter<variant>: dynamic_formatter<> {
+//     auto format(const variant& v, format_context& ctx) {
+//       return visit([&](const auto& val) {
+//           return dynamic_formatter<>::format(val, ctx);
+//       }, v);
+//     }
+//   };
+template <typename Char = char> class dynamic_formatter {
+ private:
+  struct null_handler : detail::error_handler {
+    void on_align(align_t) {}
+    void on_plus() {}
+    void on_minus() {}
+    void on_space() {}
+    void on_hash() {}
+  };
+
+ public:
+  template <typename ParseContext>
+  auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    format_str_ = ctx.begin();
+    // Checks are deferred to formatting time when the argument type is known.
+    detail::dynamic_specs_handler<ParseContext> handler(specs_, ctx);
+    return parse_format_specs(ctx.begin(), ctx.end(), handler);
+  }
+
+  template <typename T, typename FormatContext>
+  auto format(const T& val, FormatContext& ctx) -> decltype(ctx.out()) {
+    handle_specs(ctx);
+    detail::specs_checker<null_handler> checker(
+        null_handler(), detail::mapped_type_constant<T, FormatContext>::value);
+    checker.on_align(specs_.align);
+    switch (specs_.sign) {
+    case sign::none:
+      break;
+    case sign::plus:
+      checker.on_plus();
+      break;
+    case sign::minus:
+      checker.on_minus();
+      break;
+    case sign::space:
+      checker.on_space();
+      break;
+    }
+    if (specs_.alt) checker.on_hash();
+    if (specs_.precision >= 0) checker.end_precision();
+    using af = detail::arg_formatter<typename FormatContext::iterator,
+                                     typename FormatContext::char_type>;
+    visit_format_arg(af(ctx, nullptr, &specs_),
+                     detail::make_arg<FormatContext>(val));
+    return ctx.out();
+  }
+
+ private:
+  template <typename Context> void handle_specs(Context& ctx) {
+    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
+                                                       specs_.width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(
+        specs_.precision, specs_.precision_ref, ctx);
+  }
+
+  detail::dynamic_format_specs<Char> specs_;
+  const Char* format_str_;
+};
+
+template <typename Char, typename ErrorHandler>
+FMT_CONSTEXPR void advance_to(
+    basic_format_parse_context<Char, ErrorHandler>& ctx, const Char* p) {
+  ctx.advance_to(ctx.begin() + (p - &*ctx.begin()));
+}
+
+/**
+  \rst
+  Converts ``p`` to ``const void*`` for pointer formatting.
+
+  **Example**::
+
+    auto s = fmt::format("{}", fmt::ptr(p));
+  \endrst
+ */
+template <typename T> inline const void* ptr(const T* p) { return p; }
+template <typename T> inline const void* ptr(const std::unique_ptr<T>& p) {
+  return p.get();
+}
+template <typename T> inline const void* ptr(const std::shared_ptr<T>& p) {
+  return p.get();
+}
+
+class bytes {
+ private:
+  string_view data_;
+  friend struct formatter<bytes>;
+
+ public:
+  explicit bytes(string_view data) : data_(data) {}
+};
+
+template <> struct formatter<bytes> {
+ private:
+  detail::dynamic_format_specs<char> specs_;
+
+ public:
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    using handler_type = detail::dynamic_specs_handler<ParseContext>;
+    detail::specs_checker<handler_type> handler(handler_type(specs_, ctx),
+                                                detail::type::string_type);
+    auto it = parse_format_specs(ctx.begin(), ctx.end(), handler);
+    detail::check_string_type_spec(specs_.type, ctx.error_handler());
+    return it;
+  }
+
+  template <typename FormatContext>
+  auto format(bytes b, FormatContext& ctx) -> decltype(ctx.out()) {
+    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
+                                                       specs_.width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(
+        specs_.precision, specs_.precision_ref, ctx);
+    return detail::write_bytes(ctx.out(), b.data_, specs_);
+  }
+};
+
+template <typename It, typename Sentinel, typename Char>
+struct arg_join : detail::view {
+  It begin;
+  Sentinel end;
+  basic_string_view<Char> sep;
+
+  arg_join(It b, Sentinel e, basic_string_view<Char> s)
+      : begin(b), end(e), sep(s) {}
+};
+
+template <typename It, typename Sentinel, typename Char>
+struct formatter<arg_join<It, Sentinel, Char>, Char>
+    : formatter<typename std::iterator_traits<It>::value_type, Char> {
+  template <typename FormatContext>
+  auto format(const arg_join<It, Sentinel, Char>& value, FormatContext& ctx)
+      -> decltype(ctx.out()) {
+    using base = formatter<typename std::iterator_traits<It>::value_type, Char>;
+    auto it = value.begin;
+    auto out = ctx.out();
+    if (it != value.end) {
+      out = base::format(*it++, ctx);
+      while (it != value.end) {
+        out = std::copy(value.sep.begin(), value.sep.end(), out);
+        ctx.advance_to(out);
+        out = base::format(*it++, ctx);
+      }
+    }
+    return out;
+  }
+};
+
+/**
+  Returns an object that formats the iterator range `[begin, end)` with elements
+  separated by `sep`.
+ */
+template <typename It, typename Sentinel>
+arg_join<It, Sentinel, char> join(It begin, Sentinel end, string_view sep) {
+  return {begin, end, sep};
+}
+
+template <typename It, typename Sentinel>
+arg_join<It, Sentinel, wchar_t> join(It begin, Sentinel end, wstring_view sep) {
+  return {begin, end, sep};
+}
+
+/**
+  \rst
+  Returns an object that formats `range` with elements separated by `sep`.
+
+  **Example**::
+
+    std::vector<int> v = {1, 2, 3};
+    fmt::print("{}", fmt::join(v, ", "));
+    // Output: "1, 2, 3"
+
+  ``fmt::join`` applies passed format specifiers to the range elements::
+
+    fmt::print("{:02}", fmt::join(v, ", "));
+    // Output: "01, 02, 03"
+  \endrst
+ */
+template <typename Range>
+arg_join<detail::iterator_t<Range>, detail::sentinel_t<Range>, char> join(
+    Range&& range, string_view sep) {
+  return join(std::begin(range), std::end(range), sep);
+}
+
+template <typename Range>
+arg_join<detail::iterator_t<Range>, detail::sentinel_t<Range>, wchar_t> join(
+    Range&& range, wstring_view sep) {
+  return join(std::begin(range), std::end(range), sep);
+}
+
+/**
+  \rst
+  Converts *value* to ``std::string`` using the default format for type *T*.
+
+  **Example**::
+
+    #include <fmt/format.h>
+
+    std::string answer = fmt::to_string(42);
+  \endrst
+ */
+template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
+inline std::string to_string(const T& value) {
+  std::string result;
+  detail::write<char>(std::back_inserter(result), value);
+  return result;
+}
+
+template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
+inline std::string to_string(T value) {
+  // The buffer should be large enough to store the number including the sign or
+  // "false" for bool.
+  constexpr int max_size = detail::digits10<T>() + 2;
+  char buffer[max_size > 5 ? static_cast<unsigned>(max_size) : 5];
+  char* begin = buffer;
+  return std::string(begin, detail::write<char>(begin, value));
+}
+
+/**
+  Converts *value* to ``std::wstring`` using the default format for type *T*.
+ */
+template <typename T> inline std::wstring to_wstring(const T& value) {
+  return format(L"{}", value);
+}
+
+template <typename Char, size_t SIZE>
+std::basic_string<Char> to_string(const basic_memory_buffer<Char, SIZE>& buf) {
+  auto size = buf.size();
+  detail::assume(size < std::basic_string<Char>().max_size());
+  return std::basic_string<Char>(buf.data(), size);
+}
+
+template <typename Char>
+void detail::vformat_to(
+    detail::buffer<Char>& buf, basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args,
+    detail::locale_ref loc) {
+  using iterator = typename buffer_context<Char>::iterator;
+  auto out = buffer_appender<Char>(buf);
+  if (format_str.size() == 2 && equal2(format_str.data(), "{}")) {
+    auto arg = args.get(0);
+    if (!arg) error_handler().on_error("argument not found");
+    visit_format_arg(default_arg_formatter<iterator, Char>{out, args, loc},
+                     arg);
+    return;
+  }
+  format_handler<iterator, Char, buffer_context<Char>> h(out, format_str, args,
+                                                         loc);
+  parse_format_string<false>(format_str, h);
+}
+
+#ifndef FMT_HEADER_ONLY
+extern template void detail::vformat_to(detail::buffer<char>&, string_view,
+                                        basic_format_args<format_context>,
+                                        detail::locale_ref);
+namespace detail {
+
+extern template FMT_API std::string grouping_impl<char>(locale_ref loc);
+extern template FMT_API std::string grouping_impl<wchar_t>(locale_ref loc);
+extern template FMT_API char thousands_sep_impl<char>(locale_ref loc);
+extern template FMT_API wchar_t thousands_sep_impl<wchar_t>(locale_ref loc);
+extern template FMT_API char decimal_point_impl(locale_ref loc);
+extern template FMT_API wchar_t decimal_point_impl(locale_ref loc);
+extern template int format_float<double>(double value, int precision,
+                                         float_specs specs, buffer<char>& buf);
+extern template int format_float<long double>(long double value, int precision,
+                                              float_specs specs,
+                                              buffer<char>& buf);
+int snprintf_float(float value, int precision, float_specs specs,
+                   buffer<char>& buf) = delete;
+extern template int snprintf_float<double>(double value, int precision,
+                                           float_specs specs,
+                                           buffer<char>& buf);
+extern template int snprintf_float<long double>(long double value,
+                                                int precision,
+                                                float_specs specs,
+                                                buffer<char>& buf);
+}  // namespace detail
+#endif
+
+template <typename S, typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_string<S>::value)>
+inline void vformat_to(
+    detail::buffer<Char>& buf, const S& format_str,
+    basic_format_args<FMT_BUFFER_CONTEXT(type_identity_t<Char>)> args) {
+  return detail::vformat_to(buf, to_string_view(format_str), args);
+}
+
+template <typename S, typename... Args, size_t SIZE = inline_buffer_size,
+          typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
+inline typename buffer_context<Char>::iterator format_to(
+    basic_memory_buffer<Char, SIZE>& buf, const S& format_str, Args&&... args) {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  detail::vformat_to(buf, to_string_view(format_str), vargs);
+  return detail::buffer_appender<Char>(buf);
+}
+
+template <typename OutputIt, typename Char = char>
+using format_context_t = basic_format_context<OutputIt, Char>;
+
+template <typename OutputIt, typename Char = char>
+using format_args_t = basic_format_args<format_context_t<OutputIt, Char>>;
+
+template <typename OutputIt, typename Char = typename OutputIt::value_type>
+using format_to_n_context FMT_DEPRECATED_ALIAS = buffer_context<Char>;
+
+template <typename OutputIt, typename Char = typename OutputIt::value_type>
+using format_to_n_args FMT_DEPRECATED_ALIAS =
+    basic_format_args<buffer_context<Char>>;
+
+template <typename OutputIt, typename Char, typename... Args>
+FMT_DEPRECATED format_arg_store<buffer_context<Char>, Args...>
+make_format_to_n_args(const Args&... args) {
+  return format_arg_store<buffer_context<Char>, Args...>(args...);
+}
+
+template <typename Char, enable_if_t<(!std::is_same<Char, char>::value), int>>
+std::basic_string<Char> detail::vformat(
+    basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  basic_memory_buffer<Char> buffer;
+  detail::vformat_to(buffer, format_str, args);
+  return to_string(buffer);
+}
+
+template <typename Char, FMT_ENABLE_IF(std::is_same<Char, wchar_t>::value)>
+void vprint(std::FILE* f, basic_string_view<Char> format_str,
+            wformat_args args) {
+  wmemory_buffer buffer;
+  detail::vformat_to(buffer, format_str, args);
+  buffer.push_back(L'\0');
+  if (std::fputws(buffer.data(), f) == -1)
+    FMT_THROW(system_error(errno, "cannot write to file"));
+}
+
+template <typename Char, FMT_ENABLE_IF(std::is_same<Char, wchar_t>::value)>
+void vprint(basic_string_view<Char> format_str, wformat_args args) {
+  vprint(stdout, format_str, args);
+}
+
+#if FMT_USE_USER_DEFINED_LITERALS
+namespace detail {
+
+#  if FMT_USE_UDL_TEMPLATE
+template <typename Char, Char... CHARS> class udl_formatter {
+ public:
+  template <typename... Args>
+  std::basic_string<Char> operator()(Args&&... args) const {
+    static FMT_CONSTEXPR_DECL Char s[] = {CHARS..., '\0'};
+    return format(FMT_STRING(s), std::forward<Args>(args)...);
+  }
+};
+#  else
+template <typename Char> struct udl_formatter {
+  basic_string_view<Char> str;
+
+  template <typename... Args>
+  std::basic_string<Char> operator()(Args&&... args) const {
+    return format(str, std::forward<Args>(args)...);
+  }
+};
+#  endif  // FMT_USE_UDL_TEMPLATE
+
+template <typename Char> struct udl_arg {
+  const Char* str;
+
+  template <typename T> named_arg<Char, T> operator=(T&& value) const {
+    return {str, std::forward<T>(value)};
+  }
+};
+}  // namespace detail
+
+inline namespace literals {
+#  if FMT_USE_UDL_TEMPLATE
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wpedantic"
+#    if FMT_CLANG_VERSION
+#      pragma GCC diagnostic ignored "-Wgnu-string-literal-operator-template"
+#    endif
+template <typename Char, Char... CHARS>
+FMT_CONSTEXPR detail::udl_formatter<Char, CHARS...> operator""_format() {
+  return {};
+}
+#    pragma GCC diagnostic pop
+#  else
+/**
+  \rst
+  User-defined literal equivalent of :func:`fmt::format`.
+
+  **Example**::
+
+    using namespace fmt::literals;
+    std::string message = "The answer is {}"_format(42);
+  \endrst
+ */
+FMT_CONSTEXPR detail::udl_formatter<char> operator"" _format(const char* s,
+                                                             size_t n) {
+  return {{s, n}};
+}
+FMT_CONSTEXPR detail::udl_formatter<wchar_t> operator"" _format(
+    const wchar_t* s, size_t n) {
+  return {{s, n}};
+}
+#  endif  // FMT_USE_UDL_TEMPLATE
+
+/**
+  \rst
+  User-defined literal equivalent of :func:`fmt::arg`.
+
+  **Example**::
+
+    using namespace fmt::literals;
+    fmt::print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
+  \endrst
+ */
+FMT_CONSTEXPR detail::udl_arg<char> operator"" _a(const char* s, size_t) {
+  return {s};
+}
+FMT_CONSTEXPR detail::udl_arg<wchar_t> operator"" _a(const wchar_t* s, size_t) {
+  return {s};
+}
+}  // namespace literals
+#endif  // FMT_USE_USER_DEFINED_LITERALS
+FMT_END_NAMESPACE
+
+#ifdef FMT_HEADER_ONLY
+#  define FMT_FUNC inline
+#  include "format-inl.h"
+#else
+#  define FMT_FUNC
+#endif
+
+#endif  // FMT_FORMAT_H_
diff --git a/include/vtkdiy2/thirdparty/fmt/locale.h b/include/vtkdiy2/thirdparty/fmt/locale.h
new file mode 100644
index 00000000000..7301bf92a24
--- /dev/null
+++ b/include/vtkdiy2/thirdparty/fmt/locale.h
@@ -0,0 +1,64 @@
+// Formatting library for C++ - std::locale support
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_LOCALE_H_
+#define FMT_LOCALE_H_
+
+#include <locale>
+
+#include "format.h"
+
+FMT_BEGIN_NAMESPACE
+
+namespace detail {
+template <typename Char>
+std::basic_string<Char> vformat(
+    const std::locale& loc, basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  basic_memory_buffer<Char> buffer;
+  detail::vformat_to(buffer, format_str, args, detail::locale_ref(loc));
+  return fmt::to_string(buffer);
+}
+}  // namespace detail
+
+template <typename S, typename Char = char_t<S>>
+inline std::basic_string<Char> vformat(
+    const std::locale& loc, const S& format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  return detail::vformat(loc, to_string_view(format_str), args);
+}
+
+template <typename S, typename... Args, typename Char = char_t<S>>
+inline std::basic_string<Char> format(const std::locale& loc,
+                                      const S& format_str, Args&&... args) {
+  return detail::vformat(loc, to_string_view(format_str),
+                         fmt::make_args_checked<Args...>(format_str, args...));
+}
+
+template <typename S, typename OutputIt, typename... Args,
+          typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
+inline OutputIt vformat_to(
+    OutputIt out, const std::locale& loc, const S& format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
+  vformat_to(buf, to_string_view(format_str), args, detail::locale_ref(loc));
+  return detail::get_iterator(buf);
+}
+
+template <typename OutputIt, typename S, typename... Args,
+          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
+inline auto format_to(OutputIt out, const std::locale& loc,
+                      const S& format_str, Args&&... args) ->
+    typename std::enable_if<enable, OutputIt>::type {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return vformat_to(out, loc, to_string_view(format_str), vargs);
+}
+
+FMT_END_NAMESPACE
+
+#endif  // FMT_LOCALE_H_
diff --git a/include/vtkdiy2/fmt/posix.h b/include/vtkdiy2/thirdparty/fmt/os.h
similarity index 55%
rename from include/vtkdiy2/fmt/posix.h
rename to include/vtkdiy2/thirdparty/fmt/os.h
index 6b2d7f8e4c4..d44ea0c904d 100644
--- a/include/vtkdiy2/fmt/posix.h
+++ b/include/vtkdiy2/thirdparty/fmt/os.h
@@ -1,25 +1,23 @@
-// A C++ interface to POSIX functions.
+// Formatting library for C++ - optional OS-specific functionality
 //
-// Copyright (c) 2012 - 2016, Victor Zverovich
+// Copyright (c) 2012 - present, Victor Zverovich
 // All rights reserved.
 //
 // For the license information refer to format.h.
 
-#ifndef FMT_POSIX_H_
-#define FMT_POSIX_H_
+#ifndef FMT_OS_H_
+#define FMT_OS_H_
 
 #if defined(__MINGW32__) || defined(__CYGWIN__)
 // Workaround MinGW bug https://sourceforge.net/p/mingw/bugs/2024/.
 #  undef __STRICT_ANSI__
 #endif
 
-#include <errno.h>
-#include <fcntl.h>   // for O_RDONLY
-#include <locale.h>  // for locale_t
-#include <stdio.h>
-#include <stdlib.h>  // for strtod_l
-
+#include <cerrno>
+#include <clocale>  // for locale_t
 #include <cstddef>
+#include <cstdio>
+#include <cstdlib>  // for strtod_l
 
 #if defined __APPLE__ || defined(__FreeBSD__)
 #  include <xlocale.h>  // for LC_NUMERIC_MASK on OS X
@@ -27,6 +25,19 @@
 
 #include "format.h"
 
+// UWP doesn't provide _pipe.
+#if FMT_HAS_INCLUDE("winapifamily.h")
+#  include <winapifamily.h>
+#endif
+#if (FMT_HAS_INCLUDE(<fcntl.h>) || defined(__APPLE__) || \
+     defined(__linux__)) &&                              \
+    (!defined(WINAPI_FAMILY) || (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP))
+#  include <fcntl.h>  // for O_RDONLY
+#  define FMT_USE_FCNTL 1
+#else
+#  define FMT_USE_FCNTL 0
+#endif
+
 #ifndef FMT_POSIX
 #  if defined(_WIN32) && !defined(__MINGW32__)
 // Fix warnings about deprecated symbols.
@@ -40,7 +51,7 @@
 #ifdef FMT_SYSTEM
 #  define FMT_POSIX_CALL(call) FMT_SYSTEM(call)
 #else
-#  define FMT_SYSTEM(call) call
+#  define FMT_SYSTEM(call) ::call
 #  ifdef _WIN32
 // Fix warnings about deprecated symbols.
 #    define FMT_POSIX_CALL(call) ::_##call
@@ -54,8 +65,8 @@
 #ifndef _WIN32
 #  define FMT_RETRY_VAL(result, expression, error_result) \
     do {                                                  \
-      result = (expression);                              \
-    } while (result == error_result && errno == EINTR)
+      (result) = (expression);                            \
+    } while ((result) == (error_result) && errno == EINTR)
 #else
 #  define FMT_RETRY_VAL(result, expression, error_result) result = (expression)
 #endif
@@ -122,6 +133,78 @@ class error_code {
   int get() const FMT_NOEXCEPT { return value_; }
 };
 
+#ifdef _WIN32
+namespace detail {
+// A converter from UTF-16 to UTF-8.
+// It is only provided for Windows since other systems support UTF-8 natively.
+class utf16_to_utf8 {
+ private:
+  memory_buffer buffer_;
+
+ public:
+  utf16_to_utf8() {}
+  FMT_API explicit utf16_to_utf8(wstring_view s);
+  operator string_view() const { return string_view(&buffer_[0], size()); }
+  size_t size() const { return buffer_.size() - 1; }
+  const char* c_str() const { return &buffer_[0]; }
+  std::string str() const { return std::string(&buffer_[0], size()); }
+
+  // Performs conversion returning a system error code instead of
+  // throwing exception on conversion error. This method may still throw
+  // in case of memory allocation error.
+  FMT_API int convert(wstring_view s);
+};
+
+FMT_API void format_windows_error(buffer<char>& out, int error_code,
+                                  string_view message) FMT_NOEXCEPT;
+}  // namespace detail
+
+/** A Windows error. */
+class windows_error : public system_error {
+ private:
+  FMT_API void init(int error_code, string_view format_str, format_args args);
+
+ public:
+  /**
+   \rst
+   Constructs a :class:`fmt::windows_error` object with the description
+   of the form
+
+   .. parsed-literal::
+     *<message>*: *<system-message>*
+
+   where *<message>* is the formatted message and *<system-message>* is the
+   system message corresponding to the error code.
+   *error_code* is a Windows error code as given by ``GetLastError``.
+   If *error_code* is not a valid error code such as -1, the system message
+   will look like "error -1".
+
+   **Example**::
+
+     // This throws a windows_error with the description
+     //   cannot open file 'madeup': The system cannot find the file specified.
+     // or similar (system message may vary).
+     const char *filename = "madeup";
+     LPOFSTRUCT of = LPOFSTRUCT();
+     HFILE file = OpenFile(filename, &of, OF_READ);
+     if (file == HFILE_ERROR) {
+       throw fmt::windows_error(GetLastError(),
+                                "cannot open file '{}'", filename);
+     }
+   \endrst
+  */
+  template <typename... Args>
+  windows_error(int error_code, string_view message, const Args&... args) {
+    init(error_code, message, make_format_args(args...));
+  }
+};
+
+// Reports a Windows error without throwing an exception.
+// Can be used to report errors from destructors.
+FMT_API void report_windows_error(int error_code,
+                                  string_view message) FMT_NOEXCEPT;
+#endif  // _WIN32
+
 // A buffered file.
 class buffered_file {
  private:
@@ -132,16 +215,15 @@ class buffered_file {
   explicit buffered_file(FILE* f) : file_(f) {}
 
  public:
+  buffered_file(const buffered_file&) = delete;
+  void operator=(const buffered_file&) = delete;
+
   // Constructs a buffered_file object which doesn't represent any file.
   buffered_file() FMT_NOEXCEPT : file_(nullptr) {}
 
   // Destroys the object closing the file it represents if any.
   FMT_API ~buffered_file() FMT_NOEXCEPT;
 
- private:
-  buffered_file(const buffered_file&) = delete;
-  void operator=(const buffered_file&) = delete;
-
  public:
   buffered_file(buffered_file&& other) FMT_NOEXCEPT : file_(other.file_) {
     other.file_ = nullptr;
@@ -177,6 +259,7 @@ class buffered_file {
   }
 };
 
+#if FMT_USE_FCNTL
 // A file. Closed file is represented by a file object with descriptor -1.
 // Methods that are not declared with FMT_NOEXCEPT may throw
 // fmt::system_error in case of failure. Note that some errors such as
@@ -195,7 +278,9 @@ class file {
   enum {
     RDONLY = FMT_POSIX(O_RDONLY),  // Open for reading only.
     WRONLY = FMT_POSIX(O_WRONLY),  // Open for writing only.
-    RDWR = FMT_POSIX(O_RDWR)       // Open for reading and writing.
+    RDWR = FMT_POSIX(O_RDWR),      // Open for reading and writing.
+    CREATE = FMT_POSIX(O_CREAT),   // Create if the file doesn't exist.
+    APPEND = FMT_POSIX(O_APPEND)   // Open in append mode.
   };
 
   // Constructs a file object which doesn't represent any file.
@@ -204,14 +289,13 @@ class file {
   // Opens a file and constructs a file object representing this file.
   FMT_API file(cstring_view path, int oflag);
 
- private:
+ public:
   file(const file&) = delete;
   void operator=(const file&) = delete;
 
- public:
   file(file&& other) FMT_NOEXCEPT : fd_(other.fd_) { other.fd_ = -1; }
 
-  file& operator=(file&& other) {
+  file& operator=(file&& other) FMT_NOEXCEPT {
     close();
     fd_ = other.fd_;
     other.fd_ = -1;
@@ -232,10 +316,10 @@ class file {
   FMT_API long long size() const;
 
   // Attempts to read count bytes from the file into the specified buffer.
-  FMT_API std::size_t read(void* buffer, std::size_t count);
+  FMT_API size_t read(void* buffer, size_t count);
 
   // Attempts to write count bytes from the specified buffer to the file.
-  FMT_API std::size_t write(const void* buffer, std::size_t count);
+  FMT_API size_t write(const void* buffer, size_t count);
 
   // Duplicates a file descriptor with the dup function and returns
   // the duplicate as a file object.
@@ -261,38 +345,122 @@ class file {
 // Returns the memory page size.
 long getpagesize();
 
+namespace detail {
+
+struct buffer_size {
+  size_t value = 0;
+  buffer_size operator=(size_t val) const {
+    auto bs = buffer_size();
+    bs.value = val;
+    return bs;
+  }
+};
+
+struct ostream_params {
+  int oflag = file::WRONLY | file::CREATE;
+  size_t buffer_size = BUFSIZ > 32768 ? BUFSIZ : 32768;
+
+  ostream_params() {}
+
+  template <typename... T>
+  ostream_params(T... params, int oflag) : ostream_params(params...) {
+    this->oflag = oflag;
+  }
+
+  template <typename... T>
+  ostream_params(T... params, detail::buffer_size bs)
+      : ostream_params(params...) {
+    this->buffer_size = bs.value;
+  }
+};
+}  // namespace detail
+
+static constexpr detail::buffer_size buffer_size;
+
+// A fast output stream which is not thread-safe.
+class ostream final : private detail::buffer<char> {
+ private:
+  file file_;
+
+  void flush() {
+    if (size() == 0) return;
+    file_.write(data(), size());
+    clear();
+  }
+
+  FMT_API void grow(size_t) override final;
+
+  ostream(cstring_view path, const detail::ostream_params& params)
+      : file_(path, params.oflag) {
+    set(new char[params.buffer_size], params.buffer_size);
+  }
+
+ public:
+  ostream(ostream&& other)
+      : detail::buffer<char>(other.data(), other.size(), other.capacity()),
+        file_(std::move(other.file_)) {
+    other.set(nullptr, 0);
+  }
+  ~ostream() {
+    flush();
+    delete[] data();
+  }
+
+  template <typename... T>
+  friend ostream output_file(cstring_view path, T... params);
+
+  void close() {
+    flush();
+    file_.close();
+  }
+
+  template <typename S, typename... Args>
+  void print(const S& format_str, const Args&... args) {
+    format_to(detail::buffer_appender<char>(*this), format_str, args...);
+  }
+};
+
+/**
+  Opens a file for writing. Supported parameters passed in `params`:
+  * ``<integer>``: Output flags (``file::WRONLY | file::CREATE`` by default)
+  * ``buffer_size=<integer>``: Output buffer size
+ */
+template <typename... T>
+inline ostream output_file(cstring_view path, T... params) {
+  return {path, detail::ostream_params(params...)};
+}
+#endif  // FMT_USE_FCNTL
+
 #ifdef FMT_LOCALE
 // A "C" numeric locale.
-class Locale {
+class locale {
  private:
 #  ifdef _WIN32
   using locale_t = _locale_t;
 
-  enum { LC_NUMERIC_MASK = LC_NUMERIC };
-
-  static locale_t newlocale(int category_mask, const char* locale, locale_t) {
-    return _create_locale(category_mask, locale);
-  }
-
-  static void freelocale(locale_t locale) { _free_locale(locale); }
+  static void freelocale(locale_t loc) { _free_locale(loc); }
 
-  static double strtod_l(const char* nptr, char** endptr, _locale_t locale) {
-    return _strtod_l(nptr, endptr, locale);
+  static double strtod_l(const char* nptr, char** endptr, _locale_t loc) {
+    return _strtod_l(nptr, endptr, loc);
   }
 #  endif
 
   locale_t locale_;
 
-  Locale(const Locale&) = delete;
-  void operator=(const Locale&) = delete;
-
  public:
   using type = locale_t;
+  locale(const locale&) = delete;
+  void operator=(const locale&) = delete;
 
-  Locale() : locale_(newlocale(LC_NUMERIC_MASK, "C", nullptr)) {
+  locale() {
+#  ifndef _WIN32
+    locale_ = FMT_SYSTEM(newlocale(LC_NUMERIC_MASK, "C", nullptr));
+#  else
+    locale_ = _create_locale(LC_NUMERIC, "C");
+#  endif
     if (!locale_) FMT_THROW(system_error(errno, "cannot create locale"));
   }
-  ~Locale() { freelocale(locale_); }
+  ~locale() { freelocale(locale_); }
 
   type get() const { return locale_; }
 
@@ -305,7 +473,8 @@ class Locale {
     return result;
   }
 };
+using Locale FMT_DEPRECATED_ALIAS = locale;
 #endif  // FMT_LOCALE
 FMT_END_NAMESPACE
 
-#endif  // FMT_POSIX_H_
+#endif  // FMT_OS_H_
diff --git a/include/vtkdiy2/fmt/ostream.h b/include/vtkdiy2/thirdparty/fmt/ostream.h
similarity index 55%
rename from include/vtkdiy2/fmt/ostream.h
rename to include/vtkdiy2/thirdparty/fmt/ostream.h
index 69bac0e24a1..29c58ec13b1 100644
--- a/include/vtkdiy2/fmt/ostream.h
+++ b/include/vtkdiy2/thirdparty/fmt/ostream.h
@@ -9,10 +9,15 @@
 #define FMT_OSTREAM_H_
 
 #include <ostream>
+
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
-namespace internal {
+
+template <typename Char> class basic_printf_parse_context;
+template <typename OutputIt, typename Char> class basic_printf_context;
+
+namespace detail {
 
 template <class Char> class formatbuf : public std::basic_streambuf<Char> {
  private:
@@ -44,21 +49,35 @@ template <class Char> class formatbuf : public std::basic_streambuf<Char> {
   }
 };
 
+struct converter {
+  template <typename T, FMT_ENABLE_IF(is_integral<T>::value)> converter(T);
+};
+
 template <typename Char> struct test_stream : std::basic_ostream<Char> {
  private:
-  struct null;
-  // Hide all operator<< from std::basic_ostream<Char>.
-  void operator<<(null);
+  void_t<> operator<<(converter);
 };
 
+// Hide insertion operators for built-in types.
+template <typename Char, typename Traits>
+void_t<> operator<<(std::basic_ostream<Char, Traits>&, Char);
+template <typename Char, typename Traits>
+void_t<> operator<<(std::basic_ostream<Char, Traits>&, char);
+template <typename Traits>
+void_t<> operator<<(std::basic_ostream<char, Traits>&, char);
+template <typename Traits>
+void_t<> operator<<(std::basic_ostream<char, Traits>&, signed char);
+template <typename Traits>
+void_t<> operator<<(std::basic_ostream<char, Traits>&, unsigned char);
+
 // Checks if T has a user-defined operator<< (e.g. not a member of
 // std::ostream).
 template <typename T, typename Char> class is_streamable {
  private:
   template <typename U>
-  static decltype((void)(std::declval<test_stream<Char>&>()
-                         << std::declval<U>()),
-                  std::true_type())
+  static bool_constant<!std::is_same<decltype(std::declval<test_stream<Char>&>()
+                                              << std::declval<U>()),
+                                     void_t<>>::value>
   test(int);
 
   template <typename> static std::false_type test(...);
@@ -71,12 +90,11 @@ template <typename T, typename Char> class is_streamable {
 
 // Write the content of buf to os.
 template <typename Char>
-void write(std::basic_ostream<Char>& os, buffer<Char>& buf) {
+void write_buffer(std::basic_ostream<Char>& os, buffer<Char>& buf) {
   const Char* buf_data = buf.data();
   using unsigned_streamsize = std::make_unsigned<std::streamsize>::type;
   unsigned_streamsize size = buf.size();
-  unsigned_streamsize max_size =
-      to_unsigned((std::numeric_limits<std::streamsize>::max)());
+  unsigned_streamsize max_size = to_unsigned(max_value<std::streamsize>());
   do {
     unsigned_streamsize n = size <= max_size ? size : max_size;
     os.write(buf_data, static_cast<std::streamsize>(n));
@@ -86,34 +104,57 @@ void write(std::basic_ostream<Char>& os, buffer<Char>& buf) {
 }
 
 template <typename Char, typename T>
-void format_value(buffer<Char>& buf, const T& value) {
+void format_value(buffer<Char>& buf, const T& value,
+                  locale_ref loc = locale_ref()) {
   formatbuf<Char> format_buf(buf);
   std::basic_ostream<Char> output(&format_buf);
-  output.exceptions(std::ios_base::failbit | std::ios_base::badbit);
+#if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
+  if (loc) output.imbue(loc.get<std::locale>());
+#endif
   output << value;
-  buf.resize(buf.size());
+  output.exceptions(std::ios_base::failbit | std::ios_base::badbit);
+  buf.try_resize(buf.size());
 }
 
 // Formats an object of type T that has an overloaded ostream operator<<.
 template <typename T, typename Char>
 struct fallback_formatter<T, Char, enable_if_t<is_streamable<T, Char>::value>>
-    : formatter<basic_string_view<Char>, Char> {
-  template <typename Context>
-  auto format(const T& value, Context& ctx) -> decltype(ctx.out()) {
+    : private formatter<basic_string_view<Char>, Char> {
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    return formatter<basic_string_view<Char>, Char>::parse(ctx);
+  }
+  template <typename ParseCtx,
+            FMT_ENABLE_IF(std::is_same<
+                          ParseCtx, basic_printf_parse_context<Char>>::value)>
+  auto parse(ParseCtx& ctx) -> decltype(ctx.begin()) {
+    return ctx.begin();
+  }
+
+  template <typename OutputIt>
+  auto format(const T& value, basic_format_context<OutputIt, Char>& ctx)
+      -> OutputIt {
     basic_memory_buffer<Char> buffer;
-    format_value(buffer, value);
+    format_value(buffer, value, ctx.locale());
     basic_string_view<Char> str(buffer.data(), buffer.size());
     return formatter<basic_string_view<Char>, Char>::format(str, ctx);
   }
+  template <typename OutputIt>
+  auto format(const T& value, basic_printf_context<OutputIt, Char>& ctx)
+      -> OutputIt {
+    basic_memory_buffer<Char> buffer;
+    format_value(buffer, value, ctx.locale());
+    return std::copy(buffer.begin(), buffer.end(), ctx.out());
+  }
 };
-}  // namespace internal
+}  // namespace detail
 
 template <typename Char>
 void vprint(std::basic_ostream<Char>& os, basic_string_view<Char> format_str,
-            basic_format_args<buffer_context<Char>> args) {
+            basic_format_args<buffer_context<type_identity_t<Char>>> args) {
   basic_memory_buffer<Char> buffer;
-  internal::vformat_to(buffer, format_str, args);
-  internal::write(os, buffer);
+  detail::vformat_to(buffer, format_str, args);
+  detail::write_buffer(os, buffer);
 }
 
 /**
@@ -126,10 +167,10 @@ void vprint(std::basic_ostream<Char>& os, basic_string_view<Char> format_str,
   \endrst
  */
 template <typename S, typename... Args,
-          typename Char = enable_if_t<internal::is_string<S>::value, char_t<S>>>
+          typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
 void print(std::basic_ostream<Char>& os, const S& format_str, Args&&... args) {
   vprint(os, to_string_view(format_str),
-         {internal::make_args_checked<Args...>(format_str, args...)});
+         fmt::make_args_checked<Args...>(format_str, args...));
 }
 FMT_END_NAMESPACE
 
diff --git a/include/vtkdiy2/fmt/posix.cc b/include/vtkdiy2/thirdparty/fmt/posix.cc
similarity index 99%
rename from include/vtkdiy2/fmt/posix.cc
rename to include/vtkdiy2/thirdparty/fmt/posix.cc
index 69c27819d2b..f565e8c26a5 100644
--- a/include/vtkdiy2/fmt/posix.cc
+++ b/include/vtkdiy2/thirdparty/fmt/posix.cc
@@ -10,7 +10,7 @@
 #  define _CRT_SECURE_NO_WARNINGS
 #endif
 
-#include "fmt/posix.h"
+#include "posix.h"
 
 #include <limits.h>
 #include <sys/stat.h>
diff --git a/include/vtkdiy2/thirdparty/fmt/posix.h b/include/vtkdiy2/thirdparty/fmt/posix.h
new file mode 100644
index 00000000000..da19e9d530c
--- /dev/null
+++ b/include/vtkdiy2/thirdparty/fmt/posix.h
@@ -0,0 +1,2 @@
+#include "os.h"
+#warning "fmt/posix.h is deprecated; use fmt/os.h instead"
diff --git a/include/vtkdiy2/fmt/printf.h b/include/vtkdiy2/thirdparty/fmt/printf.h
similarity index 71%
rename from include/vtkdiy2/fmt/printf.h
rename to include/vtkdiy2/thirdparty/fmt/printf.h
index c803aa952bf..8c28ac23272 100644
--- a/include/vtkdiy2/fmt/printf.h
+++ b/include/vtkdiy2/thirdparty/fmt/printf.h
@@ -1,4 +1,4 @@
-// Formatting library for C++
+// Formatting library for C++ - legacy printf implementation
 //
 // Copyright (c) 2012 - 2016, Victor Zverovich
 // All rights reserved.
@@ -8,23 +8,19 @@
 #ifndef FMT_PRINTF_H_
 #define FMT_PRINTF_H_
 
-#include <algorithm>  // std::fill_n
+#include <algorithm>  // std::max
 #include <limits>     // std::numeric_limits
 
 #include "ostream.h"
 
 FMT_BEGIN_NAMESPACE
-namespace internal {
-
-// A helper function to suppress bogus "conditional expression is constant"
-// warnings.
-template <typename T> inline T const_check(T value) { return value; }
+namespace detail {
 
 // Checks if a value fits in int - used to avoid warnings about comparing
 // signed and unsigned integers.
 template <bool IsSigned> struct int_checker {
   template <typename T> static bool fits_in_int(T value) {
-    unsigned max = std::numeric_limits<int>::max();
+    unsigned max = max_value<int>();
     return value <= max;
   }
   static bool fits_in_int(bool) { return true; }
@@ -32,8 +28,8 @@ template <bool IsSigned> struct int_checker {
 
 template <> struct int_checker<true> {
   template <typename T> static bool fits_in_int(T value) {
-    return value >= std::numeric_limits<int>::min() &&
-           value <= std::numeric_limits<int>::max();
+    return value >= (std::numeric_limits<int>::min)() &&
+           value <= max_value<int>();
   }
   static bool fits_in_int(int) { return true; }
 };
@@ -94,11 +90,11 @@ template <typename T, typename Context> class arg_converter {
     if (const_check(sizeof(target_type) <= sizeof(int))) {
       // Extra casts are used to silence warnings.
       if (is_signed) {
-        arg_ = internal::make_arg<Context>(
+        arg_ = detail::make_arg<Context>(
             static_cast<int>(static_cast<target_type>(value)));
       } else {
         using unsigned_type = typename make_unsigned_or_bool<target_type>::type;
-        arg_ = internal::make_arg<Context>(
+        arg_ = detail::make_arg<Context>(
             static_cast<unsigned>(static_cast<unsigned_type>(value)));
       }
     } else {
@@ -106,9 +102,9 @@ template <typename T, typename Context> class arg_converter {
         // glibc's printf doesn't sign extend arguments of smaller types:
         //   std::printf("%lld", -42);  // prints "4294967254"
         // but we don't have to do the same because it's a UB.
-        arg_ = internal::make_arg<Context>(static_cast<long long>(value));
+        arg_ = detail::make_arg<Context>(static_cast<long long>(value));
       } else {
-        arg_ = internal::make_arg<Context>(
+        arg_ = detail::make_arg<Context>(
             static_cast<typename make_unsigned_or_bool<U>::type>(value));
       }
     }
@@ -137,7 +133,7 @@ template <typename Context> class char_converter {
 
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
   void operator()(T value) {
-    arg_ = internal::make_arg<Context>(
+    arg_ = detail::make_arg<Context>(
         static_cast<typename Context::char_type>(value));
   }
 
@@ -145,6 +141,13 @@ template <typename Context> class char_converter {
   void operator()(T) {}  // No conversion needed for non-integral types.
 };
 
+// An argument visitor that return a pointer to a C string if argument is a
+// string or null otherwise.
+template <typename Char> struct get_cstring {
+  template <typename T> const Char* operator()(T) { return nullptr; }
+  const Char* operator()(const Char* s) { return s; }
+};
+
 // Checks if an argument is a valid printf width specifier and sets
 // left alignment if it is negative.
 template <typename Char> class printf_width_handler {
@@ -158,12 +161,12 @@ template <typename Char> class printf_width_handler {
 
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
   unsigned operator()(T value) {
-    auto width = static_cast<uint32_or_64_t<T>>(value);
-    if (internal::is_negative(value)) {
+    auto width = static_cast<uint32_or_64_or_128_t<T>>(value);
+    if (detail::is_negative(value)) {
       specs_.align = align::left;
       width = 0 - width;
     }
-    unsigned int_max = std::numeric_limits<int>::max();
+    unsigned int_max = max_value<int>();
     if (width > int_max) FMT_THROW(format_error("number is too big"));
     return static_cast<unsigned>(width);
   }
@@ -176,23 +179,25 @@ template <typename Char> class printf_width_handler {
 };
 
 template <typename Char, typename Context>
-void printf(buffer<Char>& buf, basic_string_view<Char> format,
-            basic_format_args<Context> args) {
-  Context(std::back_inserter(buf), format, args).format();
+void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
+             basic_format_args<Context> args) {
+  Context(buffer_appender<Char>(buf), format, args).format();
 }
+}  // namespace detail
 
-template <typename OutputIt, typename Char, typename Context>
-internal::truncating_iterator<OutputIt> printf(
-    internal::truncating_iterator<OutputIt> it, basic_string_view<Char> format,
-    basic_format_args<Context> args) {
-  return Context(it, format, args).format();
+// For printing into memory_buffer.
+template <typename Char, typename Context>
+FMT_DEPRECATED void printf(detail::buffer<Char>& buf,
+                           basic_string_view<Char> format,
+                           basic_format_args<Context> args) {
+  return detail::vprintf(buf, format, args);
 }
-}  // namespace internal
-
-using internal::printf;  // For printing into memory_buffer.
-
-template <typename Range> class printf_arg_formatter;
+using detail::vprintf;
 
+template <typename Char>
+class basic_printf_parse_context : public basic_format_parse_context<Char> {
+  using basic_format_parse_context<Char>::basic_format_parse_context;
+};
 template <typename OutputIt, typename Char> class basic_printf_context;
 
 /**
@@ -200,15 +205,15 @@ template <typename OutputIt, typename Char> class basic_printf_context;
   The ``printf`` argument formatter.
   \endrst
  */
-template <typename Range>
-class printf_arg_formatter : public internal::arg_formatter_base<Range> {
+template <typename OutputIt, typename Char>
+class printf_arg_formatter : public detail::arg_formatter_base<OutputIt, Char> {
  public:
-  using iterator = typename Range::iterator;
+  using iterator = OutputIt;
 
  private:
-  using char_type = typename Range::value_type;
-  using base = internal::arg_formatter_base<Range>;
-  using context_type = basic_printf_context<iterator, char_type>;
+  using char_type = Char;
+  using base = detail::arg_formatter_base<OutputIt, Char>;
+  using context_type = basic_printf_context<OutputIt, Char>;
 
   context_type& context_;
 
@@ -233,9 +238,9 @@ class printf_arg_formatter : public internal::arg_formatter_base<Range> {
     \endrst
    */
   printf_arg_formatter(iterator iter, format_specs& specs, context_type& ctx)
-      : base(Range(iter), &specs, internal::locale_ref()), context_(ctx) {}
+      : base(iter, &specs, detail::locale_ref()), context_(ctx) {}
 
-  template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
+  template <typename T, FMT_ENABLE_IF(fmt::detail::is_integral<T>::value)>
   iterator operator()(T value) {
     // MSVC2013 fails to compile separate overloads for bool and char_type so
     // use std::is_same instead.
@@ -250,7 +255,11 @@ class printf_arg_formatter : public internal::arg_formatter_base<Range> {
         return (*this)(static_cast<int>(value));
       fmt_specs.sign = sign::none;
       fmt_specs.alt = false;
-      fmt_specs.align = align::right;
+      fmt_specs.fill[0] = ' ';  // Ignore '0' flag for char types.
+      // align::numeric needs to be overwritten here since the '0' flag is
+      // ignored for non-numeric types
+      if (fmt_specs.align == align::none || fmt_specs.align == align::numeric)
+        fmt_specs.align = align::right;
       return base::operator()(value);
     } else {
       return base::operator()(value);
@@ -307,6 +316,8 @@ class printf_arg_formatter : public internal::arg_formatter_base<Range> {
 };
 
 template <typename T> struct printf_formatter {
+  printf_formatter() = delete;
+
   template <typename ParseContext>
   auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     return ctx.begin();
@@ -314,17 +325,21 @@ template <typename T> struct printf_formatter {
 
   template <typename FormatContext>
   auto format(const T& value, FormatContext& ctx) -> decltype(ctx.out()) {
-    internal::format_value(internal::get_container(ctx.out()), value);
+    detail::format_value(detail::get_container(ctx.out()), value);
     return ctx.out();
   }
 };
 
-/** This template formats data and writes the output to a writer. */
+/**
+ This template formats data and writes the output through an output iterator.
+ */
 template <typename OutputIt, typename Char> class basic_printf_context {
  public:
   /** The character type for the output. */
   using char_type = Char;
+  using iterator = OutputIt;
   using format_arg = basic_format_arg<basic_printf_context>;
+  using parse_context_type = basic_printf_parse_context<Char>;
   template <typename T> using formatter_type = printf_formatter<T>;
 
  private:
@@ -332,24 +347,23 @@ template <typename OutputIt, typename Char> class basic_printf_context {
 
   OutputIt out_;
   basic_format_args<basic_printf_context> args_;
-  basic_parse_context<Char> parse_ctx_;
+  parse_context_type parse_ctx_;
 
   static void parse_flags(format_specs& specs, const Char*& it,
                           const Char* end);
 
-  // Returns the argument with specified index or, if arg_index is equal
-  // to the maximum unsigned value, the next argument.
-  format_arg get_arg(unsigned arg_index = std::numeric_limits<unsigned>::max());
+  // Returns the argument with specified index or, if arg_index is -1, the next
+  // argument.
+  format_arg get_arg(int arg_index = -1);
 
   // Parses argument index, flags and width and returns the argument index.
-  unsigned parse_header(const Char*& it, const Char* end, format_specs& specs);
+  int parse_header(const Char*& it, const Char* end, format_specs& specs);
 
  public:
   /**
    \rst
-   Constructs a ``printf_context`` object. References to the arguments and
-   the writer are stored in the context object so make sure they have
-   appropriate lifetimes.
+   Constructs a ``printf_context`` object. References to the arguments are
+   stored in the context object so make sure they have appropriate lifetimes.
    \endrst
    */
   basic_printf_context(OutputIt out, basic_string_view<char_type> format_str,
@@ -359,17 +373,18 @@ template <typename OutputIt, typename Char> class basic_printf_context {
   OutputIt out() { return out_; }
   void advance_to(OutputIt it) { out_ = it; }
 
-  format_arg arg(unsigned id) const { return args_.get(id); }
+  detail::locale_ref locale() { return {}; }
+
+  format_arg arg(int id) const { return args_.get(id); }
 
-  basic_parse_context<Char>& parse_context() { return parse_ctx_; }
+  parse_context_type& parse_context() { return parse_ctx_; }
 
   FMT_CONSTEXPR void on_error(const char* message) {
     parse_ctx_.on_error(message);
   }
 
   /** Formats stored arguments and writes the output to the range. */
-  template <typename ArgFormatter =
-                printf_arg_formatter<internal::buffer_range<Char>>>
+  template <typename ArgFormatter = printf_arg_formatter<OutputIt, Char>>
   OutputIt format();
 };
 
@@ -389,7 +404,9 @@ void basic_printf_context<OutputIt, Char>::parse_flags(format_specs& specs,
       specs.fill[0] = '0';
       break;
     case ' ':
-      specs.sign = sign::space;
+      if (specs.sign != sign::plus) {
+        specs.sign = sign::space;
+      }
       break;
     case '#':
       specs.alt = true;
@@ -402,24 +419,25 @@ void basic_printf_context<OutputIt, Char>::parse_flags(format_specs& specs,
 
 template <typename OutputIt, typename Char>
 typename basic_printf_context<OutputIt, Char>::format_arg
-basic_printf_context<OutputIt, Char>::get_arg(unsigned arg_index) {
-  if (arg_index == std::numeric_limits<unsigned>::max())
+basic_printf_context<OutputIt, Char>::get_arg(int arg_index) {
+  if (arg_index < 0)
     arg_index = parse_ctx_.next_arg_id();
   else
     parse_ctx_.check_arg_id(--arg_index);
-  return internal::get_arg(*this, arg_index);
+  return detail::get_arg(*this, arg_index);
 }
 
 template <typename OutputIt, typename Char>
-unsigned basic_printf_context<OutputIt, Char>::parse_header(
-    const Char*& it, const Char* end, format_specs& specs) {
-  unsigned arg_index = std::numeric_limits<unsigned>::max();
+int basic_printf_context<OutputIt, Char>::parse_header(const Char*& it,
+                                                       const Char* end,
+                                                       format_specs& specs) {
+  int arg_index = -1;
   char_type c = *it;
   if (c >= '0' && c <= '9') {
     // Parse an argument index (if followed by '$') or a width possibly
     // preceded with '0' flag(s).
-    internal::error_handler eh;
-    unsigned value = parse_nonnegative_int(it, end, eh);
+    detail::error_handler eh;
+    int value = parse_nonnegative_int(it, end, eh);
     if (it != end && *it == '$') {  // value is an argument index
       ++it;
       arg_index = value;
@@ -437,12 +455,12 @@ unsigned basic_printf_context<OutputIt, Char>::parse_header(
   // Parse width.
   if (it != end) {
     if (*it >= '0' && *it <= '9') {
-      internal::error_handler eh;
+      detail::error_handler eh;
       specs.width = parse_nonnegative_int(it, end, eh);
     } else if (*it == '*') {
       ++it;
-      specs.width = visit_format_arg(
-          internal::printf_width_handler<char_type>(specs), get_arg());
+      specs.width = static_cast<int>(visit_format_arg(
+          detail::printf_width_handler<char_type>(specs), get_arg()));
     }
   }
   return arg_index;
@@ -469,38 +487,53 @@ OutputIt basic_printf_context<OutputIt, Char>::format() {
     specs.align = align::right;
 
     // Parse argument index, flags and width.
-    unsigned arg_index = parse_header(it, end, specs);
+    int arg_index = parse_header(it, end, specs);
+    if (arg_index == 0) on_error("argument not found");
 
     // Parse precision.
     if (it != end && *it == '.') {
       ++it;
       c = it != end ? *it : 0;
       if ('0' <= c && c <= '9') {
-        internal::error_handler eh;
-        specs.precision = static_cast<int>(parse_nonnegative_int(it, end, eh));
+        detail::error_handler eh;
+        specs.precision = parse_nonnegative_int(it, end, eh);
       } else if (c == '*') {
         ++it;
-        specs.precision =
-            visit_format_arg(internal::printf_precision_handler(), get_arg());
+        specs.precision = static_cast<int>(
+            visit_format_arg(detail::printf_precision_handler(), get_arg()));
       } else {
         specs.precision = 0;
       }
     }
 
     format_arg arg = get_arg(arg_index);
-    if (specs.alt && visit_format_arg(internal::is_zero_int(), arg))
+    // For d, i, o, u, x, and X conversion specifiers, if a precision is
+    // specified, the '0' flag is ignored
+    if (specs.precision >= 0 && arg.is_integral())
+      specs.fill[0] =
+          ' ';  // Ignore '0' flag for non-numeric types or if '-' present.
+    if (specs.precision >= 0 && arg.type() == detail::type::cstring_type) {
+      auto str = visit_format_arg(detail::get_cstring<Char>(), arg);
+      auto str_end = str + specs.precision;
+      auto nul = std::find(str, str_end, Char());
+      arg = detail::make_arg<basic_printf_context>(basic_string_view<Char>(
+          str,
+          detail::to_unsigned(nul != str_end ? nul - str : specs.precision)));
+    }
+    if (specs.alt && visit_format_arg(detail::is_zero_int(), arg))
       specs.alt = false;
     if (specs.fill[0] == '0') {
-      if (arg.is_arithmetic())
+      if (arg.is_arithmetic() && specs.align != align::left)
         specs.align = align::numeric;
       else
-        specs.fill[0] = ' ';  // Ignore '0' flag for non-numeric types.
+        specs.fill[0] = ' ';  // Ignore '0' flag for non-numeric types or if '-'
+                              // flag is also present.
     }
 
     // Parse length and convert the argument to the required type.
     c = it != end ? *it++ : 0;
     char_type t = it != end ? *it : 0;
-    using internal::convert_arg;
+    using detail::convert_arg;
     switch (c) {
     case 'h':
       if (t == 'h') {
@@ -524,7 +557,7 @@ OutputIt basic_printf_context<OutputIt, Char>::format() {
       convert_arg<intmax_t>(arg, t);
       break;
     case 'z':
-      convert_arg<std::size_t>(arg, t);
+      convert_arg<size_t>(arg, t);
       break;
     case 't':
       convert_arg<std::ptrdiff_t>(arg, t);
@@ -549,7 +582,7 @@ OutputIt basic_printf_context<OutputIt, Char>::format() {
         specs.type = 'd';
         break;
       case 'c':
-        visit_format_arg(internal::char_converter<basic_printf_context>(arg),
+        visit_format_arg(detail::char_converter<basic_printf_context>(arg),
                          arg);
         break;
       }
@@ -558,15 +591,14 @@ OutputIt basic_printf_context<OutputIt, Char>::format() {
     start = it;
 
     // Format argument.
-    visit_format_arg(ArgFormatter(out, specs, *this), arg);
+    out = visit_format_arg(ArgFormatter(out, specs, *this), arg);
   }
   return std::copy(start, it, out);
 }
 
 template <typename Char>
 using basic_printf_context_t =
-    basic_printf_context<std::back_insert_iterator<internal::buffer<Char>>,
-                         Char>;
+    basic_printf_context<detail::buffer_appender<Char>, Char>;
 
 using printf_context = basic_printf_context_t<char>;
 using wprintf_context = basic_printf_context_t<wchar_t>;
@@ -600,9 +632,10 @@ inline format_arg_store<wprintf_context, Args...> make_wprintf_args(
 
 template <typename S, typename Char = char_t<S>>
 inline std::basic_string<Char> vsprintf(
-    const S& format, basic_format_args<basic_printf_context_t<Char>> args) {
+    const S& format,
+    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args) {
   basic_memory_buffer<Char> buffer;
-  printf(buffer, to_string_view(format), args);
+  vprintf(buffer, to_string_view(format), args);
   return to_string(buffer);
 }
 
@@ -616,18 +649,19 @@ inline std::basic_string<Char> vsprintf(
   \endrst
 */
 template <typename S, typename... Args,
-          typename Char = enable_if_t<internal::is_string<S>::value, char_t<S>>>
+          typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
 inline std::basic_string<Char> sprintf(const S& format, const Args&... args) {
   using context = basic_printf_context_t<Char>;
-  return vsprintf(to_string_view(format), {make_format_args<context>(args...)});
+  return vsprintf(to_string_view(format), make_format_args<context>(args...));
 }
 
 template <typename S, typename Char = char_t<S>>
-inline int vfprintf(std::FILE* f, const S& format,
-                    basic_format_args<basic_printf_context_t<Char>> args) {
+inline int vfprintf(
+    std::FILE* f, const S& format,
+    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args) {
   basic_memory_buffer<Char> buffer;
-  printf(buffer, to_string_view(format), args);
-  std::size_t size = buffer.size();
+  vprintf(buffer, to_string_view(format), args);
+  size_t size = buffer.size();
   return std::fwrite(buffer.data(), sizeof(Char), size, f) < size
              ? -1
              : static_cast<int>(size);
@@ -643,16 +677,17 @@ inline int vfprintf(std::FILE* f, const S& format,
   \endrst
  */
 template <typename S, typename... Args,
-          typename Char = enable_if_t<internal::is_string<S>::value, char_t<S>>>
+          typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
 inline int fprintf(std::FILE* f, const S& format, const Args&... args) {
   using context = basic_printf_context_t<Char>;
   return vfprintf(f, to_string_view(format),
-                  {make_format_args<context>(args...)});
+                  make_format_args<context>(args...));
 }
 
 template <typename S, typename Char = char_t<S>>
-inline int vprintf(const S& format,
-                   basic_format_args<basic_printf_context_t<Char>> args) {
+inline int vprintf(
+    const S& format,
+    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args) {
   return vfprintf(stdout, to_string_view(format), args);
 }
 
@@ -666,19 +701,20 @@ inline int vprintf(const S& format,
   \endrst
  */
 template <typename S, typename... Args,
-          FMT_ENABLE_IF(internal::is_string<S>::value)>
+          FMT_ENABLE_IF(detail::is_string<S>::value)>
 inline int printf(const S& format_str, const Args&... args) {
   using context = basic_printf_context_t<char_t<S>>;
   return vprintf(to_string_view(format_str),
-                 {make_format_args<context>(args...)});
+                 make_format_args<context>(args...));
 }
 
 template <typename S, typename Char = char_t<S>>
-inline int vfprintf(std::basic_ostream<Char>& os, const S& format,
-                    basic_format_args<basic_printf_context_t<Char>> args) {
+inline int vfprintf(
+    std::basic_ostream<Char>& os, const S& format,
+    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args) {
   basic_memory_buffer<Char> buffer;
-  printf(buffer, to_string_view(format), args);
-  internal::write(os, buffer);
+  vprintf(buffer, to_string_view(format), args);
+  detail::write_buffer(os, buffer);
   return static_cast<int>(buffer.size());
 }
 
@@ -686,9 +722,9 @@ inline int vfprintf(std::basic_ostream<Char>& os, const S& format,
 template <typename ArgFormatter, typename Char,
           typename Context =
               basic_printf_context<typename ArgFormatter::iterator, Char>>
-typename ArgFormatter::iterator vprintf(internal::buffer<Char>& out,
-                                        basic_string_view<Char> format_str,
-                                        basic_format_args<Context> args) {
+typename ArgFormatter::iterator vprintf(
+    detail::buffer<Char>& out, basic_string_view<Char> format_str,
+    basic_format_args<type_identity_t<Context>> args) {
   typename ArgFormatter::iterator iter(out);
   Context(iter, format_str, args).template format<ArgFormatter>();
   return iter;
@@ -708,7 +744,7 @@ inline int fprintf(std::basic_ostream<Char>& os, const S& format_str,
                    const Args&... args) {
   using context = basic_printf_context_t<Char>;
   return vfprintf(os, to_string_view(format_str),
-                  {make_format_args<context>(args...)});
+                  make_format_args<context>(args...));
 }
 FMT_END_NAMESPACE
 
diff --git a/include/vtkdiy2/fmt/ranges.h b/include/vtkdiy2/thirdparty/fmt/ranges.h
similarity index 58%
rename from include/vtkdiy2/fmt/ranges.h
rename to include/vtkdiy2/thirdparty/fmt/ranges.h
index cf0d41aaa5e..632f04949c8 100644
--- a/include/vtkdiy2/fmt/ranges.h
+++ b/include/vtkdiy2/thirdparty/fmt/ranges.h
@@ -12,7 +12,9 @@
 #ifndef FMT_RANGES_H_
 #define FMT_RANGES_H_
 
+#include <initializer_list>
 #include <type_traits>
+
 #include "format.h"
 
 // output only up to N items from the range.
@@ -31,7 +33,7 @@ template <typename Char> struct formatting_base {
 
 template <typename Char, typename Enable = void>
 struct formatting_range : formatting_base<Char> {
-  static FMT_CONSTEXPR_DECL const std::size_t range_length_limit =
+  static FMT_CONSTEXPR_DECL const size_t range_length_limit =
       FMT_RANGE_OUTPUT_LENGTH_LIMIT;  // output only up to N items from the
                                       // range.
   Char prefix;
@@ -52,7 +54,7 @@ struct formatting_tuple : formatting_base<Char> {
   static FMT_CONSTEXPR_DECL const bool add_prepostfix_space = false;
 };
 
-namespace internal {
+namespace detail {
 
 template <typename RangeT, typename OutputIterator>
 OutputIterator copy(const RangeT& range, OutputIterator out) {
@@ -104,10 +106,7 @@ struct is_range_<
 /// tuple_size and tuple_element check.
 template <typename T> class is_tuple_like_ {
   template <typename U>
-  static auto check(U* p)
-      -> decltype(std::tuple_size<U>::value,
-                  (void)std::declval<typename std::tuple_element<0, U>::type>(),
-                  int());
+  static auto check(U* p) -> decltype(std::tuple_size<U>::value, int());
   template <typename> static void check(...);
 
  public:
@@ -119,26 +118,24 @@ template <typename T> class is_tuple_like_ {
 #if defined(__cpp_lib_integer_sequence) || FMT_MSC_VER >= 1900
 template <typename T, T... N>
 using integer_sequence = std::integer_sequence<T, N...>;
-template <std::size_t... N> using index_sequence = std::index_sequence<N...>;
-template <std::size_t N>
-using make_index_sequence = std::make_index_sequence<N>;
+template <size_t... N> using index_sequence = std::index_sequence<N...>;
+template <size_t N> using make_index_sequence = std::make_index_sequence<N>;
 #else
 template <typename T, T... N> struct integer_sequence {
   using value_type = T;
 
-  static FMT_CONSTEXPR std::size_t size() { return sizeof...(N); }
+  static FMT_CONSTEXPR size_t size() { return sizeof...(N); }
 };
 
-template <std::size_t... N>
-using index_sequence = integer_sequence<std::size_t, N...>;
+template <size_t... N> using index_sequence = integer_sequence<size_t, N...>;
 
-template <typename T, std::size_t N, T... Ns>
+template <typename T, size_t N, T... Ns>
 struct make_integer_sequence : make_integer_sequence<T, N - 1, N - 1, Ns...> {};
 template <typename T, T... Ns>
 struct make_integer_sequence<T, 0, Ns...> : integer_sequence<T, Ns...> {};
 
-template <std::size_t N>
-using make_index_sequence = make_integer_sequence<std::size_t, N>;
+template <size_t N>
+using make_index_sequence = make_integer_sequence<size_t, N>;
 #endif
 
 template <class Tuple, class F, size_t... Is>
@@ -160,6 +157,9 @@ template <class Tuple, class F> void for_each(Tuple&& tup, F&& f) {
   for_each(indexes, std::forward<Tuple>(tup), std::forward<F>(f));
 }
 
+template <typename Range>
+using value_type = remove_cvref_t<decltype(*std::declval<Range>().begin())>;
+
 template <typename Arg, FMT_ENABLE_IF(!is_like_std_string<
                                       typename std::decay<Arg>::type>::value)>
 FMT_CONSTEXPR const char* format_str_quoted(bool add_space, const Arg&) {
@@ -185,12 +185,11 @@ FMT_CONSTEXPR const char* format_str_quoted(bool add_space, const char) {
 FMT_CONSTEXPR const wchar_t* format_str_quoted(bool add_space, const wchar_t) {
   return add_space ? L" '{}'" : L"'{}'";
 }
-
-}  // namespace internal
+}  // namespace detail
 
 template <typename T> struct is_tuple_like {
   static FMT_CONSTEXPR_DECL const bool value =
-      internal::is_tuple_like_<T>::value && !internal::is_range_<T>::value;
+      detail::is_tuple_like_<T>::value && !detail::is_range_<T>::value;
 };
 
 template <typename TupleT, typename Char>
@@ -203,17 +202,17 @@ struct formatter<TupleT, Char, enable_if_t<fmt::is_tuple_like<TupleT>::value>> {
         if (formatting.add_prepostfix_space) {
           *out++ = ' ';
         }
-        out = internal::copy(formatting.delimiter, out);
+        out = detail::copy(formatting.delimiter, out);
       }
       out = format_to(out,
-                      internal::format_str_quoted(
+                      detail::format_str_quoted(
                           (formatting.add_delimiter_spaces && i > 0), v),
                       v);
       ++i;
     }
 
     formatting_tuple<Char>& formatting;
-    std::size_t& i;
+    size_t& i;
     typename std::add_lvalue_reference<decltype(
         std::declval<FormatContext>().out())>::type out;
   };
@@ -229,14 +228,14 @@ struct formatter<TupleT, Char, enable_if_t<fmt::is_tuple_like<TupleT>::value>> {
   template <typename FormatContext = format_context>
   auto format(const TupleT& values, FormatContext& ctx) -> decltype(ctx.out()) {
     auto out = ctx.out();
-    std::size_t i = 0;
-    internal::copy(formatting.prefix, out);
+    size_t i = 0;
+    detail::copy(formatting.prefix, out);
 
-    internal::for_each(values, format_each<FormatContext>{formatting, i, out});
+    detail::for_each(values, format_each<FormatContext>{formatting, i, out});
     if (formatting.add_prepostfix_space) {
       *out++ = ' ';
     }
-    internal::copy(formatting.postfix, out);
+    detail::copy(formatting.postfix, out);
 
     return ctx.out();
   }
@@ -244,14 +243,23 @@ struct formatter<TupleT, Char, enable_if_t<fmt::is_tuple_like<TupleT>::value>> {
 
 template <typename T, typename Char> struct is_range {
   static FMT_CONSTEXPR_DECL const bool value =
-      internal::is_range_<T>::value &&
-      !internal::is_like_std_string<T>::value &&
-      !std::is_convertible<T, std::basic_string<Char>>::value;
+      detail::is_range_<T>::value && !detail::is_like_std_string<T>::value &&
+      !std::is_convertible<T, std::basic_string<Char>>::value &&
+      !std::is_constructible<detail::std_string_view<Char>, T>::value;
 };
 
-template <typename RangeT, typename Char>
-struct formatter<RangeT, Char,
-                 enable_if_t<fmt::is_range<RangeT, Char>::value>> {
+template <typename T, typename Char>
+struct formatter<
+    T, Char,
+    enable_if_t<fmt::is_range<T, Char>::value
+// Workaround a bug in MSVC 2017 and earlier.
+#if !FMT_MSC_VER || FMT_MSC_VER >= 1927
+                &&
+                (has_formatter<detail::value_type<T>, format_context>::value ||
+                 detail::has_fallback_formatter<detail::value_type<T>,
+                                                format_context>::value)
+#endif
+                >> {
   formatting_range<Char> formatting;
 
   template <typename ParseContext>
@@ -260,17 +268,18 @@ struct formatter<RangeT, Char,
   }
 
   template <typename FormatContext>
-  typename FormatContext::iterator format(const RangeT& values,
-                                          FormatContext& ctx) {
-    auto out = internal::copy(formatting.prefix, ctx.out());
-    std::size_t i = 0;
-    for (auto it = values.begin(), end = values.end(); it != end; ++it) {
+  typename FormatContext::iterator format(const T& values, FormatContext& ctx) {
+    auto out = detail::copy(formatting.prefix, ctx.out());
+    size_t i = 0;
+    auto it = values.begin();
+    auto end = values.end();
+    for (; it != end; ++it) {
       if (i > 0) {
         if (formatting.add_prepostfix_space) *out++ = ' ';
-        out = internal::copy(formatting.delimiter, out);
+        out = detail::copy(formatting.delimiter, out);
       }
       out = format_to(out,
-                      internal::format_str_quoted(
+                      detail::format_str_quoted(
                           (formatting.add_delimiter_spaces && i > 0), *it),
                       *it);
       if (++i > formatting.range_length_limit) {
@@ -279,10 +288,109 @@ struct formatter<RangeT, Char,
       }
     }
     if (formatting.add_prepostfix_space) *out++ = ' ';
-    return internal::copy(formatting.postfix, out);
+    return detail::copy(formatting.postfix, out);
+  }
+};
+
+template <typename Char, typename... T> struct tuple_arg_join : detail::view {
+  const std::tuple<T...>& tuple;
+  basic_string_view<Char> sep;
+
+  tuple_arg_join(const std::tuple<T...>& t, basic_string_view<Char> s)
+      : tuple{t}, sep{s} {}
+};
+
+template <typename Char, typename... T>
+struct formatter<tuple_arg_join<Char, T...>, Char> {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return ctx.begin();
+  }
+
+  template <typename FormatContext>
+  typename FormatContext::iterator format(
+      const tuple_arg_join<Char, T...>& value, FormatContext& ctx) {
+    return format(value, ctx, detail::make_index_sequence<sizeof...(T)>{});
+  }
+
+ private:
+  template <typename FormatContext, size_t... N>
+  typename FormatContext::iterator format(
+      const tuple_arg_join<Char, T...>& value, FormatContext& ctx,
+      detail::index_sequence<N...>) {
+    return format_args(value, ctx, std::get<N>(value.tuple)...);
+  }
+
+  template <typename FormatContext>
+  typename FormatContext::iterator format_args(
+      const tuple_arg_join<Char, T...>&, FormatContext& ctx) {
+    // NOTE: for compilers that support C++17, this empty function instantiation
+    // can be replaced with a constexpr branch in the variadic overload.
+    return ctx.out();
+  }
+
+  template <typename FormatContext, typename Arg, typename... Args>
+  typename FormatContext::iterator format_args(
+      const tuple_arg_join<Char, T...>& value, FormatContext& ctx,
+      const Arg& arg, const Args&... args) {
+    using base = formatter<typename std::decay<Arg>::type, Char>;
+    auto out = ctx.out();
+    out = base{}.format(arg, ctx);
+    if (sizeof...(Args) > 0) {
+      out = std::copy(value.sep.begin(), value.sep.end(), out);
+      ctx.advance_to(out);
+      return format_args(value, ctx, args...);
+    }
+    return out;
   }
 };
 
+/**
+  \rst
+  Returns an object that formats `tuple` with elements separated by `sep`.
+
+  **Example**::
+
+    std::tuple<int, char> t = {1, 'a'};
+    fmt::print("{}", fmt::join(t, ", "));
+    // Output: "1, a"
+  \endrst
+ */
+template <typename... T>
+FMT_CONSTEXPR tuple_arg_join<char, T...> join(const std::tuple<T...>& tuple,
+                                              string_view sep) {
+  return {tuple, sep};
+}
+
+template <typename... T>
+FMT_CONSTEXPR tuple_arg_join<wchar_t, T...> join(const std::tuple<T...>& tuple,
+                                                 wstring_view sep) {
+  return {tuple, sep};
+}
+
+/**
+  \rst
+  Returns an object that formats `initializer_list` with elements separated by
+  `sep`.
+
+  **Example**::
+
+    fmt::print("{}", fmt::join({1, 2, 3}, ", "));
+    // Output: "1, 2, 3"
+  \endrst
+ */
+template <typename T>
+arg_join<const T*, const T*, char> join(std::initializer_list<T> list,
+                                        string_view sep) {
+  return join(std::begin(list), std::end(list), sep);
+}
+
+template <typename T>
+arg_join<const T*, const T*, wchar_t> join(std::initializer_list<T> list,
+                                           wstring_view sep) {
+  return join(std::begin(list), std::end(list), sep);
+}
+
 FMT_END_NAMESPACE
 
 #endif  // FMT_RANGES_H_
diff --git a/include/vtkdiy2/fmt/safe-duration-cast.h b/include/vtkdiy2/thirdparty/fmt/safe-duration-cast.h
similarity index 100%
rename from include/vtkdiy2/fmt/safe-duration-cast.h
rename to include/vtkdiy2/thirdparty/fmt/safe-duration-cast.h
diff --git a/include/vtkdiy2/itlib/small_vector.hpp b/include/vtkdiy2/thirdparty/itlib/small_vector.hpp
similarity index 100%
rename from include/vtkdiy2/itlib/small_vector.hpp
rename to include/vtkdiy2/thirdparty/itlib/small_vector.hpp
diff --git a/include/vtkdiy2/thread/fast_mutex.h b/include/vtkdiy2/thirdparty/thread/fast_mutex.h
similarity index 100%
rename from include/vtkdiy2/thread/fast_mutex.h
rename to include/vtkdiy2/thirdparty/thread/fast_mutex.h
diff --git a/include/vtkdiy2/thread.hpp b/include/vtkdiy2/thread.hpp
index 1c9149a42e3..f48ee7e1acd 100644
--- a/include/vtkdiy2/thread.hpp
+++ b/include/vtkdiy2/thread.hpp
@@ -1,11 +1,15 @@
 #ifndef DIY_THREAD_H
 #define DIY_THREAD_H
 
+#include <map>
+
 #ifdef DIY_NO_THREADS
 #include "no-thread.hpp"
 #else
 
-#include "thread/fast_mutex.h"
+#if !defined(_MSC_VER)
+#include "thirdparty/thread/fast_mutex.h"
+#endif
 
 #include <thread>
 #include <mutex>
@@ -17,15 +21,74 @@ namespace diy
     using std::recursive_mutex;
     namespace this_thread = std::this_thread;
 
+#if defined(_MSC_VER)
+    // fast_mutex implementation has issues on MSVC. Just use std::mutex
+    using fast_mutex = std::mutex;
+#else
     // TODO: replace with our own implementation using std::atomic_flag
     using fast_mutex = tthread::fast_mutex;
+#endif
 
     template<class Mutex>
     using lock_guard = std::unique_lock<Mutex>;
+
+    template<class T, class U>
+    struct concurrent_map;
 }
 
-#endif
+#endif // DIY_NO_THREADS
 
 #include "critical-resource.hpp"
 
+#if !defined(DIY_NO_THREADS)
+
+#include <memory>       // for shared_ptr
+
+template<class T, class U>
+struct diy::concurrent_map
+{
+    using Map       = std::map<T,U>;
+    using SharedPtr = std::shared_ptr<lock_guard<fast_mutex>>;
+
+    template<class MapIterator>
+    struct iterator_
+    {
+        MapIterator     it;
+        SharedPtr       lock_ptr;
+
+                        iterator_(const MapIterator& it_, const SharedPtr& lock_ptr_ = SharedPtr()):
+                            it(it_), lock_ptr(lock_ptr_)                        {}
+
+        iterator_&      operator++()        { ++it; return *this; }
+        iterator_       operator++(int)     { iterator_ retval = *this; ++(*this); return retval; }
+
+        bool            operator==(const iterator_& other) const     { return it == other.it;}
+        bool            operator!=(const iterator_& other) const     { return !(*this == other); }
+
+        decltype(*it)               operator*() const   { return *it; }
+        decltype(it.operator->())   operator->() const  { return it.operator->(); }
+    };
+
+    using iterator       = iterator_<typename Map::iterator>;
+    using const_iterator = iterator_<typename Map::const_iterator>;
+
+    U&              operator[](const T& x)  { lock_guard<fast_mutex> l(mutex_); return map_[x]; }
+
+    iterator        begin()                 { auto p = std::make_shared<lock_guard<fast_mutex>>(mutex_); return iterator(map_.begin(), p); }
+    iterator        end()                   { return iterator(map_.end()); }
+
+    const_iterator  begin() const           { auto p = std::make_shared<lock_guard<fast_mutex>>(mutex_); return const_iterator(map_.begin(), p); }
+    const_iterator  end() const             { return const_iterator(map_.end()); }
+
+    iterator        find(const T& x)        { auto p = std::make_shared<lock_guard<fast_mutex>>(mutex_); return iterator(map_.find(x), p); }
+    const_iterator  find(const T& x) const  { auto p = std::make_shared<lock_guard<fast_mutex>>(mutex_); return const_iterator(map_.find(x), p); }
+
+    void            clear()                 { lock_guard<fast_mutex> l(mutex_); map_.clear(); }
+    bool            empty()                 { lock_guard<fast_mutex> l(mutex_); return map_.empty(); }
+
+    Map                 map_;
+    mutable fast_mutex  mutex_;
+};
+#endif // !defined(DIY_NO_THREADS)
+
 #endif
diff --git a/include/vtkdiy2/time.hpp b/include/vtkdiy2/time.hpp
index 692cf367310..671e69ddf39 100644
--- a/include/vtkdiy2/time.hpp
+++ b/include/vtkdiy2/time.hpp
@@ -3,10 +3,10 @@
 
 #ifndef _WIN32
 #include <sys/time.h>
-#ifdef __MACH__
+#if defined(__MACH__) && defined(__APPLE__)
 #include <mach/clock.h>
 #include <mach/mach.h>
-#endif // __MACH__
+#endif // __MACH__ && __APPLE__
 #endif // ifndef _WIN32
 
 namespace diy
@@ -16,7 +16,7 @@ typedef     unsigned long       time_type;
 
 inline time_type get_time()
 {
-#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
+#if defined(__MACH__) && defined(__APPLE__) // OS X does not have clock_gettime, use clock_get_time
     clock_serv_t cclock;
     mach_timespec_t ts;
     host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
diff --git a/include/vtkdiy2/types.hpp b/include/vtkdiy2/types.hpp
index 28a1a05824b..1dc380bb415 100644
--- a/include/vtkdiy2/types.hpp
+++ b/include/vtkdiy2/types.hpp
@@ -4,9 +4,12 @@
 #include <iostream>
 #include "constants.h"
 #include "dynamic-point.hpp"
+#include "point.hpp"
 
 namespace diy
 {
+    using Work = unsigned int;
+
     struct BlockID
     {
         int gid, proc;
@@ -23,11 +26,44 @@ namespace diy
 
         Point min, max;
 
-        DEPRECATED("Default Bounds constructor should not be used; old behavior is preserved for compatibility. Pass explicitly the dimension of the Bounds instead.")
-        Bounds():
-            Bounds(DIY_MAX_DIM)                                             {}
         Bounds(int dim): min(dim), max(dim)                                 {}
         Bounds(const Point& _min, const Point& _max) : min(_min), max(_max) {}
+
+        bool contains(const Point& p) const
+        {
+            assert(p.dimension() == min.dimension());
+
+            for(unsigned i = 0; i < min.dimension(); ++i)
+                if (p[i] < min[i] || p[i] > max[i])
+                    return false;
+            return true;
+        }
+
+        template<unsigned int D>
+        bool contains(const diy::Point<Coordinate_, D>& p) const
+        {
+            assert(p.dimension() == min.dimension());
+
+            for(unsigned i = 0; i < min.dimension(); ++i)
+                if (p[i] < min[i] || p[i] > max[i])
+                    return false;
+            return true;
+        }
+
+        inline friend std::ostream& operator<<(std::ostream& out, const Bounds& b)
+        {
+            out << "Bounds(min=" << b.min << ", max=" << b.max << ")";
+            return out;
+        }
+
+        private:
+            // make default constructor private to explicitly break old deprecated behavior;
+            // any call to the default constructor should be replaced by a call to Bounds(0)
+            Bounds():
+                Bounds(0)                                                   {}
+
+            template<class T> friend struct diy::Serialization;
+
     };
     using DiscreteBounds   = Bounds<int>;
     using ContinuousBounds = Bounds<float>;
@@ -66,11 +102,7 @@ namespace diy
           if (dim > 3 && dir & DIY_T1) (*this)[3] += 1;
       }
 
-        DEPRECATED("Direction without dimension is deprecated")
-              Direction(int dir):
-                  Direction(DIY_MAX_DIM, dir)       // if we are decoding the old constants, we assume DIY_MAX_DIM dimensional space
-      {
-      }
+      static Direction from_bits(int dir, int dim = DIY_MAX_DIM)    { return Direction(dim, dir); }
 
       bool
       operator==(const diy::Direction& y) const
diff --git a/include/vtkdiy2/utils.hpp b/include/vtkdiy2/utils.hpp
new file mode 100644
index 00000000000..7a9ad0d1021
--- /dev/null
+++ b/include/vtkdiy2/utils.hpp
@@ -0,0 +1,15 @@
+#ifndef DIY_UTILS_HPP
+#define DIY_UTILS_HPP
+
+#include <memory>
+
+namespace diy
+{
+    template<typename T, typename... Args>
+    std::unique_ptr<T> make_unique(Args&&... args)
+    {
+        return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+    }
+}
+
+#endif
-- 
GitLab