DIY  3.0
data-parallel out-of-core C++ library
 All Classes Namespaces Functions Typedefs Groups Pages
master.hpp
1 #ifndef DIY_MASTER_HPP
2 #define DIY_MASTER_HPP
3 
4 #include <vector>
5 #include <map>
6 #include <list>
7 #include <deque>
8 #include <algorithm>
9 #include <functional>
10 
11 #include "link.hpp"
12 #include "collection.hpp"
13 
14 // Communicator functionality
15 #include "mpi.hpp"
16 #include "serialization.hpp"
17 #include "detail/collectives.hpp"
18 #include "time.hpp"
19 
20 #include "thread.hpp"
21 
22 #include "detail/block_traits.hpp"
23 
24 #include "log.hpp"
25 #include "stats.hpp"
26 
27 namespace diy
28 {
29  // Stores and manages blocks; initiates serialization and communication when necessary.
30  //
31  // Provides a foreach function, which is meant as the main entry point.
32  //
33  // Provides a conversion between global and local block ids,
34  // which is hidden from blocks via a communicator proxy.
35  class Master
36  {
37  public:
38  struct ProcessBlock;
39 
40  template<class Block>
41  struct Binder;
42 
43  // Commands
44  struct BaseCommand;
45 
46  template<class Block>
47  struct Command;
48 
49  typedef std::vector<BaseCommand*> Commands;
50 
51  // Skip
52  using Skip = std::function<bool(int, const Master&)>;
53 
54  struct SkipNoIncoming;
55  struct NeverSkip { bool operator()(int, const Master&) const { return false; } };
56 
57  // Collection
58  typedef Collection::Create CreateBlock;
59  typedef Collection::Destroy DestroyBlock;
60  typedef Collection::Save SaveBlock;
61  typedef Collection::Load LoadBlock;
62 
63  public:
64  // Communicator types
65  struct Proxy;
66  struct ProxyWithLink;
67 
68  // foreach callback
69  template<class Block>
70  using Callback = std::function<void(Block*, const ProxyWithLink&)>;
71 
72  struct QueuePolicy
73  {
74  virtual bool unload_incoming(const Master& master, int from, int to, size_t size) const =0;
75  virtual bool unload_outgoing(const Master& master, int from, size_t size) const =0;
76  virtual ~QueuePolicy() {}
77  };
78 
81  {
82  QueueSizePolicy(size_t sz): size(sz) {}
83  bool unload_incoming(const Master&, int, int, size_t sz) const { return sz > size; }
84  bool unload_outgoing(const Master& master, int from, size_t sz) const { return sz > size*master.outgoing_count(from); }
85 
86  size_t size;
87  };
88 
89  struct MessageInfo
90  {
91  int from, to;
92  int round;
93  };
94 
95  struct InFlightSend
96  {
97  std::shared_ptr<MemoryBuffer> message;
98  mpi::request request;
99 
100  // for debug purposes:
101  MessageInfo info;
102  };
103 
105  {
106  MemoryBuffer message;
107  MessageInfo info{ -1, -1, -1 };
108  };
109 
110  struct Collective;
111  struct tags { enum { queue, piece }; };
112 
113  typedef std::list<InFlightSend> InFlightSendsList;
114  typedef std::map<int, InFlightRecv> InFlightRecvsMap;
115  typedef std::list<int> ToSendList; // [gid]
116  typedef std::list<Collective> CollectivesList;
117  typedef std::map<int, CollectivesList> CollectivesMap; // gid -> [collectives]
118 
119 
120  struct QueueRecord
121  {
122  QueueRecord(size_t s = 0, int e = -1): size(s), external(e) {}
123  size_t size;
124  int external;
125  };
126 
127  typedef std::map<int, QueueRecord> InQueueRecords; // gid -> (size, external)
128  typedef std::map<int, MemoryBuffer> IncomingQueues; // gid -> queue
129  typedef std::map<BlockID, MemoryBuffer> OutgoingQueues; // (gid, proc) -> queue
130  typedef std::map<BlockID, QueueRecord> OutQueueRecords; // (gid, proc) -> (size, external)
132  {
133  InQueueRecords records;
134  IncomingQueues queues;
135  };
137  {
138  OutgoingQueuesRecord(int e = -1): external(e) {}
139  int external;
140  OutQueueRecords external_local;
141  OutgoingQueues queues;
142  };
143  typedef std::map<int, IncomingQueuesRecords> IncomingQueuesMap; // gid -> { gid -> queue }
144  typedef std::map<int, OutgoingQueuesRecord> OutgoingQueuesMap; // gid -> { (gid,proc) -> queue }
145 
147  {
148  IncomingQueuesMap map;
149  int received{0};
150  };
151  typedef std::map<int, IncomingRound> IncomingRoundMap;
152 
153 
154  public:
165  int threads__ = 1,
166  int limit__ = -1,
167  CreateBlock create_ = 0,
168  DestroyBlock destroy_ = 0,
169  ExternalStorage* storage = 0,
170  SaveBlock save = 0,
171  LoadBlock load_ = 0,
172  QueuePolicy* q_policy = new QueueSizePolicy(4096)):
173  blocks_(create_, destroy_, storage, save, load_),
174  queue_policy_(q_policy),
175  limit_(limit__),
176  threads_(threads__ == -1 ? static_cast<int>(thread::hardware_concurrency()) : threads__),
177  storage_(storage),
178  // Communicator functionality
179  comm_(comm),
180  expected_(0),
181  exchange_round_(-1),
182  immediate_(true)
183  {}
184  ~Master() { set_immediate(true); clear(); delete queue_policy_; }
185  inline void clear();
186  inline void destroy(int i) { if (blocks_.own()) blocks_.destroy(i); }
187 
188  inline int add(int gid, void* b, Link* l);
189  inline void* release(int i);
190 
192  inline void* block(int i) const { return blocks_.find(i); }
193  template<class Block>
194  Block* block(int i) const { return static_cast<Block*>(block(i)); }
195  inline Link* link(int i) const { return links_[i]; }
196  inline int loaded_block() const { return blocks_.available(); }
197 
198  inline void unload(int i);
199  inline void load(int i);
200  void unload(std::vector<int>& loaded) { for(unsigned i = 0; i < loaded.size(); ++i) unload(loaded[i]); loaded.clear(); }
201  void unload_all() { for(unsigned i = 0; i < size(); ++i) if (block(i) != 0) unload(i); }
202  inline bool has_incoming(int i) const;
203 
204  inline void unload_queues(int i);
205  inline void unload_incoming(int gid);
206  inline void unload_outgoing(int gid);
207  inline void load_queues(int i);
208  inline void load_incoming(int gid);
209  inline void load_outgoing(int gid);
210 
212  const mpi::communicator& communicator() const { return comm_; }
214  mpi::communicator& communicator() { return comm_; }
215 
217  void* get(int i) { return blocks_.get(i); }
219  int gid(int i) const { return gids_[i]; }
221  int lid(int gid__) const { return local(gid__) ? lids_.find(gid__)->second : -1; }
223  bool local(int gid__) const { return lids_.find(gid__) != lids_.end(); }
224 
226  inline void exchange();
227  inline void process_collectives();
228 
229  inline
230  ProxyWithLink proxy(int i) const;
231 
233  unsigned int size() const { return static_cast<unsigned int>(blocks_.size()); }
234  void* create() const { return blocks_.create(); }
235 
236  // accessors
237  int limit() const { return limit_; }
238  int threads() const { return threads_; }
239  int in_memory() const { return *blocks_.in_memory().const_access(); }
240 
241  void set_threads(int threads__) { threads_ = threads__; }
242 
243  CreateBlock creator() const { return blocks_.creator(); }
244  DestroyBlock destroyer() const { return blocks_.destroyer(); }
245  LoadBlock loader() const { return blocks_.loader(); }
246  SaveBlock saver() const { return blocks_.saver(); }
247 
249  template<class Block>
250  void foreach_(const Callback<Block>& f, const Skip& s = NeverSkip());
251 
252  template<class F>
253  void foreach(const F& f, const Skip& s = NeverSkip())
254  {
255  using Block = typename detail::block_traits<F>::type;
256  foreach_<Block>(f, s);
257  }
258 
259  inline void execute();
260 
261  bool immediate() const { return immediate_; }
262  void set_immediate(bool i) { if (i && !immediate_) execute(); immediate_ = i; }
263 
264  public:
265  // Communicator functionality
266  IncomingQueues& incoming(int gid__) { return incoming_[exchange_round_].map[gid__].queues; }
267  OutgoingQueues& outgoing(int gid__) { return outgoing_[gid__].queues; }
268  CollectivesList& collectives(int gid__) { return collectives_[gid__]; }
269  size_t incoming_count(int gid__) const
270  {
271  IncomingRoundMap::const_iterator round_it = incoming_.find(exchange_round_);
272  if (round_it == incoming_.end())
273  return 0;
274  IncomingQueuesMap::const_iterator queue_it = round_it->second.map.find(gid__);
275  if (queue_it == round_it->second.map.end())
276  return 0;
277  return queue_it->second.queues.size();
278  }
279  size_t outgoing_count(int gid__) const { OutgoingQueuesMap::const_iterator it = outgoing_.find(gid__); if (it == outgoing_.end()) return 0; return it->second.queues.size(); }
280 
281  void set_expected(int expected) { expected_ = expected; }
282  void add_expected(int i) { expected_ += i; }
283  int expected() const { return expected_; }
284  void replace_link(int i, Link* link__) { expected_ -= links_[i]->size_unique(); delete links_[i]; links_[i] = link__; expected_ += links_[i]->size_unique(); }
285 
286  public:
287  // Communicator functionality
288  inline void flush(); // makes sure all the serialized queues migrate to their target processors
289 
290  private:
291  // Communicator functionality
292  inline void comm_exchange(ToSendList& to_send, int out_queues_limit); // possibly called in between block computations
293  inline bool nudge();
294 
295  void cancel_requests(); // TODO
296 
297  // debug
298  inline void show_incoming_records() const;
299 
300  private:
301  std::vector<Link*> links_;
302  Collection blocks_;
303  std::vector<int> gids_;
304  std::map<int, int> lids_;
305 
306  QueuePolicy* queue_policy_;
307 
308  int limit_;
309  int threads_;
310  ExternalStorage* storage_;
311 
312  private:
313  // Communicator
314  mpi::communicator comm_;
315  IncomingRoundMap incoming_;
316  OutgoingQueuesMap outgoing_;
317  InFlightSendsList inflight_sends_;
318  InFlightRecvsMap inflight_recvs_;
319  CollectivesMap collectives_;
320  int expected_;
321  int exchange_round_;
322  bool immediate_;
323  Commands commands_;
324 
325  private:
326  fast_mutex add_mutex_;
327 
328  public:
329  std::shared_ptr<spd::logger> log = get_logger();
330  stats::Profiler prof;
331  };
332 
334  {
335  virtual ~BaseCommand() {} // to delete derived classes
336  virtual void execute(void* b, const ProxyWithLink& cp) const =0;
337  virtual bool skip(int i, const Master& master) const =0;
338  };
339 
340  template<class Block>
341  struct Master::Command: public BaseCommand
342  {
343  Command(Callback<Block> f_, const Skip& s_):
344  f(f_), s(s_) {}
345 
346  void execute(void* b, const ProxyWithLink& cp) const override { f(static_cast<Block*>(b), cp); }
347  bool skip(int i, const Master& m) const override { return s(i,m); }
348 
349  Callback<Block> f;
350  Skip s;
351  };
352 
354  { bool operator()(int i, const Master& master) const { return !master.has_incoming(i); } };
355 
357  {
358  Collective():
359  cop_(0) {}
360  Collective(detail::CollectiveOp* cop):
361  cop_(cop) {}
362  // this copy constructor is very ugly, but need it to insert Collectives into a list
363  Collective(const Collective& other):
364  cop_(0) { swap(const_cast<Collective&>(other)); }
365  ~Collective() { delete cop_; }
366 
367  void init() { cop_->init(); }
368  void swap(Collective& other) { std::swap(cop_, other.cop_); }
369  void update(const Collective& other) { cop_->update(*other.cop_); }
370  void global(const mpi::communicator& c) { cop_->global(c); }
371  void copy_from(Collective& other) const { cop_->copy_from(*other.cop_); }
372  void result_out(void* x) const { cop_->result_out(x); }
373 
374  detail::CollectiveOp* cop_;
375 
376  private:
377  Collective& operator=(const Collective& other);
378  };
379 }
380 
381 #include "proxy.hpp"
382 
383 // --- ProcessBlock ---
385 {
386  ProcessBlock(Master& master_,
387  const std::deque<int>& blocks__,
388  int local_limit_,
389  critical_resource<int>& idx_):
390  master(master_),
391  blocks(blocks__),
392  local_limit(local_limit_),
393  idx(idx_)
394  {}
395 
396  void process()
397  {
398  master.log->debug("Processing with thread: {}", this_thread::get_id());
399 
400  std::vector<int> local;
401  do
402  {
403  int cur = (*idx.access())++;
404 
405  if ((size_t)cur >= blocks.size())
406  return;
407 
408  int i = blocks[cur];
409  if (master.block(i))
410  {
411  if (local.size() == (size_t)local_limit)
412  master.unload(local);
413  local.push_back(i);
414  }
415 
416  master.log->debug("Processing block: {}", master.gid(i));
417 
418  bool skip_block = true;
419  for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
420  {
421  if (!master.commands_[cmd]->skip(i, master))
422  {
423  skip_block = false;
424  break;
425  }
426  }
427 
428  IncomingQueuesMap &current_incoming = master.incoming_[master.exchange_round_].map;
429  if (skip_block)
430  {
431  if (master.block(i) == 0)
432  master.load_queues(i); // even though we are skipping the block, the queues might be necessary
433 
434  for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
435  {
436  master.commands_[cmd]->execute(0, master.proxy(i)); // 0 signals that we are skipping the block (even if it's loaded)
437 
438  // no longer need them, so get rid of them, rather than risk reloading
439  current_incoming[master.gid(i)].queues.clear();
440  current_incoming[master.gid(i)].records.clear();
441  }
442 
443  if (master.block(i) == 0)
444  master.unload_queues(i); // even though we are skipping the block, the queues might be necessary
445  }
446  else
447  {
448  if (master.block(i) == 0) // block unloaded
449  {
450  if (local.size() == (size_t)local_limit) // reached the local limit
451  master.unload(local);
452 
453  master.load(i);
454  local.push_back(i);
455  }
456 
457  for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
458  {
459  master.commands_[cmd]->execute(master.block(i), master.proxy(i));
460 
461  // no longer need them, so get rid of them
462  current_incoming[master.gid(i)].queues.clear();
463  current_incoming[master.gid(i)].records.clear();
464  }
465  }
466  } while(true);
467 
468  // TODO: invoke opportunistic communication
469  // don't forget to adjust Master::exchange()
470  }
471 
472  static void run(void* bf) { static_cast<ProcessBlock*>(bf)->process(); }
473 
474  Master& master;
475  const std::deque<int>& blocks;
476  int local_limit;
478 };
479 // --------------------
480 
481 void
482 diy::Master::
483 clear()
484 {
485  for (unsigned i = 0; i < size(); ++i)
486  delete links_[i];
487  blocks_.clear();
488  links_.clear();
489  gids_.clear();
490  lids_.clear();
491  expected_ = 0;
492 }
493 
494 void
495 diy::Master::
496 unload(int i)
497 {
498  log->debug("Unloading block: {}", gid(i));
499 
500  blocks_.unload(i);
501  unload_queues(i);
502 }
503 
504 void
505 diy::Master::
506 unload_queues(int i)
507 {
508  unload_incoming(gid(i));
509  unload_outgoing(gid(i));
510 }
511 
512 void
513 diy::Master::
514 unload_incoming(int gid__)
515 {
516  for (IncomingRoundMap::iterator round_itr = incoming_.begin(); round_itr != incoming_.end(); ++round_itr)
517  {
518  IncomingQueuesMap::iterator qmap_itr = round_itr->second.map.find(gid__);
519  if (qmap_itr == round_itr->second.map.end())
520  {
521  continue;
522  }
523  IncomingQueuesRecords& in_qrs = qmap_itr->second;
524  for (InQueueRecords::iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
525  {
526  QueueRecord& qr = it->second;
527  if (queue_policy_->unload_incoming(*this, it->first, gid__, qr.size))
528  {
529  log->debug("Unloading queue: {} <- {}", gid__, it->first);
530  qr.external = storage_->put(in_qrs.queues[it->first]);
531  }
532  }
533  }
534 }
535 
536 void
537 diy::Master::
538 unload_outgoing(int gid__)
539 {
540  OutgoingQueuesRecord& out_qr = outgoing_[gid__];
541 
542  size_t out_queues_size = sizeof(size_t); // map size
543  size_t count = 0;
544  for (OutgoingQueues::iterator it = out_qr.queues.begin(); it != out_qr.queues.end(); ++it)
545  {
546  if (it->first.proc == comm_.rank()) continue;
547 
548  out_queues_size += sizeof(BlockID); // target
549  out_queues_size += sizeof(size_t); // buffer.position
550  out_queues_size += sizeof(size_t); // buffer.size
551  out_queues_size += it->second.size(); // buffer contents
552  ++count;
553  }
554  if (queue_policy_->unload_outgoing(*this, gid__, out_queues_size - sizeof(size_t)))
555  {
556  log->debug("Unloading outgoing queues: {} -> ...; size = {}\n", gid__, out_queues_size);
557  MemoryBuffer bb; bb.reserve(out_queues_size);
558  diy::save(bb, count);
559 
560  for (OutgoingQueues::iterator it = out_qr.queues.begin(); it != out_qr.queues.end();)
561  {
562  if (it->first.proc == comm_.rank())
563  {
564  // treat as incoming
565  if (queue_policy_->unload_incoming(*this, gid__, it->first.gid, it->second.size()))
566  {
567  QueueRecord& qr = out_qr.external_local[it->first];
568  qr.size = it->second.size();
569  qr.external = storage_->put(it->second);
570 
571  out_qr.queues.erase(it++);
572  continue;
573  } // else keep in memory
574  } else
575  {
576  diy::save(bb, it->first);
577  diy::save(bb, it->second);
578 
579  out_qr.queues.erase(it++);
580  continue;
581  }
582  ++it;
583  }
584 
585  // TODO: this mechanism could be adjusted for direct saving to disk
586  // (without intermediate binary buffer serialization)
587  out_qr.external = storage_->put(bb);
588  }
589 }
590 
591 void
592 diy::Master::
593 load(int i)
594 {
595  log->debug("Loading block: {}", gid(i));
596 
597  blocks_.load(i);
598  load_queues(i);
599 }
600 
601 void
602 diy::Master::
603 load_queues(int i)
604 {
605  load_incoming(gid(i));
606  load_outgoing(gid(i));
607 }
608 
609 void
610 diy::Master::
611 load_incoming(int gid__)
612 {
613  IncomingQueuesRecords& in_qrs = incoming_[exchange_round_].map[gid__];
614  for (InQueueRecords::iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
615  {
616  QueueRecord& qr = it->second;
617  if (qr.external != -1)
618  {
619  log->debug("Loading queue: {} <- {}", gid__, it->first);
620  storage_->get(qr.external, in_qrs.queues[it->first]);
621  qr.external = -1;
622  }
623  }
624 }
625 
626 void
627 diy::Master::
628 load_outgoing(int gid__)
629 {
630  // TODO: we could adjust this mechanism to read directly from storage,
631  // bypassing an intermediate MemoryBuffer
632  OutgoingQueuesRecord& out_qr = outgoing_[gid__];
633  if (out_qr.external != -1)
634  {
635  MemoryBuffer bb;
636  storage_->get(out_qr.external, bb);
637  out_qr.external = -1;
638 
639  size_t count;
640  diy::load(bb, count);
641  for (size_t i = 0; i < count; ++i)
642  {
643  BlockID to;
644  diy::load(bb, to);
645  diy::load(bb, out_qr.queues[to]);
646  }
647  }
648 }
649 
651 diy::Master::
652 proxy(int i) const
653 { return ProxyWithLink(Proxy(const_cast<Master*>(this), gid(i)), block(i), link(i)); }
654 
655 
656 int
658 add(int gid__, void* b, Link* l)
659 {
660  if (*blocks_.in_memory().const_access() == limit_)
661  unload_all();
662 
663  lock_guard<fast_mutex> lock(add_mutex_); // allow to add blocks from multiple threads
664 
665  blocks_.add(b);
666  links_.push_back(l);
667  gids_.push_back(gid__);
668 
669  int lid__ = static_cast<int>(gids_.size()) - 1;
670  lids_[gid__] = lid__;
671  add_expected(l->size_unique()); // NB: at every iteration we expect a message from each unique neighbor
672 
673  return lid__;
674 }
675 
676 void*
678 release(int i)
679 {
680  void* b = blocks_.release(i);
681  delete link(i); links_[i] = 0;
682  lids_.erase(gid(i));
683  return b;
684 }
685 
686 bool
687 diy::Master::
688 has_incoming(int i) const
689 {
690  const IncomingQueuesRecords& in_qrs = const_cast<Master&>(*this).incoming_[exchange_round_].map[gid(i)];
691  for (InQueueRecords::const_iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
692  {
693  const QueueRecord& qr = it->second;
694  if (qr.size != 0)
695  return true;
696  }
697  return false;
698 }
699 
700 template<class Block>
701 void
703 foreach_(const Callback<Block>& f, const Skip& skip)
704 {
705  auto scoped = prof.scoped("foreach");
706  DIY_UNUSED(scoped);
707 
708  commands_.push_back(new Command<Block>(f, skip));
709 
710  if (immediate())
711  execute();
712 }
713 
714 void
715 diy::Master::
716 execute()
717 {
718  log->debug("Entered execute()");
719  auto scoped = prof.scoped("execute");
720  DIY_UNUSED(scoped);
721  //show_incoming_records();
722 
723  // touch the outgoing and incoming queues as well as collectives to make sure they exist
724  for (unsigned i = 0; i < size(); ++i)
725  {
726  outgoing(gid(i));
727  incoming(gid(i)); // implicitly touches queue records
728  collectives(gid(i));
729  }
730 
731  if (commands_.empty())
732  return;
733 
734  // Order the blocks, so the loaded ones come first
735  std::deque<int> blocks;
736  for (unsigned i = 0; i < size(); ++i)
737  if (block(i) == 0)
738  blocks.push_back(i);
739  else
740  blocks.push_front(i);
741 
742  // don't use more threads than we can have blocks in memory
743  int num_threads;
744  int blocks_per_thread;
745  if (limit_ == -1)
746  {
747  num_threads = threads_;
748  blocks_per_thread = size();
749  }
750  else
751  {
752  num_threads = std::min(threads_, limit_);
753  blocks_per_thread = limit_/num_threads;
754  }
755 
756  // idx is shared
757  critical_resource<int> idx(0);
758 
759  typedef ProcessBlock BlockFunctor;
760  if (num_threads > 1)
761  {
762  // launch the threads
763  typedef std::pair<thread*, BlockFunctor*> ThreadFunctorPair;
764  typedef std::list<ThreadFunctorPair> ThreadFunctorList;
765  ThreadFunctorList threads;
766  for (unsigned i = 0; i < (unsigned)num_threads; ++i)
767  {
768  BlockFunctor* bf = new BlockFunctor(*this, blocks, blocks_per_thread, idx);
769  threads.push_back(ThreadFunctorPair(new thread(&BlockFunctor::run, bf), bf));
770  }
771 
772  // join the threads
773  for(ThreadFunctorList::iterator it = threads.begin(); it != threads.end(); ++it)
774  {
775  thread* t = it->first;
776  BlockFunctor* bf = it->second;
777  t->join();
778  delete t;
779  delete bf;
780  }
781  } else
782  {
783  BlockFunctor bf(*this, blocks, blocks_per_thread, idx);
784  BlockFunctor::run(&bf);
785  }
786 
787  // clear incoming queues
788  incoming_[exchange_round_].map.clear();
789 
790  if (limit() != -1 && in_memory() > limit())
791  throw std::runtime_error(fmt::format("Fatal: {} blocks in memory, with limit {}", in_memory(), limit()));
792 
793  // clear commands
794  for (size_t i = 0; i < commands_.size(); ++i)
795  delete commands_[i];
796  commands_.clear();
797 }
798 
799 void
802 {
803  auto scoped = prof.scoped("exchange");
804  DIY_UNUSED(scoped);
805 
806  execute();
807 
808  log->debug("Starting exchange");
809 
810  // make sure there is a queue for each neighbor
811  for (int i = 0; i < (int)size(); ++i)
812  {
813  OutgoingQueues& outgoing_queues = outgoing_[gid(i)].queues;
814  OutQueueRecords& external_local = outgoing_[gid(i)].external_local;
815  if (outgoing_queues.size() < (size_t)link(i)->size())
816  for (unsigned j = 0; j < (unsigned)link(i)->size(); ++j)
817  {
818  if (external_local.find(link(i)->target(j)) == external_local.end())
819  outgoing_queues[link(i)->target(j)]; // touch the outgoing queue, creating it if necessary
820  }
821  }
822 
823  flush();
824  log->debug("Finished exchange");
825 }
826 
827 namespace diy
828 {
829 namespace detail
830 {
831  template <typename T>
832  struct VectorWindow
833  {
834  T *begin;
835  size_t count;
836  };
837 } // namespace detail
838 
839 namespace mpi
840 {
841 namespace detail
842 {
843  template<typename T> struct is_mpi_datatype< diy::detail::VectorWindow<T> > { typedef true_type type; };
844 
845  template <typename T>
846  struct mpi_datatype< diy::detail::VectorWindow<T> >
847  {
848  typedef diy::detail::VectorWindow<T> VecWin;
849  static MPI_Datatype datatype() { return get_mpi_datatype<T>(); }
850  static const void* address(const VecWin& x) { return x.begin; }
851  static void* address(VecWin& x) { return x.begin; }
852  static int count(const VecWin& x) { return static_cast<int>(x.count); }
853  };
854 }
855 } // namespace mpi::detail
856 
857 } // namespace diy
858 
859 /* Communicator */
860 void
861 diy::Master::
862 comm_exchange(ToSendList& to_send, int out_queues_limit)
863 {
864  static const size_t MAX_MPI_MESSAGE_COUNT = INT_MAX;
865 
866  IncomingRound &current_incoming = incoming_[exchange_round_];
867  // isend outgoing queues, up to the out_queues_limit
868  while(inflight_sends_.size() < (size_t)out_queues_limit && !to_send.empty())
869  {
870  int from = to_send.front();
871 
872  // deal with external_local queues
873  for (OutQueueRecords::iterator it = outgoing_[from].external_local.begin(); it != outgoing_[from].external_local.end(); ++it)
874  {
875  int to = it->first.gid;
876 
877  log->debug("Processing local queue: {} <- {} of size {}", to, from, it->second.size);
878 
879  QueueRecord& in_qr = current_incoming.map[to].records[from];
880  bool in_external = block(lid(to)) == 0;
881 
882  if (in_external)
883  in_qr = it->second;
884  else
885  {
886  // load the queue
887  in_qr.size = it->second.size;
888  in_qr.external = -1;
889 
890  MemoryBuffer bb;
891  storage_->get(it->second.external, bb);
892 
893  current_incoming.map[to].queues[from].swap(bb);
894  }
895  ++current_incoming.received;
896  }
897  outgoing_[from].external_local.clear();
898 
899  if (outgoing_[from].external != -1)
900  load_outgoing(from);
901  to_send.pop_front();
902 
903  OutgoingQueues& outgoing = outgoing_[from].queues;
904  for (OutgoingQueues::iterator it = outgoing.begin(); it != outgoing.end(); ++it)
905  {
906  BlockID to_proc = it->first;
907  int to = to_proc.gid;
908  int proc = to_proc.proc;
909 
910  log->debug("Processing queue: {} <- {} of size {}", to, from, outgoing_[from].queues[to_proc].size());
911 
912  // There may be local outgoing queues that remained in memory
913  if (proc == comm_.rank()) // sending to ourselves: simply swap buffers
914  {
915  log->debug("Moving queue in-place: {} <- {}", to, from);
916 
917  QueueRecord& in_qr = current_incoming.map[to].records[from];
918  bool in_external = block(lid(to)) == 0;
919  if (in_external)
920  {
921  log->debug("Unloading outgoing directly as incoming: {} <- {}", to, from);
922  MemoryBuffer& bb = it->second;
923  in_qr.size = bb.size();
924  if (queue_policy_->unload_incoming(*this, from, to, in_qr.size))
925  in_qr.external = storage_->put(bb);
926  else
927  {
928  MemoryBuffer& in_bb = current_incoming.map[to].queues[from];
929  in_bb.swap(bb);
930  in_bb.reset();
931  in_qr.external = -1;
932  }
933  } else // !in_external
934  {
935  log->debug("Swapping in memory: {} <- {}", to, from);
936  MemoryBuffer& bb = current_incoming.map[to].queues[from];
937  bb.swap(it->second);
938  bb.reset();
939  in_qr.size = bb.size();
940  in_qr.external = -1;
941  }
942 
943  ++current_incoming.received;
944  continue;
945  }
946 
947  std::shared_ptr<MemoryBuffer> buffer = std::make_shared<MemoryBuffer>();
948  buffer->swap(it->second);
949 
950  MessageInfo info{from, to, exchange_round_};
951  if (buffer->size() <= (MAX_MPI_MESSAGE_COUNT - sizeof(info)))
952  {
953  diy::save(*buffer, info);
954 
955  inflight_sends_.emplace_back();
956  inflight_sends_.back().info = info;
957  inflight_sends_.back().request = comm_.isend(proc, tags::queue, buffer->buffer);
958  inflight_sends_.back().message = buffer;
959  }
960  else
961  {
962  int npieces = static_cast<int>((buffer->size() + MAX_MPI_MESSAGE_COUNT - 1)/MAX_MPI_MESSAGE_COUNT);
963 
964  // first send the head
965  std::shared_ptr<MemoryBuffer> hb = std::make_shared<MemoryBuffer>();
966  diy::save(*hb, buffer->size());
967  diy::save(*hb, info);
968 
969  inflight_sends_.emplace_back();
970  inflight_sends_.back().info = info;
971  inflight_sends_.back().request = comm_.isend(proc, tags::piece, hb->buffer);
972  inflight_sends_.back().message = hb;
973 
974  // send the message pieces
975  size_t msg_buff_idx = 0;
976  for (int i = 0; i < npieces; ++i, msg_buff_idx += MAX_MPI_MESSAGE_COUNT)
977  {
978  int tag = (i == (npieces - 1)) ? tags::queue : tags::piece;
979 
980  detail::VectorWindow<char> window;
981  window.begin = &buffer->buffer[msg_buff_idx];
982  window.count = std::min(MAX_MPI_MESSAGE_COUNT, buffer->size() - msg_buff_idx);
983 
984  inflight_sends_.emplace_back();
985  inflight_sends_.back().info = info;
986  inflight_sends_.back().request = comm_.isend(proc, tag, window);
987  inflight_sends_.back().message = buffer;
988  }
989  }
990  }
991  }
992 
993  // kick requests
994  while(nudge());
995 
996  // check incoming queues
997  mpi::optional<mpi::status> ostatus = comm_.iprobe(mpi::any_source, mpi::any_tag);
998  while(ostatus)
999  {
1000  InFlightRecv &ir = inflight_recvs_[ostatus->source()];
1001 
1002  if (ir.info.from == -1) // uninitialized
1003  {
1004  MemoryBuffer bb;
1005  comm_.recv(ostatus->source(), ostatus->tag(), bb.buffer);
1006 
1007  if (ostatus->tag() == tags::piece)
1008  {
1009  size_t msg_size;
1010  diy::load(bb, msg_size);
1011  diy::load(bb, ir.info);
1012 
1013  ir.message.buffer.reserve(msg_size);
1014  }
1015  else // tags::queue
1016  {
1017  diy::load_back(bb, ir.info);
1018  ir.message.swap(bb);
1019  }
1020  }
1021  else
1022  {
1023  size_t start_idx = ir.message.buffer.size();
1024  size_t count = ostatus->count<char>();
1025  ir.message.buffer.resize(start_idx + count);
1026 
1027  detail::VectorWindow<char> window;
1028  window.begin = &ir.message.buffer[start_idx];
1029  window.count = count;
1030 
1031  comm_.recv(ostatus->source(), ostatus->tag(), window);
1032  }
1033 
1034  if (ostatus->tag() == tags::queue)
1035  {
1036  size_t size = ir.message.size();
1037  int from = ir.info.from;
1038  int to = ir.info.to;
1039  int external = -1;
1040 
1041  assert(ir.info.round >= exchange_round_);
1042  IncomingRound *in = &incoming_[ir.info.round];
1043 
1044  bool unload_queue = ((ir.info.round == exchange_round_) ? (block(lid(to)) == 0) : (limit_ != -1)) &&
1045  queue_policy_->unload_incoming(*this, from, to, size);
1046  if (unload_queue)
1047  {
1048  log->debug("Directly unloading queue {} <- {}", to, from);
1049  external = storage_->put(ir.message); // unload directly
1050  }
1051  else
1052  {
1053  in->map[to].queues[from].swap(ir.message);
1054  in->map[to].queues[from].reset(); // buffer position = 0
1055  }
1056  in->map[to].records[from] = QueueRecord(size, external);
1057 
1058  ++(in->received);
1059  ir = InFlightRecv(); // reset
1060  }
1061 
1062  ostatus = comm_.iprobe(mpi::any_source, mpi::any_tag);
1063  }
1064 }
1065 
1066 void
1067 diy::Master::
1068 flush()
1069 {
1070 #ifdef DEBUG
1071  time_type start = get_time();
1072  unsigned wait = 1;
1073 #endif
1074 
1075  // prepare for next round
1076  incoming_.erase(exchange_round_);
1077  ++exchange_round_;
1078 
1079  // make a list of outgoing queues to send (the ones in memory come first)
1080  ToSendList to_send;
1081  for (OutgoingQueuesMap::iterator it = outgoing_.begin(); it != outgoing_.end(); ++it)
1082  {
1083  OutgoingQueuesRecord& out = it->second;
1084  if (out.external == -1)
1085  to_send.push_front(it->first);
1086  else
1087  to_send.push_back(it->first);
1088  }
1089  log->debug("to_send.size(): {}", to_send.size());
1090 
1091  // XXX: we probably want a cleverer limit than block limit times average number of queues per block
1092  // XXX: with queues we could easily maintain a specific space limit
1093  int out_queues_limit;
1094  if (limit_ == -1 || size() == 0)
1095  out_queues_limit = static_cast<int>(to_send.size());
1096  else
1097  out_queues_limit = static_cast<int>(std::max((size_t) 1, to_send.size()/size()*limit_)); // average number of queues per block * in-memory block limit
1098 
1099  do
1100  {
1101  comm_exchange(to_send, out_queues_limit);
1102 
1103 #ifdef DEBUG
1104  time_type cur = get_time();
1105  if (cur - start > wait*1000)
1106  {
1107  log->warn("Waiting in flush [{}]: {} - {} out of {}",
1108  comm_.rank(), inflight_sends_.size(), incoming_[exchange_round_].received, expected_);
1109  wait *= 2;
1110  }
1111 #endif
1112  } while (!inflight_sends_.empty() || incoming_[exchange_round_].received < expected_ || !to_send.empty());
1113 
1114  outgoing_.clear();
1115 
1116  log->debug("Done in flush");
1117  //show_incoming_records();
1118 
1119  process_collectives();
1120 }
1121 
1122 void
1123 diy::Master::
1124 process_collectives()
1125 {
1126  auto scoped = prof.scoped("collectives");
1127  DIY_UNUSED(scoped);
1128 
1129  if (collectives_.empty())
1130  return;
1131 
1132  typedef CollectivesList::iterator CollectivesIterator;
1133  std::vector<CollectivesIterator> iters;
1134  std::vector<int> gids;
1135  for (CollectivesMap::iterator cur = collectives_.begin(); cur != collectives_.end(); ++cur)
1136  {
1137  gids.push_back(cur->first);
1138  iters.push_back(cur->second.begin());
1139  }
1140 
1141  while (iters[0] != collectives_.begin()->second.end())
1142  {
1143  iters[0]->init();
1144  for (unsigned j = 1; j < iters.size(); ++j)
1145  {
1146  // NB: this assumes that the operations are commutative
1147  iters[0]->update(*iters[j]);
1148  }
1149  iters[0]->global(comm_); // do the mpi collective
1150 
1151  for (unsigned j = 1; j < iters.size(); ++j)
1152  {
1153  iters[j]->copy_from(*iters[0]);
1154  ++iters[j];
1155  }
1156 
1157  ++iters[0];
1158  }
1159 }
1160 
1161 bool
1162 diy::Master::
1163 nudge()
1164 {
1165  bool success = false;
1166  for (InFlightSendsList::iterator it = inflight_sends_.begin(); it != inflight_sends_.end();)
1167  {
1168  mpi::optional<mpi::status> ostatus = it->request.test();
1169  if (ostatus)
1170  {
1171  success = true;
1172  it = inflight_sends_.erase(it);
1173  }
1174  else
1175  {
1176  ++it;
1177  }
1178  }
1179  return success;
1180 }
1181 
1182 void
1183 diy::Master::
1184 show_incoming_records() const
1185 {
1186  for (IncomingRoundMap::const_iterator rounds_itr = incoming_.begin(); rounds_itr != incoming_.end(); ++rounds_itr)
1187  {
1188  for (IncomingQueuesMap::const_iterator it = rounds_itr->second.map.begin(); it != rounds_itr->second.map.end(); ++it)
1189  {
1190  const IncomingQueuesRecords& in_qrs = it->second;
1191  for (InQueueRecords::const_iterator cur = in_qrs.records.begin(); cur != in_qrs.records.end(); ++cur)
1192  {
1193  const QueueRecord& qr = cur->second;
1194  log->info("round: {}, {} <- {}: (size,external) = ({},{})",
1195  rounds_itr->first,
1196  it->first, cur->first,
1197  qr.size,
1198  qr.external);
1199  }
1200  for (IncomingQueues::const_iterator cur = in_qrs.queues.begin(); cur != in_qrs.queues.end(); ++cur)
1201  {
1202  log->info("round: {}, {} <- {}: queue.size() = {}",
1203  rounds_itr->first,
1204  it->first, cur->first,
1205  const_cast<IncomingQueuesRecords&>(in_qrs).queues[cur->first].size());
1206  }
1207  }
1208  }
1209 }
1210 
1211 #endif
Definition: master.hpp:120
Definition: master.hpp:95
void load(BinaryBuffer &bb, T &x)
Loads x from bb by calling diy::Serialization<T>::load(bb,x).
Definition: serialization.hpp:106
void load_back(BinaryBuffer &bb, T &x)
Supports only binary data copying (meant for simple footers).
Definition: serialization.hpp:120
Definition: master.hpp:356
Definition: master.hpp:146
Definition: storage.hpp:38
void save(BinaryBuffer &bb, const T &x)
Saves x to bb by calling diy::Serialization<T>::save(bb,x).
Definition: serialization.hpp:102
int gid(int i) const
return gid of the i-th block
Definition: master.hpp:219
Definition: master.hpp:384
unsigned int size() const
return the number of local blocks
Definition: master.hpp:233
int lid(int gid__) const
return the local id of the local block with global id gid, or -1 if not local
Definition: master.hpp:221
Master(mpi::communicator comm, int threads__=1, int limit__=-1, CreateBlock create_=0, DestroyBlock destroy_=0, ExternalStorage *storage=0, SaveBlock save=0, LoadBlock load_=0, QueuePolicy *q_policy=new QueueSizePolicy(4096))
The main DIY object.
Definition: master.hpp:164
void foreach_(const Callback< Block > &f, const Skip &s=NeverSkip())
call f with every block
Definition: master.hpp:703
void in(const RegularLink< Bounds > &link, const Point &p, OutIter out, const Bounds &domain)
Finds the neighbor(s) containing the target point.
Definition: pick.hpp:102
Definition: master.hpp:41
int add(int gid, void *b, Link *l)
add a block
Definition: master.hpp:658
Definition: master.hpp:35
Simple wrapper around MPI_Comm.
Definition: communicator.hpp:8
Definition: master.hpp:111
Communication proxy, used for enqueueing and dequeueing items for future exchange.
Definition: proxy.hpp:8
Definition: serialization.hpp:26
Definition: no-thread.hpp:27
Definition: master.hpp:353
Definition: master.hpp:72
Definition: master.hpp:333
Definition: master.hpp:47
const mpi::communicator & communicator() const
return the MPI communicator
Definition: master.hpp:212
void * release(int i)
release ownership of the block
Definition: master.hpp:678
Definition: request.hpp:5
Definition: master.hpp:136
Definition: master.hpp:55
bool local(int gid__) const
whether the block with global id gid is local
Definition: master.hpp:223
Definition: master.hpp:89
void exchange()
exchange the queues between all the blocks (collective operation)
Definition: master.hpp:801
Definition: master.hpp:104
Move queues out of core if their size exceeds a parameter given in the constructor.
Definition: master.hpp:80
Definition: master.hpp:131
mpi::communicator & communicator()
return the MPI communicator
Definition: master.hpp:214
Definition: no-thread.hpp:9