DIY  3.0
data-parallel out-of-core C++ library
 All Classes Namespaces Functions Typedefs Groups Pages
master.hpp
1 #ifndef DIY_MASTER_HPP
2 #define DIY_MASTER_HPP
3 
4 #include <vector>
5 #include <map>
6 #include <list>
7 #include <deque>
8 #include <algorithm>
9 #include <functional>
10 
11 #include "link.hpp"
12 #include "collection.hpp"
13 
14 // Communicator functionality
15 #include "mpi.hpp"
16 #include "serialization.hpp"
17 #include "detail/collectives.hpp"
18 #include "time.hpp"
19 
20 #include "thread.hpp"
21 
22 #include "detail/block_traits.hpp"
23 
24 #include "log.hpp"
25 #include "stats.hpp"
26 
27 namespace diy
28 {
29  // Stores and manages blocks; initiates serialization and communication when necessary.
30  //
31  // Provides a foreach function, which is meant as the main entry point.
32  //
33  // Provides a conversion between global and local block ids,
34  // which is hidden from blocks via a communicator proxy.
35  class Master
36  {
37  public:
38  struct ProcessBlock;
39 
40  template<class Block>
41  struct Binder;
42 
43  // Commands
44  struct BaseCommand;
45 
46  template<class Block>
47  struct Command;
48 
49  typedef std::vector<BaseCommand*> Commands;
50 
51  // Skip
52  using Skip = std::function<bool(int, const Master&)>;
53 
54  struct SkipNoIncoming;
55  struct NeverSkip { bool operator()(int i, const Master& master) const { return false; } };
56 
57  // Collection
58  typedef Collection::Create CreateBlock;
59  typedef Collection::Destroy DestroyBlock;
60  typedef Collection::Save SaveBlock;
61  typedef Collection::Load LoadBlock;
62 
63  public:
64  // Communicator types
65  struct Proxy;
66  struct ProxyWithLink;
67 
68  // foreach callback
69  template<class Block>
70  using Callback = std::function<void(Block*, const ProxyWithLink&)>;
71 
72  struct QueuePolicy
73  {
74  virtual bool unload_incoming(const Master& master, int from, int to, size_t size) const =0;
75  virtual bool unload_outgoing(const Master& master, int from, size_t size) const =0;
76  virtual ~QueuePolicy() {}
77  };
78 
81  {
82  QueueSizePolicy(size_t sz): size(sz) {}
83  bool unload_incoming(const Master& master, int from, int to, size_t sz) const { return sz > size; }
84  bool unload_outgoing(const Master& master, int from, size_t sz) const { return sz > size*master.outgoing_count(from); }
85 
86  size_t size;
87  };
88 
89  struct MessageInfo
90  {
91  int from, to;
92  int round;
93  };
94 
95  struct InFlightSend
96  {
97  std::shared_ptr<MemoryBuffer> message;
98  mpi::request request;
99 
100  // for debug purposes:
101  MessageInfo info;
102  };
103 
105  {
106  MemoryBuffer message;
107  MessageInfo info{ -1, -1, -1 };
108  };
109 
110  struct Collective;
111  struct tags { enum { queue, piece }; };
112 
113  typedef std::list<InFlightSend> InFlightSendsList;
114  typedef std::map<int, InFlightRecv> InFlightRecvsMap;
115  typedef std::list<int> ToSendList; // [gid]
116  typedef std::list<Collective> CollectivesList;
117  typedef std::map<int, CollectivesList> CollectivesMap; // gid -> [collectives]
118 
119 
120  struct QueueRecord
121  {
122  QueueRecord(size_t s = 0, int e = -1): size(s), external(e) {}
123  size_t size;
124  int external;
125  };
126 
127  typedef std::map<int, QueueRecord> InQueueRecords; // gid -> (size, external)
128  typedef std::map<int, MemoryBuffer> IncomingQueues; // gid -> queue
129  typedef std::map<BlockID, MemoryBuffer> OutgoingQueues; // (gid, proc) -> queue
130  typedef std::map<BlockID, QueueRecord> OutQueueRecords; // (gid, proc) -> (size, external)
132  {
133  InQueueRecords records;
134  IncomingQueues queues;
135  };
137  {
138  OutgoingQueuesRecord(int e = -1): external(e) {}
139  int external;
140  OutQueueRecords external_local;
141  OutgoingQueues queues;
142  };
143  typedef std::map<int, IncomingQueuesRecords> IncomingQueuesMap; // gid -> { gid -> queue }
144  typedef std::map<int, OutgoingQueuesRecord> OutgoingQueuesMap; // gid -> { (gid,proc) -> queue }
145 
147  {
148  IncomingQueuesMap map;
149  int received{0};
150  };
151  typedef std::map<int, IncomingRound> IncomingRoundMap;
152 
153 
154  public:
165  int threads = 1,
166  int limit = -1,
167  CreateBlock create = 0,
168  DestroyBlock destroy = 0,
169  ExternalStorage* storage = 0,
170  SaveBlock save = 0,
171  LoadBlock load = 0,
172  QueuePolicy* q_policy = new QueueSizePolicy(4096)):
173  blocks_(create, destroy, storage, save, load),
174  queue_policy_(q_policy),
175  limit_(limit),
176  threads_(threads == -1 ? thread::hardware_concurrency() : threads),
177  storage_(storage),
178  // Communicator functionality
179  comm_(comm),
180  expected_(0),
181  exchange_round_(-1),
182  immediate_(true)
183  {}
184  ~Master() { set_immediate(true); clear(); delete queue_policy_; }
185  inline void clear();
186  inline void destroy(int i) { if (blocks_.own()) blocks_.destroy(i); }
187 
188  inline int add(int gid, void* b, Link* l);
189  inline void* release(int i);
190 
192  inline void* block(int i) const { return blocks_.find(i); }
193  template<class Block>
194  Block* block(int i) const { return static_cast<Block*>(block(i)); }
195  inline Link* link(int i) const { return links_[i]; }
196  inline int loaded_block() const { return blocks_.available(); }
197 
198  inline void unload(int i);
199  inline void load(int i);
200  void unload(std::vector<int>& loaded) { for(unsigned i = 0; i < loaded.size(); ++i) unload(loaded[i]); loaded.clear(); }
201  void unload_all() { for(unsigned i = 0; i < size(); ++i) if (block(i) != 0) unload(i); }
202  inline bool has_incoming(int i) const;
203 
204  inline void unload_queues(int i);
205  inline void unload_incoming(int gid);
206  inline void unload_outgoing(int gid);
207  inline void load_queues(int i);
208  inline void load_incoming(int gid);
209  inline void load_outgoing(int gid);
210 
212  const mpi::communicator& communicator() const { return comm_; }
214  mpi::communicator& communicator() { return comm_; }
215 
217  void* get(int i) { return blocks_.get(i); }
219  int gid(int i) const { return gids_[i]; }
221  int lid(int gid) const { return local(gid) ? lids_.find(gid)->second : -1; }
223  bool local(int gid) const { return lids_.find(gid) != lids_.end(); }
224 
226  inline void exchange();
227  inline void process_collectives();
228 
229  inline
230  ProxyWithLink proxy(int i) const;
231 
233  unsigned size() const { return blocks_.size(); }
234  void* create() const { return blocks_.create(); }
235 
236  // accessors
237  int limit() const { return limit_; }
238  int threads() const { return threads_; }
239  int in_memory() const { return *blocks_.in_memory().const_access(); }
240 
241  void set_threads(int threads) { threads_ = threads; }
242 
243  CreateBlock creator() const { return blocks_.creator(); }
244  DestroyBlock destroyer() const { return blocks_.destroyer(); }
245  LoadBlock loader() const { return blocks_.loader(); }
246  SaveBlock saver() const { return blocks_.saver(); }
247 
249  template<class Block>
250  void foreach_(const Callback<Block>& f, const Skip& s = NeverSkip());
251 
252  template<class F>
253  void foreach(const F& f, const Skip& s = NeverSkip())
254  {
255  using Block = typename detail::block_traits<F>::type;
256  foreach_<Block>(f, s);
257  }
258 
259  inline void execute();
260 
261  bool immediate() const { return immediate_; }
262  void set_immediate(bool i) { if (i && !immediate_) execute(); immediate_ = i; }
263 
264  public:
265  // Communicator functionality
266  IncomingQueues& incoming(int gid) { return incoming_[exchange_round_].map[gid].queues; }
267  OutgoingQueues& outgoing(int gid) { return outgoing_[gid].queues; }
268  CollectivesList& collectives(int gid) { return collectives_[gid]; }
269  size_t incoming_count(int gid) const
270  {
271  IncomingRoundMap::const_iterator round_it = incoming_.find(exchange_round_);
272  if (round_it == incoming_.end())
273  return 0;
274  IncomingQueuesMap::const_iterator queue_it = round_it->second.map.find(gid);
275  if (queue_it == round_it->second.map.end())
276  return 0;
277  return queue_it->second.queues.size();
278  }
279  size_t outgoing_count(int gid) const { OutgoingQueuesMap::const_iterator it = outgoing_.find(gid); if (it == outgoing_.end()) return 0; return it->second.queues.size(); }
280 
281  void set_expected(int expected) { expected_ = expected; }
282  void add_expected(int i) { expected_ += i; }
283  int expected() const { return expected_; }
284  void replace_link(int i, Link* link) { expected_ -= links_[i]->size_unique(); delete links_[i]; links_[i] = link; expected_ += links_[i]->size_unique(); }
285 
286  public:
287  // Communicator functionality
288  inline void flush(); // makes sure all the serialized queues migrate to their target processors
289 
290  private:
291  // Communicator functionality
292  inline void comm_exchange(ToSendList& to_send, int out_queues_limit); // possibly called in between block computations
293  inline bool nudge();
294 
295  void cancel_requests(); // TODO
296 
297  // debug
298  inline void show_incoming_records() const;
299 
300  private:
301  std::vector<Link*> links_;
302  Collection blocks_;
303  std::vector<int> gids_;
304  std::map<int, int> lids_;
305 
306  QueuePolicy* queue_policy_;
307 
308  int limit_;
309  int threads_;
310  ExternalStorage* storage_;
311 
312  private:
313  // Communicator
314  mpi::communicator comm_;
315  IncomingRoundMap incoming_;
316  OutgoingQueuesMap outgoing_;
317  InFlightSendsList inflight_sends_;
318  InFlightRecvsMap inflight_recvs_;
319  CollectivesMap collectives_;
320  int expected_;
321  int exchange_round_;
322  bool immediate_;
323  Commands commands_;
324 
325  private:
326  fast_mutex add_mutex_;
327 
328  public:
329  std::shared_ptr<spd::logger> log = get_logger();
330  stats::Profiler prof;
331  };
332 
334  {
335  virtual ~BaseCommand() {} // to delete derived classes
336  virtual void execute(void* b, const ProxyWithLink& cp) const =0;
337  virtual bool skip(int i, const Master& master) const =0;
338  };
339 
340  template<class Block>
341  struct Master::Command: public BaseCommand
342  {
343  Command(Callback<Block> f_, const Skip& s_):
344  f(f_), s(s_) {}
345 
346  void execute(void* b, const ProxyWithLink& cp) const override { f(static_cast<Block*>(b), cp); }
347  bool skip(int i, const Master& m) const override { return s(i,m); }
348 
349  Callback<Block> f;
350  Skip s;
351  };
352 
354  { bool operator()(int i, const Master& master) const { return !master.has_incoming(i); } };
355 
357  {
358  Collective():
359  cop_(0) {}
360  Collective(detail::CollectiveOp* cop):
361  cop_(cop) {}
362  // this copy constructor is very ugly, but need it to insert Collectives into a list
363  Collective(const Collective& other):
364  cop_(0) { swap(const_cast<Collective&>(other)); }
365  ~Collective() { delete cop_; }
366 
367  void init() { cop_->init(); }
368  void swap(Collective& other) { std::swap(cop_, other.cop_); }
369  void update(const Collective& other) { cop_->update(*other.cop_); }
370  void global(const mpi::communicator& c) { cop_->global(c); }
371  void copy_from(Collective& other) const { cop_->copy_from(*other.cop_); }
372  void result_out(void* x) const { cop_->result_out(x); }
373 
374  detail::CollectiveOp* cop_;
375 
376  private:
377  Collective& operator=(const Collective& other);
378  };
379 }
380 
381 #include "proxy.hpp"
382 
383 // --- ProcessBlock ---
385 {
386  ProcessBlock(Master& master_,
387  const std::deque<int>& blocks_,
388  int local_limit_,
389  critical_resource<int>& idx_):
390  master(master_),
391  blocks(blocks_),
392  local_limit(local_limit_),
393  idx(idx_)
394  {}
395 
// Worker loop: repeatedly claims the next entry of `blocks` through the shared
// counter `idx` and runs every queued command on that block, until the list
// is exhausted. `local` tracks the blocks this worker has brought into (or
// found in) memory so that at most `local_limit` of them are kept at a time.
396  void process()
397  {
398  master.log->debug("Processing with thread: {}", this_thread::get_id());
399 
400  std::vector<int> local;
401  do
402  {
 // claim the next position in the work list; idx is shared between workers
 // NOTE(review): presumably access() serializes the increment — confirm in thread.hpp
403  int cur = (*idx.access())++;
404 
405  if ((size_t)cur >= blocks.size())
406  return;
407 
408  int i = blocks[cur];
 // block already in memory: account for it against this worker's local limit
409  if (master.block(i))
410  {
411  if (local.size() == (size_t)local_limit)
412  master.unload(local);
413  local.push_back(i);
414  }
415 
416  master.log->debug("Processing block: {}", master.gid(i));
417 
 // the block is skipped only if every queued command agrees to skip it
418  bool skip_block = true;
419  for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
420  {
421  if (!master.commands_[cmd]->skip(i, master))
422  {
423  skip_block = false;
424  break;
425  }
426  }
427 
428  IncomingQueuesMap &current_incoming = master.incoming_[master.exchange_round_].map;
429  if (skip_block)
430  {
431  if (master.block(i) == 0)
432  master.load_queues(i); // even though we are skipping the block, the queues might be necessary
433 
434  for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
435  {
436  master.commands_[cmd]->execute(0, master.proxy(i)); // 0 signals that we are skipping the block (even if it's loaded)
437 
438  // no longer need them, so get rid of them, rather than risk reloading
 // (this happens after each command, so later commands see empty incoming queues)
439  current_incoming[master.gid(i)].queues.clear();
440  current_incoming[master.gid(i)].records.clear();
441  }
442 
443  if (master.block(i) == 0)
444  master.unload_queues(i); // the block is not in memory, so hand its queues back to the unloading policy
445  }
446  else
447  {
448  if (master.block(i) == 0) // block unloaded
449  {
450  if (local.size() == (size_t)local_limit) // reached the local limit
451  master.unload(local);
452 
453  master.load(i);
454  local.push_back(i);
455  }
456 
457  for (size_t cmd = 0; cmd < master.commands_.size(); ++cmd)
458  {
459  master.commands_[cmd]->execute(master.block(i), master.proxy(i));
460 
461  // no longer need them, so get rid of them
462  current_incoming[master.gid(i)].queues.clear();
463  current_incoming[master.gid(i)].records.clear();
464  }
465  }
466  } while(true);
467 
468  // TODO: invoke opportunistic communication
469  // don't forget to adjust Master::exchange()
470  }
471 
472  static void run(void* bf) { static_cast<ProcessBlock*>(bf)->process(); }
473 
474  Master& master;
475  const std::deque<int>& blocks;
476  int local_limit;
478 };
479 // --------------------
480 
481 void
482 diy::Master::
483 clear()
484 {
485  for (unsigned i = 0; i < size(); ++i)
486  delete links_[i];
487  blocks_.clear();
488  links_.clear();
489  gids_.clear();
490  lids_.clear();
491  expected_ = 0;
492 }
493 
494 void
495 diy::Master::
496 unload(int i)
497 {
498  log->debug("Unloading block: {}", gid(i));
499 
500  blocks_.unload(i);
501  unload_queues(i);
502 }
503 
504 void
505 diy::Master::
506 unload_queues(int i)
507 {
508  unload_incoming(gid(i));
509  unload_outgoing(gid(i));
510 }
511 
512 void
513 diy::Master::
514 unload_incoming(int gid)
515 {
516  for (IncomingRoundMap::iterator round_itr = incoming_.begin(); round_itr != incoming_.end(); ++round_itr)
517  {
518  IncomingQueuesMap::iterator qmap_itr = round_itr->second.map.find(gid);
519  if (qmap_itr == round_itr->second.map.end())
520  {
521  continue;
522  }
523  IncomingQueuesRecords& in_qrs = qmap_itr->second;
524  for (InQueueRecords::iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
525  {
526  QueueRecord& qr = it->second;
527  if (queue_policy_->unload_incoming(*this, it->first, gid, qr.size))
528  {
529  log->debug("Unloading queue: {} <- {}", gid, it->first);
530  qr.external = storage_->put(in_qrs.queues[it->first]);
531  }
532  }
533  }
534 }
535 
536 void
537 diy::Master::
538 unload_outgoing(int gid)
539 {
540  OutgoingQueuesRecord& out_qr = outgoing_[gid];
541 
542  size_t out_queues_size = sizeof(size_t); // map size
543  size_t count = 0;
544  for (OutgoingQueues::iterator it = out_qr.queues.begin(); it != out_qr.queues.end(); ++it)
545  {
546  if (it->first.proc == comm_.rank()) continue;
547 
548  out_queues_size += sizeof(BlockID); // target
549  out_queues_size += sizeof(size_t); // buffer.position
550  out_queues_size += sizeof(size_t); // buffer.size
551  out_queues_size += it->second.size(); // buffer contents
552  ++count;
553  }
554  if (queue_policy_->unload_outgoing(*this, gid, out_queues_size - sizeof(size_t)))
555  {
556  log->debug("Unloading outgoing queues: {} -> ...; size = {}\n", gid, out_queues_size);
557  MemoryBuffer bb; bb.reserve(out_queues_size);
558  diy::save(bb, count);
559 
560  for (OutgoingQueues::iterator it = out_qr.queues.begin(); it != out_qr.queues.end();)
561  {
562  if (it->first.proc == comm_.rank())
563  {
564  // treat as incoming
565  if (queue_policy_->unload_incoming(*this, gid, it->first.gid, it->second.size()))
566  {
567  QueueRecord& qr = out_qr.external_local[it->first];
568  qr.size = it->second.size();
569  qr.external = storage_->put(it->second);
570 
571  out_qr.queues.erase(it++);
572  continue;
573  } // else keep in memory
574  } else
575  {
576  diy::save(bb, it->first);
577  diy::save(bb, it->second);
578 
579  out_qr.queues.erase(it++);
580  continue;
581  }
582  ++it;
583  }
584 
585  // TODO: this mechanism could be adjusted for direct saving to disk
586  // (without intermediate binary buffer serialization)
587  out_qr.external = storage_->put(bb);
588  }
589 }
590 
591 void
592 diy::Master::
593 load(int i)
594 {
595  log->debug("Loading block: {}", gid(i));
596 
597  blocks_.load(i);
598  load_queues(i);
599 }
600 
601 void
602 diy::Master::
603 load_queues(int i)
604 {
605  load_incoming(gid(i));
606  load_outgoing(gid(i));
607 }
608 
609 void
610 diy::Master::
611 load_incoming(int gid)
612 {
613  IncomingQueuesRecords& in_qrs = incoming_[exchange_round_].map[gid];
614  for (InQueueRecords::iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
615  {
616  QueueRecord& qr = it->second;
617  if (qr.external != -1)
618  {
619  log->debug("Loading queue: {} <- {}", gid, it->first);
620  storage_->get(qr.external, in_qrs.queues[it->first]);
621  qr.external = -1;
622  }
623  }
624 }
625 
626 void
627 diy::Master::
628 load_outgoing(int gid)
629 {
630  // TODO: we could adjust this mechanism to read directly from storage,
631  // bypassing an intermediate MemoryBuffer
632  OutgoingQueuesRecord& out_qr = outgoing_[gid];
633  if (out_qr.external != -1)
634  {
635  MemoryBuffer bb;
636  storage_->get(out_qr.external, bb);
637  out_qr.external = -1;
638 
639  size_t count;
640  diy::load(bb, count);
641  for (size_t i = 0; i < count; ++i)
642  {
643  BlockID to;
644  diy::load(bb, to);
645  diy::load(bb, out_qr.queues[to]);
646  }
647  }
648 }
649 
651 diy::Master::
652 proxy(int i) const
653 { return ProxyWithLink(Proxy(const_cast<Master*>(this), gid(i)), block(i), link(i)); }
654 
655 
656 int
658 add(int gid, void* b, Link* l)
659 {
660  if (*blocks_.in_memory().const_access() == limit_)
661  unload_all();
662 
663  lock_guard<fast_mutex> lock(add_mutex_); // allow to add blocks from multiple threads
664 
665  blocks_.add(b);
666  links_.push_back(l);
667  gids_.push_back(gid);
668 
669  int lid = gids_.size() - 1;
670  lids_[gid] = lid;
671  add_expected(l->size_unique()); // NB: at every iteration we expect a message from each unique neighbor
672 
673  return lid;
674 }
675 
676 void*
678 release(int i)
679 {
680  void* b = blocks_.release(i);
681  delete link(i); links_[i] = 0;
682  lids_.erase(gid(i));
683  return b;
684 }
685 
686 bool
687 diy::Master::
688 has_incoming(int i) const
689 {
690  const IncomingQueuesRecords& in_qrs = const_cast<Master&>(*this).incoming_[exchange_round_].map[gid(i)];
691  for (InQueueRecords::const_iterator it = in_qrs.records.begin(); it != in_qrs.records.end(); ++it)
692  {
693  const QueueRecord& qr = it->second;
694  if (qr.size != 0)
695  return true;
696  }
697  return false;
698 }
699 
700 template<class Block>
701 void
703 foreach_(const Callback<Block>& f, const Skip& skip)
704 {
705  auto scoped = prof.scoped("foreach");
706  commands_.push_back(new Command<Block>(f, skip));
707 
708  if (immediate())
709  execute();
710 }
711 
712 void
713 diy::Master::
714 execute()
715 {
716  log->debug("Entered execute()");
717  auto scoped = prof.scoped("execute");
718  //show_incoming_records();
719 
720  // touch the outgoing and incoming queues as well as collectives to make sure they exist
721  for (unsigned i = 0; i < size(); ++i)
722  {
723  outgoing(gid(i));
724  incoming(gid(i)); // implicitly touches queue records
725  collectives(gid(i));
726  }
727 
728  if (commands_.empty())
729  return;
730 
731  // Order the blocks, so the loaded ones come first
732  std::deque<int> blocks;
733  for (unsigned i = 0; i < size(); ++i)
734  if (block(i) == 0)
735  blocks.push_back(i);
736  else
737  blocks.push_front(i);
738 
739  // don't use more threads than we can have blocks in memory
740  int num_threads;
741  int blocks_per_thread;
742  if (limit_ == -1)
743  {
744  num_threads = threads_;
745  blocks_per_thread = size();
746  }
747  else
748  {
749  num_threads = std::min(threads_, limit_);
750  blocks_per_thread = limit_/num_threads;
751  }
752 
753  // idx is shared
754  critical_resource<int> idx(0);
755 
756  typedef ProcessBlock BlockFunctor;
757  if (num_threads > 1)
758  {
759  // launch the threads
760  typedef std::pair<thread*, BlockFunctor*> ThreadFunctorPair;
761  typedef std::list<ThreadFunctorPair> ThreadFunctorList;
762  ThreadFunctorList threads;
763  for (unsigned i = 0; i < (unsigned)num_threads; ++i)
764  {
765  BlockFunctor* bf = new BlockFunctor(*this, blocks, blocks_per_thread, idx);
766  threads.push_back(ThreadFunctorPair(new thread(&BlockFunctor::run, bf), bf));
767  }
768 
769  // join the threads
770  for(ThreadFunctorList::iterator it = threads.begin(); it != threads.end(); ++it)
771  {
772  thread* t = it->first;
773  BlockFunctor* bf = it->second;
774  t->join();
775  delete t;
776  delete bf;
777  }
778  } else
779  {
780  BlockFunctor bf(*this, blocks, blocks_per_thread, idx);
781  BlockFunctor::run(&bf);
782  }
783 
784  // clear incoming queues
785  incoming_[exchange_round_].map.clear();
786 
787  if (limit() != -1 && in_memory() > limit())
788  throw std::runtime_error(fmt::format("Fatal: {} blocks in memory, with limit {}", in_memory(), limit()));
789 
790  // clear commands
791  for (size_t i = 0; i < commands_.size(); ++i)
792  delete commands_[i];
793  commands_.clear();
794 }
795 
796 void
799 {
800  auto scoped = prof.scoped("exchange");
801  execute();
802 
803  log->debug("Starting exchange");
804 
805  // make sure there is a queue for each neighbor
806  for (int i = 0; i < (int)size(); ++i)
807  {
808  OutgoingQueues& outgoing_queues = outgoing_[gid(i)].queues;
809  OutQueueRecords& external_local = outgoing_[gid(i)].external_local;
810  if (outgoing_queues.size() < (size_t)link(i)->size())
811  for (unsigned j = 0; j < (unsigned)link(i)->size(); ++j)
812  {
813  if (external_local.find(link(i)->target(j)) == external_local.end())
814  outgoing_queues[link(i)->target(j)]; // touch the outgoing queue, creating it if necessary
815  }
816  }
817 
818  flush();
819  log->debug("Finished exchange");
820 }
821 
822 namespace diy
823 {
824 namespace detail
825 {
826  template <typename T>
827  struct VectorWindow
828  {
829  T *begin;
830  size_t count;
831  };
832 } // namespace detail
833 
834 namespace mpi
835 {
836 namespace detail
837 {
838  template<typename T> struct is_mpi_datatype< diy::detail::VectorWindow<T> > { typedef true_type type; };
839 
840  template <typename T>
841  struct mpi_datatype< diy::detail::VectorWindow<T> >
842  {
843  typedef diy::detail::VectorWindow<T> VecWin;
844  static MPI_Datatype datatype() { return get_mpi_datatype<T>(); }
845  static const void* address(const VecWin& x) { return x.begin; }
846  static void* address(VecWin& x) { return x.begin; }
847  static int count(const VecWin& x) { return static_cast<int>(x.count); }
848  };
849 }
850 } // namespace mpi::detail
851 
852 } // namespace diy
853 
854 /* Communicator */
// One pass of the exchange machinery, called repeatedly from flush():
//  1. takes gids off the front of `to_send` while fewer than
//     `out_queues_limit` sends are in flight, and for each one delivers its
//     queues: same-rank queues are moved straight into the current round's
//     incoming map, queues for other ranks are posted as non-blocking sends;
//  2. tests outstanding sends (nudge) until none of them completes;
//  3. receives every message that iprobe reports, reassembling multi-piece
//     messages, and files each completed queue under the round it was sent in.
855 void
856 diy::Master::
857 comm_exchange(ToSendList& to_send, int out_queues_limit)
858 {
 // buffers larger than this (minus the trailing MessageInfo) are sent in pieces
859  static const size_t MAX_MPI_MESSAGE_COUNT = INT_MAX;
860 
861  IncomingRound &current_incoming = incoming_[exchange_round_];
862  // isend outgoing queues, up to the out_queues_limit
863  while(inflight_sends_.size() < (size_t)out_queues_limit && !to_send.empty())
864  {
865  int from = to_send.front();
866 
867  // deal with external_local queues
 // (same-rank queues that unload_outgoing() placed in external storage one by one)
868  for (OutQueueRecords::iterator it = outgoing_[from].external_local.begin(); it != outgoing_[from].external_local.end(); ++it)
869  {
870  int to = it->first.gid;
871 
872  log->debug("Processing local queue: {} <- {} of size {}", to, from, it->second.size);
873 
874  QueueRecord& in_qr = current_incoming.map[to].records[from];
875  bool in_external = block(lid(to)) == 0;
876 
 // receiving block is not in memory: hand over the storage handle without loading the data
877  if (in_external)
878  in_qr = it->second;
879  else
880  {
881  // load the queue
882  in_qr.size = it->second.size;
883  in_qr.external = -1;
884 
885  MemoryBuffer bb;
886  storage_->get(it->second.external, bb);
887 
888  current_incoming.map[to].queues[from].swap(bb);
889  }
890  ++current_incoming.received;
891  }
892  outgoing_[from].external_local.clear();
893 
 // bring back the remote-bound queues if they were packed into external storage
894  if (outgoing_[from].external != -1)
895  load_outgoing(from);
896  to_send.pop_front();
897 
898  OutgoingQueues& outgoing = outgoing_[from].queues;
899  for (OutgoingQueues::iterator it = outgoing.begin(); it != outgoing.end(); ++it)
900  {
901  BlockID to_proc = it->first;
902  int to = to_proc.gid;
903  int proc = to_proc.proc;
904 
905  log->debug("Processing queue: {} <- {} of size {}", to, from, outgoing_[from].queues[to_proc].size());
906 
907  // There may be local outgoing queues that remained in memory
908  if (proc == comm_.rank()) // sending to ourselves: simply swap buffers
909  {
910  log->debug("Moving queue in-place: {} <- {}", to, from);
911 
912  QueueRecord& in_qr = current_incoming.map[to].records[from];
913  bool in_external = block(lid(to)) == 0;
914  if (in_external)
915  {
916  log->debug("Unloading outgoing directly as incoming: {} <- {}", to, from);
917  MemoryBuffer& bb = it->second;
918  in_qr.size = bb.size();
919  if (queue_policy_->unload_incoming(*this, from, to, in_qr.size))
920  in_qr.external = storage_->put(bb);
921  else
922  {
923  MemoryBuffer& in_bb = current_incoming.map[to].queues[from];
924  in_bb.swap(bb);
925  in_bb.reset();
926  in_qr.external = -1;
927  }
928  } else // !in_external
929  {
930  log->debug("Swapping in memory: {} <- {}", to, from);
931  MemoryBuffer& bb = current_incoming.map[to].queues[from];
932  bb.swap(it->second);
933  bb.reset();
934  in_qr.size = bb.size();
935  in_qr.external = -1;
936  }
937 
938  ++current_incoming.received;
939  continue;
940  }
941 
 // remote target: move the queue into a shared buffer that the in-flight record keeps alive
942  std::shared_ptr<MemoryBuffer> buffer = std::make_shared<MemoryBuffer>();
943  buffer->swap(it->second);
944 
945  MessageInfo info{from, to, exchange_round_};
946  if (buffer->size() <= (MAX_MPI_MESSAGE_COUNT - sizeof(info)))
947  {
 // fits in one message: append info as a footer (the receiver reads it with load_back)
948  diy::save(*buffer, info);
949 
950  inflight_sends_.emplace_back();
951  inflight_sends_.back().info = info;
952  inflight_sends_.back().request = comm_.isend(proc, tags::queue, buffer->buffer);
953  inflight_sends_.back().message = buffer;
954  }
955  else
956  {
957  int npieces = static_cast<int>((buffer->size() + MAX_MPI_MESSAGE_COUNT - 1)/MAX_MPI_MESSAGE_COUNT);
958 
959  // first send the head
 // (total size followed by info, tagged as a piece)
960  std::shared_ptr<MemoryBuffer> hb = std::make_shared<MemoryBuffer>();
961  diy::save(*hb, buffer->size());
962  diy::save(*hb, info);
963 
964  inflight_sends_.emplace_back();
965  inflight_sends_.back().info = info;
966  inflight_sends_.back().request = comm_.isend(proc, tags::piece, hb->buffer);
967  inflight_sends_.back().message = hb;
968 
969  // send the message pieces
970  size_t msg_buff_idx = 0;
971  for (int i = 0; i < npieces; ++i, msg_buff_idx += MAX_MPI_MESSAGE_COUNT)
972  {
 // only the final piece carries tags::queue, which tells the receiver the message is complete
973  int tag = (i == (npieces - 1)) ? tags::queue : tags::piece;
974 
975  detail::VectorWindow<char> window;
976  window.begin = &buffer->buffer[msg_buff_idx];
977  window.count = std::min(MAX_MPI_MESSAGE_COUNT, buffer->size() - msg_buff_idx);
978 
979  inflight_sends_.emplace_back();
980  inflight_sends_.back().info = info;
981  inflight_sends_.back().request = comm_.isend(proc, tag, window);
982  inflight_sends_.back().message = buffer;
983  }
984  }
985  }
986  }
987 
988  // kick requests
989  while(nudge());
990 
991  // check incoming queues
992  mpi::optional<mpi::status> ostatus = comm_.iprobe(mpi::any_source, mpi::any_tag);
993  while(ostatus)
994  {
 // partially received messages are tracked per source rank
995  InFlightRecv &ir = inflight_recvs_[ostatus->source()];
996 
997  if (ir.info.from == -1) // uninitialized
998  {
999  MemoryBuffer bb;
1000  comm_.recv(ostatus->source(), ostatus->tag(), bb.buffer);
1001 
1002  if (ostatus->tag() == tags::piece)
1003  {
 // head of a multi-piece message: read total size and info, then wait for the pieces
1004  size_t msg_size;
1005  diy::load(bb, msg_size);
1006  diy::load(bb, ir.info);
1007 
1008  ir.message.buffer.reserve(msg_size);
1009  }
1010  else // tags::queue
1011  {
 // complete single message: info is stored as a footer at the end of the buffer
1012  diy::load_back(bb, ir.info);
1013  ir.message.swap(bb);
1014  }
1015  }
1016  else
1017  {
 // continuation piece: grow the message and receive directly into the new tail
1018  size_t start_idx = ir.message.buffer.size();
1019  size_t count = ostatus->count<char>();
1020  ir.message.buffer.resize(start_idx + count);
1021 
1022  detail::VectorWindow<char> window;
1023  window.begin = &ir.message.buffer[start_idx];
1024  window.count = count;
1025 
1026  comm_.recv(ostatus->source(), ostatus->tag(), window);
1027  }
1028 
 // tags::queue marks a finished message (single message or last piece): file it
1029  if (ostatus->tag() == tags::queue)
1030  {
1031  size_t size = ir.message.size();
1032  int from = ir.info.from;
1033  int to = ir.info.to;
1034  int external = -1;
1035 
 // the message is filed under the round recorded in its info, which may be later than the current one
1036  assert(ir.info.round >= exchange_round_);
1037  IncomingRound *in = &incoming_[ir.info.round];
1038 
 // current round: unload only if the receiving block is out of memory;
 // other rounds: unload whenever a block limit is set; the queue policy has the final say
1039  bool unload_queue = ((ir.info.round == exchange_round_) ? (block(lid(to)) == 0) : (limit_ != -1)) &&
1040  queue_policy_->unload_incoming(*this, from, to, size);
1041  if (unload_queue)
1042  {
1043  log->debug("Directly unloading queue {} <- {}", to, from);
1044  external = storage_->put(ir.message); // unload directly
1045  }
1046  else
1047  {
1048  in->map[to].queues[from].swap(ir.message);
1049  in->map[to].queues[from].reset(); // buffer position = 0
1050  }
1051  in->map[to].records[from] = QueueRecord(size, external);
1052 
1053  ++(in->received);
1054  ir = InFlightRecv(); // reset
1055  }
1056 
1057  ostatus = comm_.iprobe(mpi::any_source, mpi::any_tag);
1058  }
1059 }
1060 
1061 void
1062 diy::Master::
1063 flush()
1064 {
1065 #ifdef DEBUG
1066  time_type start = get_time();
1067  unsigned wait = 1;
1068 #endif
1069 
1070  // prepare for next round
1071  incoming_.erase(exchange_round_);
1072  ++exchange_round_;
1073 
1074  // make a list of outgoing queues to send (the ones in memory come first)
1075  ToSendList to_send;
1076  for (OutgoingQueuesMap::iterator it = outgoing_.begin(); it != outgoing_.end(); ++it)
1077  {
1078  OutgoingQueuesRecord& out = it->second;
1079  if (out.external == -1)
1080  to_send.push_front(it->first);
1081  else
1082  to_send.push_back(it->first);
1083  }
1084  log->debug("to_send.size(): {}", to_send.size());
1085 
1086  // XXX: we probably want a cleverer limit than block limit times average number of queues per block
1087  // XXX: with queues we could easily maintain a specific space limit
1088  int out_queues_limit;
1089  if (limit_ == -1 || size() == 0)
1090  out_queues_limit = to_send.size();
1091  else
1092  out_queues_limit = std::max((size_t) 1, to_send.size()/size()*limit_); // average number of queues per block * in-memory block limit
1093 
1094  do
1095  {
1096  comm_exchange(to_send, out_queues_limit);
1097 
1098 #ifdef DEBUG
1099  time_type cur = get_time();
1100  if (cur - start > wait*1000)
1101  {
1102  log->warn("Waiting in flush [{}]: {} - {} out of {}",
1103  comm_.rank(), inflight_sends_.size(), incoming_[exchange_round_].received, expected_);
1104  wait *= 2;
1105  }
1106 #endif
1107  } while (!inflight_sends_.empty() || incoming_[exchange_round_].received < expected_ || !to_send.empty());
1108 
1109  outgoing_.clear();
1110 
1111  log->debug("Done in flush");
1112  //show_incoming_records();
1113 
1114  process_collectives();
1115 }
1116 
1117 void
1118 diy::Master::
1119 process_collectives()
1120 {
1121  auto scoped = prof.scoped("collectives");
1122 
1123  if (collectives_.empty())
1124  return;
1125 
1126  typedef CollectivesList::iterator CollectivesIterator;
1127  std::vector<CollectivesIterator> iters;
1128  std::vector<int> gids;
1129  for (CollectivesMap::iterator cur = collectives_.begin(); cur != collectives_.end(); ++cur)
1130  {
1131  gids.push_back(cur->first);
1132  iters.push_back(cur->second.begin());
1133  }
1134 
1135  while (iters[0] != collectives_.begin()->second.end())
1136  {
1137  iters[0]->init();
1138  for (unsigned j = 1; j < iters.size(); ++j)
1139  {
1140  // NB: this assumes that the operations are commutative
1141  iters[0]->update(*iters[j]);
1142  }
1143  iters[0]->global(comm_); // do the mpi collective
1144 
1145  for (unsigned j = 1; j < iters.size(); ++j)
1146  {
1147  iters[j]->copy_from(*iters[0]);
1148  ++iters[j];
1149  }
1150 
1151  ++iters[0];
1152  }
1153 }
1154 
1155 bool
1156 diy::Master::
1157 nudge()
1158 {
1159  bool success = false;
1160  for (InFlightSendsList::iterator it = inflight_sends_.begin(); it != inflight_sends_.end(); ++it)
1161  {
1162  mpi::optional<mpi::status> ostatus = it->request.test();
1163  if (ostatus)
1164  {
1165  success = true;
1166  InFlightSendsList::iterator rm = it;
1167  --it;
1168  inflight_sends_.erase(rm);
1169  }
1170  }
1171  return success;
1172 }
1173 
1174 void
1175 diy::Master::
1176 show_incoming_records() const
1177 {
1178  for (IncomingRoundMap::const_iterator rounds_itr = incoming_.begin(); rounds_itr != incoming_.end(); ++rounds_itr)
1179  {
1180  for (IncomingQueuesMap::const_iterator it = rounds_itr->second.map.begin(); it != rounds_itr->second.map.end(); ++it)
1181  {
1182  const IncomingQueuesRecords& in_qrs = it->second;
1183  for (InQueueRecords::const_iterator cur = in_qrs.records.begin(); cur != in_qrs.records.end(); ++cur)
1184  {
1185  const QueueRecord& qr = cur->second;
1186  log->info("round: {}, {} <- {}: (size,external) = ({},{})",
1187  rounds_itr->first,
1188  it->first, cur->first,
1189  qr.size,
1190  qr.external);
1191  }
1192  for (IncomingQueues::const_iterator cur = in_qrs.queues.begin(); cur != in_qrs.queues.end(); ++cur)
1193  {
1194  log->info("round: {}, {} <- {}: queue.size() = {}",
1195  rounds_itr->first,
1196  it->first, cur->first,
1197  const_cast<IncomingQueuesRecords&>(in_qrs).queues[cur->first].size());
1198  }
1199  }
1200  }
1201 }
1202 
1203 #endif
Definition: master.hpp:120
Definition: master.hpp:95
void load(BinaryBuffer &bb, T &x)
Loads x from bb by calling diy::Serialization<T>::load(bb,x).
Definition: serialization.hpp:106
void load_back(BinaryBuffer &bb, T &x)
Supports only binary data copying (meant for simple footers).
Definition: serialization.hpp:120
Definition: master.hpp:356
Definition: master.hpp:146
Definition: storage.hpp:41
void save(BinaryBuffer &bb, const T &x)
Saves x to bb by calling diy::Serialization<T>::save(bb,x).
Definition: serialization.hpp:102
int gid(int i) const
return gid of the i-th block
Definition: master.hpp:219
Definition: master.hpp:384
void foreach_(const Callback< Block > &f, const Skip &s=NeverSkip())
call f with every block
Definition: master.hpp:703
void in(const RegularLink< Bounds > &link, const Point &p, OutIter out, const Bounds &domain)
Finds the neighbor(s) containing the target point.
Definition: pick.hpp:102
Definition: master.hpp:41
int add(int gid, void *b, Link *l)
add a block
Definition: master.hpp:658
Definition: master.hpp:35
Simple wrapper around MPI_Comm.
Definition: communicator.hpp:8
Definition: master.hpp:111
Communication proxy, used for enqueueing and dequeueing items for future exchange.
Definition: proxy.hpp:8
Definition: serialization.hpp:26
Definition: no-thread.hpp:27
Definition: master.hpp:353
Definition: master.hpp:72
Definition: master.hpp:333
Definition: master.hpp:47
Master(mpi::communicator comm, int threads=1, int limit=-1, CreateBlock create=0, DestroyBlock destroy=0, ExternalStorage *storage=0, SaveBlock save=0, LoadBlock load=0, QueuePolicy *q_policy=new QueueSizePolicy(4096))
The main DIY object.
Definition: master.hpp:164
const mpi::communicator & communicator() const
return the MPI communicator
Definition: master.hpp:212
void * release(int i)
release ownership of the block
Definition: master.hpp:678
Definition: request.hpp:5
Definition: master.hpp:136
Definition: master.hpp:55
bool local(int gid) const
whether the block with global id gid is local
Definition: master.hpp:223
Definition: master.hpp:89
void exchange()
exchange the queues between all the blocks (collective operation)
Definition: master.hpp:798
Definition: master.hpp:104
Move queues out of core if their size exceeds a parameter given in the constructor.
Definition: master.hpp:80
unsigned size() const
return the number of local blocks
Definition: master.hpp:233
Definition: master.hpp:131
mpi::communicator & communicator()
return the MPI communicator
Definition: master.hpp:214
Definition: no-thread.hpp:9
int lid(int gid) const
return the local id of the local block with global id gid, or -1 if not local
Definition: master.hpp:221