// This example demonstrates how to compute a histogram across multiple datasets.
// The merged histogram is made available on the rank that owns the block with gid=0.

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <random>
#include <utility>
#include <vector>


#include <diy/mpi.hpp>
#include <diy/master.hpp>
#include <diy/reduce.hpp>
#include <diy/assigner.hpp>
#include <diy/decomposition.hpp>
#include <diy/serialization.hpp>
#include <diy/partners/merge.hpp>

#include "../opts.h"

// A synthetic dataset: the raw samples from which one block's histogram is
// built.
struct Dataset
{
  // Sample values; every call to fill() overwrites them.
  std::vector<double> values;

  // Resizes `values` to `nvalues` entries and overwrites each one with a
  // pseudo-random double drawn from a uniform distribution constructed with
  // the bounds 0.0 and 1000.0. The generator is seeded with `uid`, so calling
  // fill() again with the same uid reproduces the same sequence.
  void fill(int nvalues, int uid)
  {
    std::mt19937 engine(uid);
    std::uniform_real_distribution<> uniform(0.0, 1000.0);

    values.resize(nvalues);
    // Draw in element order so the i-th value is always the i-th draw.
    for (double& sample : values)
    {
      sample = uniform(engine);
    }
  }
};


// This is the information being accumulated, and therefore the block type
// handed to DIY: a histogram with equally wide bins over a fixed range.
// Histograms are combined bin by bin with accumulate().
struct Histogram
{
  // Lower and upper bound of the binned interval, as given to populate().
  std::pair<double, double> range;
  // Number of samples per bin; empty until populate() or accumulate() fills it.
  std::vector<int> counts;

  // Rebuilds this histogram from `ds`.
  //
  // nbins - number of equally wide bins; if it is not positive the histogram
  //         is left without bins (empty `counts`) instead of indexing out of
  //         bounds.
  // min   - lower bound of the binned interval.
  // max   - upper bound of the binned interval.
  // ds    - dataset whose `values` are counted.
  //
  // Every sample is counted exactly once: samples at or below `min` land in
  // the first bin, samples at or above `max` land in the last bin.
  void populate(int nbins, double min, double max, const Dataset& ds)
  {
    this->counts.clear();
    this->range.first = min;
    this->range.second = max;

    if (nbins <= 0)
    {
      return;
    }
    this->counts.resize(static_cast<size_t>(nbins), 0);

    const double delta = (max - min) / nbins;
    const int last = nbins - 1;
    for (double v : ds.values)
    {
      // Clamp while still in floating point: converting a NaN, an infinity or
      // an out-of-range double to int is undefined behaviour, and such
      // positions do occur (a NaN sample, or max == min making delta zero).
      const double pos = (v - this->range.first) / delta;

      int idx = 0; // also the bin used when pos <= 0 or pos is NaN
      if (pos >= last)
      {
        idx = last;
      }
      else if (pos > 0)
      {
        idx = static_cast<int>(pos);
      }
      this->counts[idx]++;
    }
  }

  // Adds the bin counts of `other` to this histogram.
  //
  // An `other` without bins contributes nothing. If this histogram has no
  // bins yet it becomes a copy of `other`, including its range. Otherwise
  // both histograms are expected to have the same number of bins.
  void accumulate(const Histogram& other)
  {
    if (other.counts.empty())
    {
      return;
    }
    if (this->counts.empty())
    {
      // Take the range along with the counts; the bins mean nothing
      // without it.
      this->range = other.range;
      this->counts = other.counts;
      return;
    }

    assert(this->counts.size() == other.counts.size());
    // With assertions compiled out, a size mismatch must still not read past
    // the end of the shorter vector.
    const size_t nshared = std::min(this->counts.size(), other.counts.size());
    for (size_t cc = 0; cc < nshared; ++cc)
    {
      this->counts[cc] += other.counts[cc];
    }
  }

  // Writes the bin counts to std::cout, ten per line, labelled with `gid`.
  void print(int gid) const
  {
    using std::cout;
    cout << "---------------------------" << '\n';
    cout << "gid: " << gid << '\n';
    cout << "histogram:";
    for (size_t cc = 0; cc < this->counts.size(); ++cc)
    {
      // Start a new line before every tenth count, including the first.
      if (cc % 10 == 0)
      {
        cout << '\n';
      }
      cout << this->counts[cc] << "  ";
    }
    // Flush once, after the whole report has been written.
    cout << '\n' << "---------------------------" << std::endl;
  }

  // Returns the total number of samples counted over all bins.
  int sum() const
  {
    return std::accumulate(this->counts.begin(), this->counts.end(), 0);
  }
};

// Serialization logic: tells DIY how to write a Histogram into a buffer and
// read it back, which the reduction needs in order to enqueue and dequeue
// histograms.
namespace diy
{
template <>
struct Serialization<Histogram>
{
  // Writes the range and then the bin counts of `hist` into `bb`.
  static void save(BinaryBuffer& bb, const Histogram& hist)
  {
    const auto& range  = hist.range;
    const auto& counts = hist.counts;

    // The diy:: qualification is required: an unqualified save() inside this
    // struct would name this static member rather than the library function.
    diy::save(bb, range);
    diy::save(bb, counts);
  }

  // Reads the fields of `hist` from `bb` in the order save() wrote them.
  static void load(BinaryBuffer& bb, Histogram& hist)
  {
    auto& range  = hist.range;
    auto& counts = hist.counts;

    diy::load(bb, range);
    diy::load(bb, counts);
  }
};
} // namespace diy

int main(int argc, char* argv[])
{
  std::srand(100);


  diy::mpi::environment env(argc, argv);
  diy::mpi::communicator world;

  int nblocks   = 128;  // global number of blocks
  int nvalues   = 1024; // number of values per block
  int nbins     = 10;   // number of bins in the histogram

  // get command line arguments
  using namespace opts;
  Options ops(argc, argv);
  ops
    >> Option('b', "blocks",  nblocks,        "number of blocks")
    >> Option('n', "values",  nvalues,        "number of values")
    >> Option('t', "bins",    nbins,          "number of bins")
    ;

  if (ops >> Present('h', "help", "show help"))
  {
    if (world.rank() == 0)
    {
      std::cout << ops;
    }
    return EXIT_FAILURE;
  }

  diy::Master master(world, 1, -1,
      []()->void* { return new Histogram(); },
      [](void* ptr) { delete static_cast<Histogram*>(ptr); }
      );

  diy::RoundRobinAssigner assigner(world.size(), nblocks);

  // let's create local datasets.
  std::vector<int> local_gids;
  assigner.local_gids(world.rank(), local_gids);

  std::vector<Dataset> local_blocks(local_gids.size());
  for (size_t cc=0; cc < local_blocks.size(); ++cc)
  {
    Dataset& ds = local_blocks[cc];
    ds.fill(nvalues, local_gids[cc]);
  }

  diy::RegularDecomposer<diy::DiscreteBounds> decomposer(1, diy::interval(0, nblocks-1), nblocks);
  decomposer.decompose(world.rank(), assigner, master);

  // let's build up local histograms.
  master.foreach([&](Histogram* hist, const diy::Master::ProxyWithLink& cp) {
      const int gid = cp.gid();
      const int lid = cp.master()->lid(gid);
      const Dataset& dataset = local_blocks[lid];
      hist->populate(nbins, 0, 1000.0, dataset);
  });

  diy::RegularMergePartners partners(decomposer, /*k=*/2);//, /*contiguous*/false);

  diy::reduce(master, assigner, partners,
      [](Histogram* hist, const diy::ReduceProxy& srp, const diy::RegularMergePartners&) {
      const auto selfid = srp.gid();

      // dequeue
      std::vector<int> incoming_gids;
      srp.incoming(incoming_gids);
      for (const int gid : incoming_gids)
      {
        if (selfid != gid)
        {
          Histogram incoming;
          srp.dequeue(gid, incoming);
          hist->accumulate(incoming);
        }
      }

      // enqueue
      for (int cc=0, max = srp.out_link().size(); cc < max; ++cc)
      {
        auto target = srp.out_link().target(cc);
        if (target.gid != selfid)
        {
          srp.enqueue(target, *hist);
        }
      }
  });

  // RegularMergePartners reduces result on block with gid=0, so we print the
  // result from that block alone.
  if (master.local(0))
  {
    Histogram* hist = static_cast<Histogram*>(master.get(master.lid(0)));
    hist->print(0);

    int tot_nvalues = nblocks*nvalues;
    int sum = hist->sum();
    std::cout << "tot values: " << sum << " (expected: " << tot_nvalues << ")" << std::endl;
    return tot_nvalues == sum ? EXIT_SUCCESS : EXIT_FAILURE;
  }

  return EXIT_SUCCESS;
}
