/*****************************************************************************
 *
 * Copyright (c) 2000 - 2012, Lawrence Livermore National Security, LLC
 * Produced at the Lawrence Livermore National Laboratory
 * LLNL-CODE-442911
 * All rights reserved.
 *
 * This file is  part of VisIt. For  details, see https://visit.llnl.gov/.  The
 * full copyright notice is contained in the file COPYRIGHT located at the root
 * of the VisIt distribution or at http://www.llnl.gov/visit/copyright.html.
 *
 * Redistribution  and  use  in  source  and  binary  forms,  with  or  without
 * modification, are permitted provided that the following conditions are met:
 *
 *  - Redistributions of  source code must  retain the above  copyright notice,
 *    this list of conditions and the disclaimer below.
 *  - Redistributions in binary form must reproduce the above copyright notice,
 *    this  list of  conditions  and  the  disclaimer (as noted below)  in  the
 *    documentation and/or other materials provided with the distribution.
 *  - Neither the name of  the LLNS/LLNL nor the names of  its contributors may
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT  HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR  IMPLIED WARRANTIES, INCLUDING,  BUT NOT  LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND  FITNESS FOR A PARTICULAR  PURPOSE
 * ARE  DISCLAIMED. IN  NO EVENT  SHALL LAWRENCE  LIVERMORE NATIONAL  SECURITY,
 * LLC, THE  U.S.  DEPARTMENT OF  ENERGY  OR  CONTRIBUTORS BE  LIABLE  FOR  ANY
 * DIRECT,  INDIRECT,   INCIDENTAL,   SPECIAL,   EXEMPLARY,  OR   CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT  LIMITED TO, PROCUREMENT OF  SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF  USE, DATA, OR PROFITS; OR  BUSINESS INTERRUPTION) HOWEVER
 * CAUSED  AND  ON  ANY  THEORY  OF  LIABILITY,  WHETHER  IN  CONTRACT,  STRICT
 * LIABILITY, OR TORT  (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY  WAY
 * OUT OF THE  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 *****************************************************************************/

// ************************************************************************* //
//                              avtChowderICAlgorithm.C                      //
// ************************************************************************* //

#include <avtChowderICAlgorithm.h>
#include <vtkRectilinearGrid.h>
#include <vtkFloatArray.h>
#include <vtkCellData.h>
#include <vtkDataSetWriter.h>
#include <TimingsManager.h>
#include <avtParallel.h>
#include <DebugStream.h>
#include <VisItStreamUtil.h>
#include <Block.h>
#include <iostream>
#include <fstream>
#include <avtPODICAlgorithm.h>

using namespace std;

// File-local helpers. Only declared here; their definitions are not in this
// part of the file.
static int randomIndex(int sz);
static float random_1();

#ifdef PARALLEL

// Definitions of the class's static int data members.
// NOTE(review): presumably message identifiers used by the IC
// request/stealing protocol -- confirm where they are sent and received.
int avtChowderICAlgorithm::TERMINATE = 1;
int avtChowderICAlgorithm::REQUEST = 2;
int avtChowderICAlgorithm::NO_THANKS = 3;

// Forward declaration; DoSyncCommunication uses it to total a vector<int>.
template <typename T>
static T sumArray(const vector<T> &a);

// Forward declaration; RunAlgorithm calls it as computeStats(t, NULL, &max).
// NOTE(review): presumably vm/vM/vAvg receive the min/max/average of v
// across ranks -- confirm at the definition.
static void
computeStats(float v, float *vm=NULL, float *vM=NULL, float *vAvg=NULL);

// ****************************************************************************
//  Method: avtChowderICAlgorithm::avtChowderICAlgorithm
//
//  Purpose:
//      avtChowderICAlgorithm constructor.
//
//  Programmer: Dave Pugmire
//  Creation:   January 27, 2009
//
//  Modifications:
//
//    Hank Childs, Sun Jun  6 12:21:30 CDT 2010
//    Remove reference to avtStreamlineFilter, add reference to avtPICSFilter.
//
// ****************************************************************************

avtChowderICAlgorithm::avtChowderICAlgorithm(avtPICSFilter *picsFilter, int count, int comm)
  : avtParICAlgorithm(picsFilter)
{
    // Values handed in by the caller.
    maxCount   = count;
    commMethod = comm;

    /*ROB*/
    // Test-seed parameters and popularity method.
    numTestSeeds   = 20;
    maxTestSteps   = 1000;
    minProbability = 0.05;
    popMethod      = PROB_TREE;

    // Balancing and block-subdivision defaults.
    doBalance     = true;
    subdivUniform = false;
    subdivNX      = 2;
    subdivNY      = 2;
    subdivNZ      = 2;
    subdivPct     = 0.10;

    doLazyLoading = false;
    ownerOffset   = 0;
    minBundle     = 1;

    //DRP. MAKE SURE WE CHANGE THIS.
    picsFilter->cacheQLen = 800;
    LOAD_TIME   = 10.0;
    ADVECT_TIME = 0.0;

    // Bookkeeping arrays sized per domain ...
    blockPopularity.resize(numDomains, 0.0f);
    blockPopularityREAL.resize(numDomains);
    allDomIntegrateSteps.resize(numDomains, 0);

    // ... and per rank.
    rankPopularity.resize(nProcs, 0.0f);
    rankPopularityREAL.resize(nProcs);
    allAdvectTime.resize(nProcs, 0.0f);
    allIOTime.resize(nProcs, 0.0f);
    allRankIntegrateSteps.resize(nProcs, 0);

    //Lots 'o randomness....
    // A fixed seed makes the rand() sequence reproducible from run to run.
    srand(0);

    // Diagnostic printing is enabled only when the job has at most parMin
    // ranks.
    const int  parMin   = 1;
    const bool smallRun = (PAR_Size() <= parMin);
    printStuff      = smallRun;
    printRank0Stuff = (smallRun && PAR_Rank() == 0);

    // Counters, all starting from zero.
    numTestParticlesSteps = 0;
    numBlocksDuplicated   = 0;
    totalNumICs    = 0;
    numTerminated  = 0;
    numICRequests  = 0;
    numICReqPosted = 0;
    numTermMsgs    = 0;
    numStolen      = 0;

    // Error accumulators; the maxima start at -1.
    sumErr  = 0.0;
    sumErr2 = 0.0;
    maxErr  = -1.0;
    maxErr2 = -1.0;
}

// ****************************************************************************
//  Method: avtChowderICAlgorithm::~avtChowderICAlgorithm
//
//  Purpose:
//      avtChowderICAlgorithm destructor.
//
//  Programmer: Dave Pugmire
//  Creation:   January 27, 2009
//
// ****************************************************************************

avtChowderICAlgorithm::~avtChowderICAlgorithm()
{
    // No explicit cleanup is performed here.
}

// ****************************************************************************
//  Method: avtChowderICAlgorithm::Initialize
//
//  Purpose:
//      Initialization.
//
//  Programmer: Dave Pugmire
//  Creation:   January 27, 2009
//
//  Modifications:
//
//   Dave Pugmire, Mon Mar 23 18:33:10 EDT 2009
//   Make changes for point decomposed domain databases.
//
//   Hank Childs, Fri Apr  3 16:26:24 PDT 2009
//   Change parallelization strategy, since it was loading up on the last
//   processor and we want it to be more spread out.
//
//   Hank Childs, Sun Jun  6 12:21:30 CDT 2010
//   Change name of method called to AddIntegralCurves.
//
// ****************************************************************************

void
avtChowderICAlgorithm::Initialize(vector<avtIntegralCurve *> &seedPts)
{
    perfectBlockPopularity.resize(numDomains, 0.0f);

    // PERFECT popularity: run a complete avtPODICAlgorithm pass over copies
    // of the seeds (ids negated) and take its per-domain integration step
    // counts, summed over all ranks, as the block popularity.
    if (popMethod == PERFECT)
    {
        perfInitSW.start();

        const int nSeeds = seedPts.size();
        vector<avtIntegralCurve *> probeICs(nSeeds, NULL);
        for (int s = 0; s < nSeeds; s++)
        {
            avtStreamlineIC *probe = makeIC(seedPts[s]->CurrentLocation(), -seedPts[s]->id);
            picsFilter->FindCandidateBlocks(probe);
            probeICs[s] = probe;
        }

        // NOTE(review): ownership of the probe curves after this point is
        // not visible here -- confirm they are released by the POD pass.
        avtPODICAlgorithm *podicAlgo = new avtPODICAlgorithm(picsFilter, maxCount, false, false, true);
        podicAlgo->Initialize(probeICs);
        picsFilter->exitAfterRunAlgorithm = true;
        podicAlgo->Execute();
        podicAlgo->CleanupRequests(avtParICAlgorithm::STREAMLINE_TAG);
        ADVECT_TIME = podicAlgo->advSW.t;
        podicAlgo->advSW.reset();

        for (int dom = 0; dom < numDomains; dom++)
            perfectBlockPopularity[dom] = (float)podicAlgo->domIntegrateSteps[dom];
        delete podicAlgo;

        MPI_Allreduce(MPI_IN_PLACE, &(perfectBlockPopularity[0]), numDomains, MPI_FLOAT, MPI_SUM, VISIT_MPI_COMM);
        perfInitSW.stop();
    }

    runSW.start();
    initSW.start();

    // Up to 64 receive buffers; smaller jobs use nProcs-1.
    int numRecvs = (nProcs < 64) ? nProcs-1 : 64;

    // Message receives are only posted when async communication or stealing
    // is enabled.
    int msgSz = 2;
    int numMsgRecv = (asyncComm || doStealing) ? numRecvs : 0;

    avtParICAlgorithm::InitializeBuffers(seedPts, msgSz, numMsgRecv, numRecvs);

    int nVals = 4; //(dst, numICs, numIters, totalICsFromSrc)
    int nNeighbors = 7; //6 neighboring blocks, plus self.
    int nSubdiv = std::max(subdivNX, std::max(subdivNY, subdivNZ));
    if (!subdivUniform)
        nSubdiv += 1;

    NVALS = nVals * nNeighbors * nSubdiv;

    //Assign the statically loaded domains.
    blockAssignments.resize(numDomains);
    for (int dom = 0; dom < numDomains; dom++)
    {
        BlockIDType blk(dom, 0);
        blockAssignments[dom].push_back(DomainToRank(blk));

        // operator[] creates the empty set on first use, so one insert
        // handles both a new key and an existing one.
        rankInfo2Helper[dom].insert(dom);
    }

    if (doBalance)
        BalanceWorkload(seedPts);
    initSW.stop();
    runSW.stop();

    AddIntegralCurves(seedPts);

    //Create rank to block map.
    rankToBlockMap.clear();
    for (int dom = 0; dom < numDomains; dom++)
    {
        for (size_t k = 0; k < blockAssignments[dom].size(); k++)
        {
            const int owner = blockAssignments[dom][k];
            rankToBlockMap[owner].insert(dom);
        }
    }

    //Determine ranks that share my blocks.
    // A block counts as "mine" when DomainToRank maps it to this rank or it
    // is listed in lazyLoadBlocks; every other rank assigned to such a block
    // is recorded as a peer.
    set<int> peers;
    for (int dom = 0; dom < numDomains; dom++)
    {
        BlockIDType blk(dom, 0);
        const bool staticallyMine = (DomainToRank(blk) == rank);
        if (!staticallyMine && lazyLoadBlocks.find(dom) == lazyLoadBlocks.end())
            continue;

        for (size_t k = 0; k < blockAssignments[dom].size(); k++)
        {
            if (blockAssignments[dom][k] != rank)
                peers.insert(blockAssignments[dom][k]);
        }
    }

    ranksWithMyBlocks.insert(ranksWithMyBlocks.end(), peers.begin(), peers.end());
    //if (rank < 10) cout<<rank<<": ranks with my blocks= "<<ranksWithMyBlocks<<endl;
}

// ****************************************************************************
//  Method: avtChowderICAlgorithm::AddIntegralCurves
//
//  Purpose:
//      Add streamlines
//
//  Programmer: Dave Pugmire
//  Creation:   December 3, 2009
//
//  Modifications:
//
//   Hank Childs, Thu Jun  3 10:22:16 PDT 2010
//   Use new name "GetCurrentLocation".
//
//   Hank Childs, Fri Jun  4 19:58:30 CDT 2010
//   Use avtStreamlines, not avtStreamlineWrappers.
//
//   Hank Childs, Sun Jun  6 12:21:30 CDT 2010
//   Rename method to AddIntegralCurves.
//
// ****************************************************************************

void
avtChowderICAlgorithm::AddIntegralCurves(vector<avtIntegralCurve *> &ics)
{
    // Keeps the curves whose first candidate block maps to this rank (the
    // first maxICsToTake as active, the rest as inactive), deletes all the
    // others, and records the global curve count in totalNumICs.
    const int timerHandle = visitTimer->StartTimer();
    /*
    for (int i = 0; i < ics.size(); i++)
        allSeeds.push_back(ics[i]->CurrentLocation());
    */

    //Split things up as evenly as possible across ranks.
    // When balancing, each rank activates at most its even share of the
    // seeds (never fewer than one); otherwise there is no cap.
    const int numTotSeeds = ics.size();
    int maxICsToTake = numTotSeeds;
    if (doBalance)
    {
        const int evenShare = numTotSeeds / nProcs;
        maxICsToTake = (evenShare == 0) ? 1 : evenShare;
    }

    int numActive = 0;
    for (size_t idx = 0; idx < ics.size(); ++idx)
    {
        avtIntegralCurve *curve = ics[idx];

        // Not owned by this rank: discard it.
        if (DomainToRank(curve->blockList.front()) != rank)
        {
            delete curve;
            continue;
        }

        // Owned, but the cap has been reached: park it as inactive.
        if (numActive >= maxICsToTake)
        {
            inactiveICs.push_back(curve);
            continue;
        }

        activeICs.push_back(curve);
        ++numActive;
    }
    //cout<<rank<<": MAGIC NUM= "<<maxICsToTake<<" i took "<<numActive<<" inactive "<<inactiveICs.size()<<endl;


#if 0
    vector<int> seedBlockCounts(numDomains, 0);
    vector<vector<pair<int,int> > > seedBlockIDRanges(numDomains);
    for (int i = 0; i < ics.size(); i++)
    {
        int dom = ics[i]->blockList.front().domain;
        seedBlockCounts[dom] ++;
    }
    for (int i = 0; i < numDomains; i++)
    {
        int nSeeds = seedBlockCounts[i];
        if (nSeeds == 0)
            continue;

        int nRanks = blockAssignments[i].size();
        int nPer = nSeeds/nRanks;
        int i0 = 0, i1 = 0;
        for (int j = 0; j < nRanks; j++)
        {
            i1 = i0+nPer;
            if (j == nRanks-1)
                i1 = nSeeds;
            seedBlockIDRanges[i].push_back(pair<int,int>(i0, i1));
            i0 = i1;
        }
    }
    if (printRank0Stuff)
    {
        cout<<"Seed Block Counts: "<<seedBlockCounts<<endl;
        cout<<"Seed Block Ranges: [";
        for (int i = 0; i < numDomains; i++)
            cout<<seedBlockIDRanges[i]<<", ";
        cout<<"]"<<endl;
    }

    //Get the ICs that I own.
    vector<int> seedBlockCounter(numDomains, 0);
    for (int i = 0; i < ics.size(); i++)
    {
        avtIntegralCurve *ic = ics[i];
        bool myBlock = DomainLoaded(ic->blockList.front());
        int dom = ic->blockList.front().domain;

        //DRP. This attempts to balance things up front, and will cause IO to happen.
        myBlock = false;
        for (int j = 0; j < blockAssignments[dom].size(); j++)
            if (blockAssignments[dom][j] == rank)
            {
                if (seedBlockCounter[dom] >= seedBlockIDRanges[dom][j].first &&
                    seedBlockCounter[dom] < seedBlockIDRanges[dom][j].second)
                {
                    myBlock = true;
                }
                break;
            }

        if (myBlock)
        {
            ic->originatingRank = rank;
            activeICs.push_back(ic);
            //cout<<PAR_Rank()<<" I own "<<ic->id<<endl;

#ifdef USE_IC_STATE_TRACKING
            ic->InitTrk();
#endif
        }
        else
            delete ic;

        seedBlockCounter[dom] ++;
    }
    if (DebugStream::Level1())
    {
        debug1<<"My ICcount= "<<activeICs.size()<<endl;
        debug1<<"I own: [";
        for (int i = 0; i < numDomains; i++)
        {
            BlockIDType d(i,0);
            if (OwnDomain(d))
            {
                debug1<<i<<" ";
            }
        }
        debug1<<"]\n";
    }
#endif

    // Global number of curves kept (active plus inactive) across all ranks.
    totalNumICs = activeICs.size() + inactiveICs.size();
    SumIntAcrossAllProcessors(totalNumICs);
    visitTimer->StopTimer(timerHandle, "AddIntegralCurves");
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::PreRunAlgorithm
//
// Purpose:
//     Calls InitializeLocators() on the PICS filter.
//
// Programmer:  Dave Pugmire
// Creation:    March 21, 2012
//
// ****************************************************************************

void
avtChowderICAlgorithm::PreRunAlgorithm()
{
    // The only work done here is asking the PICS filter to initialize its
    // locators.
    picsFilter->InitializeLocators();
}

void
avtChowderICAlgorithm::SolveAndTestDomainBlockStuff()
{
    //DRP
    if (rank != 0)
        return;

    //Load everything.
    for (int i = 0; i < numDomains; i++)
    {
        avtVector pt;
        BlockIDType blk;
        blk.domain = i;
        GetDomain(blk, pt);
    }

    vector<DomainBlock *> blockInfo2;
    //Create another blockInfo thingy.
    DomainBlock::CreateBlockInfo(blockInfo2, numDomains, picsFilter->intervalTree,
                                 subdivUniform,
                                 subdivNX, subdivNY, subdivNZ, subdivPct, false);

    cout<<"SOLVE Problem"<<endl;

    vector<int> itCounter(numDomains, 0);
    for (int i = 0; i < allSeeds.size(); i++)
    {
        int id = -i;
        if (i == 0) id = -100;
        avtStreamlineIC *ic = makeIC(allSeeds[i], id);
        GetDomain(ic);
        picsFilter->FindCandidateBlocks(ic);

        DomainBlock *blk0 = blockInfo2[ic->blockList.front().domain]->GetLeaf(ic->CurrentLocation());
        while (1)
        {
            GetDomain(ic);
            picsFilter->FindCandidateBlocks(ic);
            
            int d = ic->blockList.front().domain;
            int itC = ((avtStreamlineIC *)ic)->numSteps;
            avtVector p0 = ic->CurrentLocation();
            AdvectParticle(ic);
            avtVector pN = ic->CurrentLocation();
            itC = ((avtStreamlineIC *)ic)->numSteps - itC;
            itCounter[d] += itC;

            DomainBlock *blkN = blk0;
            if ( !ic->blockList.empty())
                blkN = blockInfo2[ic->blockList.front().domain]->GetLeaf(ic->CurrentLocation());
            
            //ic->trk<<p0<<":"<<blk0->nm<<" --"<<itC<<"--> "<<pN<<":"<<blkN->nm<<endl;
            //cout<<"IC= "<<ic->id<<endl;
            blk0->AddBlockData(blkN, 1, itC, 1);

            if (ic->status.Terminated())
                break;
            
            blk0 = blkN;
        }
        delete ic;
    }

    cout<<"ITER counts "<<itCounter<<endl;
    for (int i = 0; i < numDomains; i++)
        blockInfo2[i]->UnifyData();
    //DomainBlock::Dump(blockInfo2[0], cout, 2);

    vector<avtStreamlineIC *> allICs;
    for (int i = 0; i < allSeeds.size(); i++)
    {
        avtStreamlineIC *ic = makeIC(allSeeds[i], i);
        GetDomain(ic);
        picsFilter->FindCandidateBlocks(ic);
        allICs.push_back(ic);
    }

    //Now, compute the block popularity.
    if (popMethod == PROB_TREE)
        ComputeBlockPopProbTreeSER(allICs, blockInfo2, blockPopularityREAL, rankPopularityREAL);
    else
        ComputeBlockPopRandomWalkSER(allICs, blockInfo2, blockPopularityREAL, rankPopularityREAL);
    
    cout<<"Block Popularity: "<<blockPopularityREAL<<endl;
    cout<<"SOLVE Problem. DONE."<<endl;
    cout<<"blockInfo2:"<<endl;
    DomainBlock::Dump(blockInfo2[6], cout, 2, true);

    cout<<endl;
    cout<<"blockInfo:"<<endl;
    DomainBlock::Dump(blockInfo[6], cout, 2, true);

    for (int i = 0; i < allICs.size(); i++)
        delete allICs[i];
}

// ****************************************************************************
//  Method: avtChowderICAlgorithm::RunAlgorithm
//
//  Purpose:
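//      Run the parallel integral curve algorithm: advect the active curves
//      in batches of up to maxCount, calling HandleCommunication() between
//      batches until it reports that the work is done, then gather and
//      report timing statistics.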
//
//  Programmer: Dave Pugmire
//  Creation:   January 27, 2009
//
//  Modifications:
//
//   Dave Pugmire, Mon Mar 23 18:33:10 EDT 2009
//   Make changes for point decomposed domain databases.
//
//   Hank Childs, Sat Apr 11 23:18:32 CDT 2009
//   Make an explicit call to GetDomain before calling AdvectParticle.
//   If we don't make this call, AdvectParticle will call GetDomain for 
//   us.  But by calling it explicitly, it goes through the avtICAlgorithm
//   bookkeeping logic, meaning that I/O will correctly be counted as I/O,
//   instead of being rolled in with integration time.
//
//   Dave Pugmire, Thu Sep 24 13:52:59 EDT 2009
//   Change Execute to RunAlgorithm.
//
//   Dave Pugmire, Thu Dec  3 13:28:08 EST 2009
//   Move some initialization into RunAlgorithm.
//
//   Hank Childs, Fri Jun  4 19:58:30 CDT 2010
//   Use avtStreamlines, not avtStreamlineWrappers.
//
//   Hank Childs, Sun Jun  6 12:21:30 CDT 2010
//   Rename several methods that reflect the new emphasis in particle 
//   advection, as opposed to streamlines.
//
//   Hank Childs, Sat Nov 27 16:52:12 PST 2010
//   Add progress reporting.
//
//   David Camp, Mon Aug 15 09:36:04 PDT 2011
//   Pathline could have domains set to -1, which would cause them to be put
//   in the oob list and continuously process (hung process).
//
// ****************************************************************************

void
avtChowderICAlgorithm::RunAlgorithm()
{
    // Main loop: advect up to maxCount active curves, then call
    // HandleCommunication(); repeat until it returns true. Afterwards the
    // per-rank timings and step counts are reduced to rank 0 and reported.

    //Not needed now. Use computePerfectPopularity.
    //SolveAndTestDomainBlockStuff();
    //Barrier();
    
    int timer = visitTimer->StartTimer();

    domIntegrateSteps.resize(numDomains, 0);
    rankIntegrateSteps.resize(nProcs, 0);

    // Count the filter's initial I/O time as part of the run time.
    runSW.t += picsFilter->InitialIOTime;
    runSW.start();

    //load the duplicated blocks that are assigned to me.
    if (!doLazyLoading)
    {
        avtVector pt;
        set<int>::iterator it = lazyLoadBlocks.begin();
        while (it != lazyLoadBlocks.end())
        {
            BlockIDType blk;
            blk.domain = (*it);
            GetDomain(blk, pt);
            numBlocksDuplicated++;
            it++;
        }
    }

    commSW.start();
    bool done = HandleCommunication();
    commSW.stop();
    int round = 0;

    //ICLOG<<"CHW_Algo: BEGIN"<<endl;
    while (!done)
    {
        //if (PAR_Rank() == 0) cout<<"******************************* ROUND "<<round<<endl;
        int cnt = 0;
        //ICLOG<<"BEG "<<round<<": active:"<<activeICs.size()<<" term: "<<terminatedICs.size()<<" inactive: "<<inactiveICs.size()<<endl;
        while (cnt < maxCount && !activeICs.empty())
        {
            avtIntegralCurve *ic = activeICs.front();
            activeICs.pop_front();

            //Send out requests while we integrate the LAST ic.
            if (activeICs.empty())
                CommReq();
            GetDomain(ic);

            //DRP: Note. AdvectParticle is tweaking the IC status. AtSpatial is being removed IF the domain is loaded.
            //we DON'T want this. We want to control where this particle goes......

            avtVector p0 = ic->CurrentLocation();
            double t0 = ic->CurrentTime();
            int itC = ((avtStreamlineIC *)ic)->numSteps;
            int d = ic->blockList.front().domain;
            advSW.start();
            //DomainBlock *blk0 = DomainBlock::GetLeaf(blockInfo, ic->CurrentLocation());
            int nIters = AdvectParticle(ic);
            //DomainBlock *blkN = DomainBlock::GetLeaf(blockInfo, ic->CurrentLocation());
            //blk0->AddBlockData(blkN, 1, itC, 1);
            advSW.stop();
            // Steps taken by this advection call alone, credited to the
            // domain the curve started in and to this rank.
            itC = ((avtStreamlineIC *)ic)->numSteps - itC;
            domIntegrateSteps[d] += itC;
            rankIntegrateSteps[rank] += itC;
            //ic->trk<<p0<<" "<<d<<" --"<<itC<<"-->"<<ic->blockList<<" "<<ic->CurrentLocation()<<endl;
            //if (itC != nIters) cout<<"ITERS DONT MATCH!!!"<<endl;
            //ic->trk<<blk0->nm<<","<<itC<<" ==> "<<blkN->nm<<" ";
            //ic->trk<<ic->CurrentLocation()<<" "<<ic->blockList<<" steps= "<<itC<<" "<<ic->status<<endl;

            //if (ic->id == 0)cout<<"ic: "<<p0<<" "<<t0<<" --> ";
            //if (ic->id == 0)cout<<"ic: "<<ic->CurrentLocation()<<" "<<ic->CurrentTime()<<" "<<ic->status<<" "<<ic->blockList<<endl;

            if (ic->status.Terminated())
            {
                terminatedICs.push_back(ic);
                numTerminated++;
            }
            else
                inactiveICs.push_back(ic);

            if (asyncComm || doStealing) CheckMessages();
            cnt++;
        }
        //ICLOG<<"END "<<round<<": active:"<<activeICs.size()<<" term: "<<terminatedICs.size()<<" inactive: "<<inactiveICs.size()<<endl;

        commSW.start();
        done = HandleCommunication();
        commSW.stop();
        round++;
        //ICLOG<<endl;
    }
    //ICLOG<<"CHW_Algo: END"<<endl<<endl;

    runSW.stop();
    DumpPythonBalanceCode();
    DumpStats();


    // Build the two summary strings handed to DumpInfo(): tstr holds
    // stopwatch ratios, sstr holds the configuration and message counters.
    stringstream sstr, tstr;
    float maxInit, maxTst, maxITst, maxOTst, maxSTst, maxProb, maxBlk, maxUpBlk[4];
    computeStats(initSW.t, NULL, &maxInit);
    computeStats(tstPtsSW.t, NULL, &maxTst);
    computeStats(tstIPtsSW.t, NULL, &maxITst);
    computeStats(tstOPtsSW.t, NULL, &maxOTst);
    computeStats(tstPtsSyncSW.t, NULL, &maxSTst);
    computeStats(probSW.t, NULL, &maxProb);
    computeStats(upBlkSW.t, NULL, &maxBlk);
    computeStats(upBlk0SW.t, NULL, &(maxUpBlk[0]));
    computeStats(upBlk1SW.t, NULL, &(maxUpBlk[1]));
    computeStats(upBlk2SW.t, NULL, &(maxUpBlk[2]));
    computeStats(upBlk3SW.t, NULL, &(maxUpBlk[3]));

    tstr<<" IN%[S "<<maxTst/maxInit<<" SI "<<maxITst/maxInit<<" SO "<<maxOTst/maxInit;
    tstr<<" SS "<<maxSTst/maxInit<<" P "<<maxProb/maxInit<<" B "<<maxBlk/maxInit;
    tstr<<" B0 "<<maxUpBlk[0]/maxBlk<<" B1 "<<maxUpBlk[1]/maxBlk<<" B2 "<<maxUpBlk[2]/maxBlk<<" B3 "<<maxUpBlk[3]/maxBlk;
    tstr<<"]";

    sstr<<"subDiv: "<<subdivUniform<<" ["<<subdivNX<<" "<<subdivNY<<" "<<subdivNZ<<"] "<<subdivPct<<" ";
    sstr<<"tst: "<<numTestSeeds<<" ";
    sstr<<"prob: "<<minProbability<<" ";
    sstr<<"pop: "<<popMethod<<" ";
    sstr<<"bal: "<<doBalance<<" ";
    sstr<<"ERRORS: "<<sumErr<<" "<<maxErr<<" ";
    if (doStealing)
    {
        int minReq = UnifyMinimumValue(numICRequests);
        int maxReq = UnifyMaximumValue(numICRequests);
        int totReq = numICRequests;
        SumIntAcrossAllProcessors(totReq);
        sstr<<"ICReq:("<<maxICReq<<" "<<maxStealIC<<")("<<minReq<<" "<<maxReq<<"):"<<(float)totReq/(float)nProcs<<" ";
        int minSteal = UnifyMinimumValue(numStolen);
        int maxSteal = UnifyMaximumValue(numStolen);
        int totSteal = numStolen;
        SumIntAcrossAllProcessors(totSteal);
        sstr<<"ICSteal:("<<minSteal<<" "<<maxSteal<<"):"<<(float)totSteal/(float)nProcs<<" ";
    }

    if (asyncComm)
    {
        int minTerm = UnifyMinimumValue(numTermMsgs);
        int maxTerm = UnifyMaximumValue(numTermMsgs);
        int totTerm = numTermMsgs;
        // Reduce the local copy, as the request/steal statistics above do,
        // so that the printed average uses the global total and the member
        // counter keeps its per-rank value.
        SumIntAcrossAllProcessors(totTerm);
        sstr<<"Term:("<<minTerm<<" "<<maxTerm<<"):"<<(float)totTerm/(float)nProcs<<" ";
    }

    DumpInfo(tstr.str(), sstr.str());

    TotalTime.value += visitTimer->StopTimer(timer, "Execute");

    // Gather per-rank values on rank 0: each rank fills only its own slot of
    // a zeroed array, so a SUM reduction assembles the full table.
    vector<float> tmp(nProcs, 0.0), advT(nProcs, 0.0), waitT(nProcs, 0.0);
    tmp[rank] = advSW.t;
    MPI_Reduce(&tmp[0], &advT[0], nProcs, MPI_FLOAT, MPI_SUM, 0, VISIT_MPI_COMM);
    tmp[rank] = syncWaitSW.t;
    MPI_Reduce(&tmp[0], &waitT[0], nProcs, MPI_FLOAT, MPI_SUM, 0, VISIT_MPI_COMM);

    vector<int> tmpi(nProcs, 0), steps(nProcs, 0);
    tmpi[rank] = rankIntegrateSteps[rank];
    MPI_Reduce(&tmpi[0], &steps[0], nProcs, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);
    int totSteps = 0;
    for (int i = 0; i < nProcs; i++) totSteps += steps[i];

    // Same gather trick for the per-rank, per-domain step counts, one
    // reduction per rank row.
    vector<vector<int> > tmpii(nProcs), rankDomStps(nProcs);
    for (int i = 0; i < nProcs; i++) tmpii[i].resize(numDomains);
    for (int i = 0; i < numDomains; i++)
        tmpii[rank][i] = domIntegrateSteps[i];

    for (int i = 0; i < nProcs; i++)
    {
        rankDomStps[i].resize(numDomains);
        MPI_Reduce(&(tmpii[i][0]), &(rankDomStps[i][0]), numDomains, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);

        //if (rank == 0) cout<<"meow: "<<i<<" "<<rankDomStps[i]<<endl;
    }


    // Per-rank report, currently disabled.
    if (false)
        //if (rank == 0)
    {
        vector<int> totStps(numDomains, 0);
        for (int d = 0; d < numDomains; d++)
            for (int r = 0; r < nProcs; r++)
                totStps[d] += rankDomStps[r][d];

        cout<<"********************************************************"<<endl;
        cout<<"R:  advT waitT %step [doms] [domStp%]"<<endl;
        for (int i = 0; i < nProcs; i++)
        {
            cout<<i<<": ";
            char x[128];
            sprintf(x, "%4.3f %4.3f %4.3f", advT[i], waitT[i], (float)steps[i]/(float)totSteps);
            cout<<x<<" ";
            vector<int> myB;
            for (int d = 0; d < numDomains; d++)
                for (int r = 0; r < blockAssignments[d].size(); r++)
                    if (blockAssignments[d][r] == i)
                        myB.push_back(d);
            cout<<myB;
            vector<float> myStp;
            for (int d = 0; d < myB.size(); d++)
            {
                float n = 0.0;
                if (totStps[myB[d]] > 0)
                    n = (float)rankDomStps[i][myB[d]] / (float)totStps[myB[d]];

                //n = (float)totStps[myB[d]];
                //myStp.push_back(n);
                //n = (float)rankDomStps[i][myB[d]];
                myStp.push_back(n);
            }
            cout<<" "<<myStp;
            /*
               cout<<"[";
               for (int d = 0; d < myB.size(); d++)
               {
               if (d != 0) cout<<" ";
               cout<<myB[d]<<":"<<myStp[d];
               }
               cout<<"]";
               */
            cout<<endl;
        }
    }

    Barrier();
    if (abortWhenDone)
        MPI_Abort(VISIT_MPI_COMM, -1);
}

// One synchronous exchange round. Returns true when no integral curves
// remain on any rank. Otherwise every inactive curve is handed to a
// destination rank chosen by DomainToRank4 for the domain of its first
// candidate block, the curves are sent and received, and false is returned.
// Every rank must call this together: it contains collective MPI operations.
bool
avtChowderICAlgorithm::RRSSyncCommunication()
{
    // Global count of curves still in flight (active + inactive).
    syncWaitSW.start();
    int numICs = inactiveICs.size() + activeICs.size();
    SumIntAcrossAllProcessors(numICs);
    totalNumICs = numICs;
    syncWaitSW.stop();
    
    if (numICs == 0)
        return true;

    //ownerOffset = 0;
    //minBundle = 1;

    // rankWork[r]   : number of curves rank r currently has to work on.
    // domainWork[d] : (my inactive curves headed for domain d, my rank).
    // maxRanks[d]   : result of the MAXLOC reduction over domainWork[d].
    vector<int> rankWork;
    vector<toSend> domainWork; 
    vector<toSend> maxRanks; 

    rankWork.resize(nProcs, 0);
    domainWork.resize(numDomains);
    maxRanks.resize(numDomains);

    for(int i = 0; i < numDomains; i++)
    {
        domainWork[i].numICs = 0;
        domainWork[i].rank = rank;
    }

    // Bucket my inactive curves by the domain of their first candidate block.
    list<avtIntegralCurve*>::iterator s;
    map<int, vector<avtIntegralCurve *> > domSendICs;
    map<int, vector<avtIntegralCurve *> >::iterator it;
    set<int> destDoms; 
    for (s = inactiveICs.begin(); s != inactiveICs.end(); s++)
    {
        int theDom = (*s)->blockList.front().domain;
        
        domainWork[theDom].numICs++;

        destDoms.insert(theDom);
        it = domSendICs.find(theDom);
        if (it == domSendICs.end())
        {
            vector<avtIntegralCurve *> v;
            v.push_back(*s);
            domSendICs[theDom] = v;
        }
        else
            it->second.push_back(*s);
    }

    //same as before.
    // Each rank contributes only its own slot, so the sum yields the full
    // table of active-curve counts on every rank.
    rankWork[rank] = activeICs.size();
    MPI_Allreduce(MPI_IN_PLACE, &(rankWork[0]), nProcs, MPI_INT, MPI_SUM, VISIT_MPI_COMM);

    // icCounts[r]: number of curves this rank will send to rank r.
    int *icCounts = new int[nProcs], *allCounts = new int[nProcs];
    for (int i = 0; i < nProcs; i++) 
        icCounts[i] = 0; 

    map<int, vector<avtIntegralCurve *> > sendICs;
    set<int> receivers;
    bool workToSend = true;

    // Each pass selects, per domain, the rank reported by MAXLOC and lets it
    // hand out all of its curves for that domain. That rank then zeroes its
    // count, so later passes pick the remaining ranks. The loop ends once
    // every domain reports a count of zero.
    while(workToSend)
    {
        //each domain has #ICs, and max sending rank.
        // NOTE(review): MPI_2INT/MPI_MAXLOC expects toSend to be laid out as
        // two ints, value first and index second -- confirm the struct.
        MPI_Allreduce(&(domainWork[0]), &(maxRanks[0]), numDomains, MPI_2INT, MPI_MAXLOC, VISIT_MPI_COMM);
        workToSend = false;

        for(int i = 0; i < numDomains; i++)
        {
            if(!maxRanks[i].numICs)
                continue;

            workToSend = true;

            int numSending = maxRanks[i].numICs;

            // Hand the curves out in bundles of at most minBundle. The
            // destination choice and the rankWork update run on every rank,
            // not only on the sender.
            while(numSending > 0)
            {
                int domRank = DomainToRank4(i, maxRanks[i].rank, rankWork);
                int sendingNow = minBundle;
                if(sendingNow > numSending)
                    sendingNow = numSending;

                rankWork[domRank]+=sendingNow;

                // Only the selected rank actually moves curve pointers.
                if(rank == maxRanks[i].rank)
                {
                    vector<avtIntegralCurve *>v;
                    it = domSendICs.find(i);

                    // Take the bundle from the back of this domain's bucket.
                    for(int j = 0; j < sendingNow; j++)
                    {
                        v.push_back(it->second.back());
                        it->second.pop_back();
                    }
                    // Destination is me: the curves simply become active.
                    if(domRank == rank)
                        activeICs.insert(activeICs.end(), v.begin(), v.end());

                    else
                    {
                        icCounts[domRank] += sendingNow;
                        sendICs[domRank].insert(sendICs[domRank].begin(), v.begin(), v.end());
                    }
                }
                numSending-=sendingNow;
            }
            if(rank == maxRanks[i].rank)
                domainWork[i].numICs = 0;
        }
    }

    //if (!inactiveICs.empty()) ICLOG<<" Sending: "<<inactiveICs.size()<<" to "<<receivers<<endl;
    //if (!inactiveICs.empty()) cout<<rank<<":  Sending: "<<inactiveICs.size()<<" to "<<receivers<<endl;
    // The loop above has moved every inactive curve into activeICs or
    // sendICs, so only the list of pointers is dropped here.
    inactiveICs.clear();

    // allCounts[r]: total number of curves headed to rank r from all ranks.
    SumIntArrayAcrossAllProcessors(icCounts, allCounts, nProcs);
    bool anyToSend = false;
    for (int i = 0; i < nProcs && !anyToSend; i++)
        anyToSend = (allCounts[i] > 0);

    int incomingCnt = allCounts[rank];

    /*
       if (rank == 0)
       {
       cout<<"HandleComm: "<<numICs<<" [";
       for (int i = 0; i < nProcs; i++)
       cout<<allCounts[i]<<" ";
       cout<<"]"<<endl;
       }
       */

    //Send out my ICs.
    for (it = sendICs.begin(); it != sendICs.end(); it++)
        SendICs(it->first, it->second);

    //Wait till I get all my ICs.
    set<int> senders;
    int totIncoming = incomingCnt;
    while (incomingCnt > 0)
    {
        list<ICCommData> ics;
        list<ICCommData>::iterator s;

        debug1<<"CNT: "<<incomingCnt<<endl;
        RecvAny(NULL, &ics, NULL, true);
        for (s = ics.begin(); s != ics.end(); s++)
        {
            avtIntegralCurve *ic = (*s).ic;

            //See if I have this block.
            // Walk the candidate blocks in order. Blocks that are not loaded
            // here are collected in tmp; a loaded block that does not
            // contain the curve is dropped.
            BlockIDType blk;
            list<BlockIDType> tmp;
            bool blockFound = false;
            while (!ic->blockList.empty())
            {
                blk = ic->blockList.front();
                ic->blockList.pop_front();
                bool mine = DomainLoaded(blk);
                // With lazy loading, a block listed in lazyLoadBlocks is
                // loaded on first demand.
                if (!mine && doLazyLoading &&lazyLoadBlocks.find(blk.domain) != lazyLoadBlocks.end())
                {
                    //cout<<rank<<" "<<blk.domain<<": **************************************************LAZY LOAD!!!!!!"<<endl;
                    avtVector pt;
                    GetDomain(blk, pt);
                    numBlocksDuplicated++;
                    mine = true;
                }
                
                if (mine)
                {
                    if (picsFilter->ICInBlock(ic, blk))
                    {
                        // Found its home: pin the curve to this block and
                        // make it active.
                        ic->status.ClearSpatialBoundary();
                        ic->blockList.clear();
                        ic->blockList.push_back(blk);
                        blockFound = true;
                        activeICs.push_back(ic);
                        senders.insert((*s).rank);
                        break;
                    }
                }
                else
                    tmp.push_back(blk);
            }

            //IC Not in my blocks.  Terminate if blockList empty, or send to
            //block owner of next block in list.
            if (!blockFound)
            {
                ic->blockList = tmp;
                if (ic->blockList.empty())
                    terminatedICs.push_back(ic);
                else
                    inactiveICs.push_back(ic);
            }
        }

        incomingCnt -= ics.size();
        CheckPendingSendRequests();
    }
    //if (totIncoming>0) ICLOG<<" Received: "<<totIncoming<<" from "<<senders<<endl;
    debug1<<" Received: "<<totIncoming<<" from "<<senders<<endl;

    CheckPendingSendRequests(); 
    delete [] icCounts;
    delete [] allCounts;

    return false;
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::DoSyncCommunication
//
// Purpose: Process communication using globally synchronized (collective)
//          exchanges of integral curves.
//
// Programmer:  Dave Pugmire
// Creation:    March 21, 2012
//
// Modifications:
//
//   Dave Pugmire, Fri Mar  8 15:49:14 EST 2013
//   Bug fix. Ensure that the same IC isn't sent to the same rank. Also, when
//   an IC is received, set the domain from the particle point.
//
// ****************************************************************************

bool
avtChowderICAlgorithm::DoSyncCommunication()
{
    // Synchronous exchange step: every rank (1) contributes to a global count
    // of active+inactive ICs, (2) announces how many ICs it will send to each
    // other rank, (3) sends its inactive ICs, and (4) blocks until it has
    // received the number of ICs that the other ranks announced for it.
    //
    // Returns true when the global IC count is zero (nothing left to do),
    // false otherwise.

    // Compile-time switch between the two counting / routing strategies below.
    static const bool useMethod2 = true;
    vector<int> icArr;
    int numICs = inactiveICs.size() + activeICs.size();

    // NOTE(review): CommReq() is defined elsewhere; presumably it posts a
    // request for work when this rank has nothing active -- confirm.
    if (activeICs.empty())
        CommReq();

    //cout<<PAR_Rank()<<" numICs= "<<numICs<<endl;
    //See if we're done.
    syncWaitSW.start();

    if (useMethod2)
    {
        // Global sum of the per-rank IC counts.
        SumIntAcrossAllProcessors(numICs);
        totalNumICs = numICs;
    }
    else
    {
        // Layout of icArr after the reduction:
        //   [0, nProcs)        active-IC count of each rank
        //   [nProcs, 2*nProcs) inactive-IC count of each rank
        icArr.resize(nProcs*2, 0);
        icArr[rank] = activeICs.size();
        icArr[nProcs+rank] = inactiveICs.size();
        MPI_Allreduce(MPI_IN_PLACE, &(icArr[0]), 2*nProcs, MPI_INT, MPI_SUM, VISIT_MPI_COMM);
        numICs = sumArray(icArr);
        totalNumICs = numICs;
    }
    syncWaitSW.stop();
    MsgCnt.value++;

    /*
       int numInactive = inactiveICs.size();
       SumIntAcrossAllProcessors(numInactive);
       if (rank == 0) cout<<"avtPODICAlgorithm::HandleCommunication() numInActives= "<<numInactive<<" total= "<<numICs<<endl;
       */

    //debug1<<"avtPODICAlgorithm::HandleCommunication() numICs= "<<numICs<<endl;

    //ICLOG<<" Num ICS= "<<numICs<<endl;
    if (numICs == 0)
        return true;

    // if you want this algo operate on demand (load all data blocks, uncomment
    // this friendly piece of code.
    /*
    //Force it to be POS.
    activeICs.insert(activeICs.end(), inactiveICs.begin(), inactiveICs.end());
    inactiveICs.clear();
    return false;
    */

    //Tell everyone how many ICs are coming their way.
    // icCounts[r]  = number of ICs this rank sends to rank r.
    // allCounts[r] = the same quantity summed over all ranks (filled below).
    // Both arrays are released at the end of the function.
    int *icCounts = new int[nProcs], *allCounts = new int[nProcs];
    for (int i = 0; i < nProcs; i++)
        icCounts[i] = 0;

    list<avtIntegralCurve*>::iterator s;
    map<int, vector<avtIntegralCurve *> > sendICs;
    map<int, vector<avtIntegralCurve *> >::iterator it;
    // NOTE(review): 'tmp' is never used in this function.
    list<avtIntegralCurve*> tmp;
    // NOTE(review): 'receivers' only feeds the commented-out ICLOG line below.
    set<int> receivers;
    for (s = inactiveICs.begin(); s != inactiveICs.end(); s++)
    {
        // Choose the destination rank from the first block in the IC's list.
        int domRank;
        //domRank = DomainToRank((*s)->blockList.front());
        if (useMethod2)
            domRank = DomainToRank2((*s)->blockList.front());
        else
            domRank = DomainToRank3((*s)->blockList.front(), icArr);

        //cout<<"Sending ("<<(*s)->blockList.front()<<") r= "<<rank<<" --> r= "<<domRank<<endl;
        receivers.insert(domRank);
        if (domRank == rank)
        {
            // Destination is this rank: reactivate locally, nothing to send.
            activeICs.push_back(*s);
            continue;
        }

        icCounts[domRank]++;

        //Add to sending map.
        it = sendICs.find(domRank);
        if (it == sendICs.end())
        {
            vector<avtIntegralCurve *> v;
            v.push_back(*s);
            sendICs[domRank] = v;
        }
        else
            it->second.push_back(*s);
    }
    //if (!inactiveICs.empty()) ICLOG<<" Sending: "<<inactiveICs.size()<<" to "<<receivers<<endl;
    // Every inactive IC is now either in activeICs or queued in sendICs.
    inactiveICs.clear();

    SumIntArrayAcrossAllProcessors(icCounts, allCounts, nProcs);
    // NOTE(review): 'anyToSend' is computed but never read afterwards.
    bool anyToSend = false;
    for (int i = 0; i < nProcs && !anyToSend; i++)
        anyToSend = (allCounts[i] > 0);

    // Number of ICs the other ranks announced for this rank.
    int incomingCnt = allCounts[rank];

    /*
       if (rank == 0)
       {
       cout<<"HandleComm: "<<numICs<<" [";
       for (int i = 0; i < nProcs; i++)
       cout<<allCounts[i]<<" ";
       cout<<"]"<<endl;
       }
       */

    //Send out my ICs.
    for (it = sendICs.begin(); it != sendICs.end(); it++)
        SendICs(it->first, it->second);

    //Wait till I get all my ICs.
    // NOTE(review): 'senders' and 'totIncoming' only feed the commented-out
    // ICLOG line after the loop.
    set<int> senders;
    int totIncoming = incomingCnt;
    while (incomingCnt > 0)
    {
        list<ICCommData> ics;
        list<ICCommData>::iterator s;

        // Last argument is 'true'; presumably a blocking receive -- confirm
        // against RecvAny's declaration.
        RecvAny(NULL, &ics, NULL, true);
        for (s = ics.begin(); s != ics.end(); s++)
        {
            avtIntegralCurve *ic = (*s).ic;

            //See if I have this block.
            // Walk the IC's candidate blocks in order. Blocks not available
            // here are collected in 'tmp'; blocks that are available but do
            // not contain the IC are dropped.
            BlockIDType blk;
            list<BlockIDType> tmp;
            bool blockFound = false;
            while (!ic->blockList.empty())
            {
                blk = ic->blockList.front();
                ic->blockList.pop_front();
                bool mine = DomainLoaded(blk);
                if (!mine && doLazyLoading &&lazyLoadBlocks.find(blk.domain) != lazyLoadBlocks.end())
                {
                    // Block was marked for lazy loading on this rank: fetch it
                    // now and count it as a duplicated block.
                    //cout<<rank<<" "<<blk.domain<<": **************************************************LAZY LOAD!!!!!!"<<endl;
                    avtVector pt;
                    GetDomain(blk, pt);
                    numBlocksDuplicated++;
                    mine = true;
                }
                
                if (mine)
                {
                    if (picsFilter->ICInBlock(ic, blk))
                    {
                        // Found the containing block: pin the IC to it and
                        // make it active on this rank.
                        ic->status.ClearSpatialBoundary();
                        ic->blockList.clear();
                        ic->blockList.push_back(blk);
                        blockFound = true;
                        activeICs.push_back(ic);
                        senders.insert((*s).rank);
                        break;
                    }
                }
                else
                    tmp.push_back(blk);
            }

            //IC Not in my blocks.  Terminate if blockList empty, or send to
            //block owner of next block in list.
            if (!blockFound)
            {
                ic->blockList = tmp;
                if (ic->blockList.empty())
                    terminatedICs.push_back(ic);
                else
                    inactiveICs.push_back(ic);
            }
        }

        incomingCnt -= ics.size();
        CheckPendingSendRequests();
    }
    //if (totIncoming>0) ICLOG<<" Received: "<<totIncoming<<" from "<<senders<<endl;

    CheckPendingSendRequests(); 
    delete [] icCounts;
    delete [] allCounts;

    return false;
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::DoAsyncCommunication
//
// Purpose: One step of the message-driven exchange. Hands off the inactive
//          integral curves (ICs), receives any incoming messages and ICs,
//          and sorts each received IC into active / terminated / forwarded.
//
// Returns: true when totalNumICs has reached zero, false otherwise.
//          Raises VisItException if totalNumICs is negative.
//
// ****************************************************************************

bool
avtChowderICAlgorithm::DoAsyncCommunication()
{
    //cout<<rank<<" HandleCommunication("<<numTerm<<") totalICs= "<<totalNumICs<<endl;
    CheckPendingSendRequests();

    //Send out ICs.
    if (!inactiveICs.empty())
    {
        CommICs(inactiveICs);
        inactiveICs.clear();
    }
    //Send out terminations.
    // NOTE(review): CommTerm() is defined elsewhere; presumably it reports the
    // locally terminated count to the other ranks -- confirm.
    if (activeICs.empty())
        CommTerm();
    if (totalNumICs == 0)
        return true;

    //Now, see if anything is coming my way.
    list<ICCommData> ics;
    vector<MsgCommData> msgs;
    // With nothing active locally but ICs remaining, pass 'true' as RecvAny's
    // last argument (presumably "block until something arrives" -- confirm).
    bool blockAndWait = activeICs.empty() && (totalNumICs > 0);
    if (blockAndWait)
    {
        //if (blockAndWait) ICLOG<<" BlockAndWait. #ICs= "<<totalNumICs<<endl;
        CommReq();
    }

    syncWaitSW.start();
    CheckPendingSendRequests();
    RecvAny(&msgs, &ics, NULL, blockAndWait);
    syncWaitSW.stop();
    ProcessMessages(msgs);

    //Got some ICs, so reset the IC Req counter.
    if (!ics.empty())
    {
        //if (numICReqPosted>0&&rank>=0) cout<<rank<<" Received some ICs: zapReqPosts. #ics= "<<ics.size()<<endl;
        numICReqPosted = 0;
    }

    // ICs that arrived here but belong to a block this rank does not have.
    list<avtIntegralCurve *> notMine;
    list<ICCommData>::iterator s;
    // NOTE(review): 'senders' and 'totIncoming' only feed the commented-out
    // ICLOG line after the loop.
    set<int> senders;
    int totIncoming = 0;
    for (s = ics.begin(); s != ics.end(); s++)
    {
        avtIntegralCurve *ic = (*s).ic;
        //See if I have this block.
        // Walk the IC's candidate blocks in order. Blocks not available here
        // are collected in 'tmp'; blocks that are available but do not contain
        // the IC are dropped.
        BlockIDType blk;
        list<BlockIDType> tmp;
        bool blockFound = false;
        while (!ic->blockList.empty())
        {
            blk = ic->blockList.front();
            ic->blockList.pop_front();
            bool mine = DomainLoaded(blk);
            if (!mine && doLazyLoading && lazyLoadBlocks.find(blk.domain) != lazyLoadBlocks.end())
            {
                // Block was marked for lazy loading on this rank: fetch it now
                // and count it as a duplicated block.
                //cout<<rank<<" "<<blk.domain<<": *********LAZY LOAD!!!!!!"<<endl;
                avtVector pt;
                GetDomain(blk, pt);
                numBlocksDuplicated++;
                mine = true;
            }       
            if (mine)
            {
                if (picsFilter->ICInBlock(ic, blk))
                {
                    // Found the containing block: pin the IC to it and make it
                    // active on this rank.
                    ic->status.ClearSpatialBoundary();
                    ic->blockList.clear();
                    ic->blockList.push_back(blk);
                    blockFound = true;
                    activeICs.push_back(ic);
                    senders.insert((*s).rank);
                    totIncoming++;
                    break;
                }
            }
            else
                tmp.push_back(blk);
        }

        //IC Not in my blocks.  Terminate if blockList empty, or send to
        //block owner of next block in list.
        if (!blockFound)
        {
            ic->blockList = tmp;
            if (ic->blockList.empty())
            {
                terminatedICs.push_back(ic);
                numTerminated++;
            }
            else
                notMine.push_back(ic);
        }
    }
    //if (totIncoming > 0) ICLOG<<" Received: "<<totIncoming<<" from "<<senders<<endl;
    
    // Forward the ICs that could not be placed here.
    if (!notMine.empty())
        CommICs(notMine);
    CheckPendingSendRequests();
    // totalNumICs is not modified in this function; a negative value means the
    // bookkeeping done elsewhere went wrong.
    if (totalNumICs < 0)
        EXCEPTION1(VisItException, "Error: Number of ICs is negative. Bug in communication");

    return (totalNumICs == 0);
}


/*
   void
   avtChowderICAlgorithm::RunAlgorithm()
   {
   cout<<rank<<": "<<__LINE__<<endl;
   while (1)
   {
   cout<<rank<<": "<<__LINE__<<endl;
   ActivateICs();
   cout<<rank<<": "<<__LINE__<<endl;
   if (activeICs.empty())
   break;

   cout<<rank<<": "<<__LINE__<<endl;
   while (!activeICs.empty())
   {
   avtIntegralCurve *ic = activeICs.front();
   activeICs.pop_front();
   GetDomain(ic);
   cout<<rank<<": "<<__LINE__<<endl;
   do
   {
   AdvectParticle(ic);
   cout<<rank<<": "<<__LINE__<<" "<<ic->id<<" "<<ic->status<<endl;
   }
   while (ic->status.Integrateable() &&
   DomainLoaded(ic->blockList.front()));

   cout<<rank<<": "<<__LINE__<<endl;

   if (ic->status.EncounteredSpatialBoundary())
   inactiveICs.push_back(ic);
   else
   terminatedICs.push_back(ic);
   cout<<rank<<": "<<__LINE__<<endl;
   }
   }
   }
   */

// Plain record holding a count (num), a domain id (dom), a probability
// (prob) and an index (ri). A default-constructed node has num=0, dom=-1,
// prob=0 and ri=0.
class node
{
    public:
        node() : dom(-1), ri(0), num(0), prob(0.0f) {}
        node(int n, int d, float p, int i) : dom(d), ri(i), num(n), prob(p) {}

        int dom, ri, num;
        float prob;
};

// Prints a node as "(num dom prob ri)".
inline ostream&
operator<<(ostream &out, const node &n)
{
    out << "(" << n.num << " " << n.dom << " " << n.prob << " " << n.ri << ")";
    return out;
}

// One row of a per-block lookup table: a domain id (d), a threshold value
// (p) and a float payload (it) that GetEntry reports as an integer step count.
class blockStat
{
    public:
        blockStat() : d(-1), p(0.0f), it(0.0f) {}
        blockStat(int _d, float _p, float _it) : d(_d), p(_p), it(_it) {}

        int d;
        float p, it;

        // Scans blockStats front to back and reports the first entry whose p
        // is >= r. On a hit, nextDom receives that entry's d, stepsTaken its
        // 'it' (converted to int), and the function returns true. Returns
        // false, leaving both outputs untouched, when no entry qualifies.
        static bool GetEntry(float r, vector<blockStat> &blockStats, int &nextDom, int &stepsTaken)
        {
            vector<blockStat>::const_iterator cur = blockStats.begin();
            for ( ; cur != blockStats.end(); ++cur)
            {
                if (r <= cur->p)
                {
                    nextDom = cur->d;
                    stepsTaken = cur->it;
                    return true;
                }
            }
            return false;
        }

        // Strict-weak ordering on p, ascending.
        static bool cmp(blockStat x, blockStat y) {return y.p > x.p;}
};

// Prints a blockStat as "(d p it)".
inline ostream&
operator<<(ostream &out, const blockStat &b)
{
    out << "(" << b.d << " " << b.p << " " << b.it << ")";
    return out;
}


void
avtChowderICAlgorithm::BuildDomainInfo(std::vector<domInfo> &di)
{
    di.resize(0);
    float dt = 0.0;
    for (int i = 0; i < numDomains; i++)
        dt += blockPopularity[i];

    for (int i = 0; i < numDomains; i++)
        di.push_back(domInfo(i, blockPopularity[i]/dt));
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::BuildRankInfo2
//
// Purpose: Rebuild the per-rank cost table r (one rankInfo2 per rank) from
//          rankInfo2Helper (rank -> set of assigned domains),
//          blockAssignments and blockPopularity.
//
//          For each domain d assigned to rank p:
//            iters    = blockPopularity[d] / (number of ranks holding d)
//            it_cost  = iters * ADVECT_TIME
//            loadCost = 0 for the first domain in the rank's set,
//                       LOAD_TIME for every further one
//          and the rank totals are the sums of those values;
//          t_cost = it_cost + io_cost.
//
// Arguments:
//   r      Output. Reset and resized to nProcs entries.
//
// Notes:   A rank with no entry in rankInfo2Helper gets an empty block list
//          and zero costs; the lookup result is no longer dereferenced
//          without checking it against end().
//          The time spent here is accumulated in upBlk0SW.
//
// ****************************************************************************

void
avtChowderICAlgorithm::BuildRankInfo2(std::vector<rankInfo2> &r)
{
    //This takes 99.999% of the time in the block update.
    upBlk0SW.start();
    r.resize(0);
    r.resize(nProcs);

    for (int p = 0; p < nProcs; p++)
    {
        r[p].rank = p;
        r[p].iters = 0.0f;
        r[p].it_cost = 0.0f;
        r[p].io_cost = 0.0f;

        // rankInfo2Helper is filled lazily by AssignBlock, so a rank may be
        // missing from it.
        map<int, set<int> >::iterator mit = rankInfo2Helper.find(p);
        if (mit != rankInfo2Helper.end())
        {
            int i = 0;
            set<int>::iterator sit;
            for (sit = mit->second.begin(); sit != mit->second.end(); sit++, i++)
            {
                int d = *(sit);
                rankInfo2::blockInfo bi;
                bi.iters = blockPopularity[d]/(float)blockAssignments[d].size();
                bi.it_cost = bi.iters*ADVECT_TIME;
                bi.dom = d;
                // The first domain of a rank carries no load cost.
                if (i == 0)
                    bi.loadCost = 0.0f;
                else
                    bi.loadCost = LOAD_TIME;

                r[p].blocks.push_back(bi);
                r[p].iters += bi.iters;

                r[p].it_cost += bi.it_cost;
                r[p].io_cost += bi.loadCost;
            }
        }

        r[p].t_cost = r[p].it_cost + r[p].io_cost;
        sort(r[p].blocks.begin(), r[p].blocks.end(), rankInfo2::d_cmp);
    }
    upBlk0SW.stop();
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::BuildRankInfo
//
// Purpose: Rebuild ri with one rankInfo per rank, holding the rank id, its
//          share of the work as computed by ComputeBalance, and the list of
//          domains assigned to it in blockAssignments.
//
// Arguments:
//   ri     Output. Cleared, then filled with nProcs entries in rank order.
//
// Notes:   The domain lists are gathered in a single pass over
//          blockAssignments instead of rescanning the whole table once per
//          rank. Each rank's list keeps the same order as before (ascending
//          domain id). Rank ids outside [0, nProcs) are ignored, as before.
//
// ****************************************************************************

void
avtChowderICAlgorithm::BuildRankInfo(std::vector<rankInfo> &ri)
{
    vector<float> balance;
    ComputeBalance(balance);

    vector<vector<int> > domsByRank(nProcs);
    for (int d = 0; d < numDomains; d++)
    {
        const vector<int> &owners = blockAssignments[d];
        const int nOwners = (int)owners.size();
        for (int i = 0; i < nOwners; i++)
        {
            const int p = owners[i];
            if (p >= 0 && p < nProcs)
                domsByRank[p].push_back(d);
        }
    }

    ri.resize(0);
    for (int p = 0; p < nProcs; p++)
        ri.push_back(rankInfo(p, balance[p], domsByRank[p]));

    /*
       if (rank == 0)
       {
       cout<<"blockAssignments:"<<endl;
       for (int i=0; i < numDomains; i++)
       cout<<i<<" rankies= "<<blockAssignments[i]<<endl;
       rankInfo::printIt(ri);
       }
       */
}

// sort() predicate over (block, duplication count) pairs: orders by the
// count (second member), largest first.
static bool biggestDup(pair<int,int> x, pair<int,int> y)
{
    return x.second > y.second;
}

// For every domain, compute how many additional copies it should get.
//
// The target copy count of domain i is its popularity share (dinfo[i].pop)
// divided by the even share 1/numDomains, rounded to the nearest integer.
// The result is that target minus the number of ranks already holding the
// domain (blockAssignments[i].size()), clamped at zero.
//
// Returns a vector with one entry per element of dinfo.
static vector<int>
computeDomainMcNeedy(vector<domInfo> &dinfo, vector<vector<int> > &blockAssignments)
{
    const int numDomains = dinfo.size();
    const float perfectBal = 1.0/(float)numDomains;

    vector<int> blockDup(numDomains);
    for (int dom = 0; dom < numDomains; dom++)
    {
        int extra = (int)(dinfo[dom].pop/perfectBal + 0.5);
        // Only domains that want at least one copy are charged for the
        // copies they already have.
        if (extra > 0)
            extra -= blockAssignments[dom].size();
        blockDup[dom] = (extra < 0) ? 0 : extra;
    }

    return blockDup;
}

// (rank, domain, value) triple used when ranking processors. The two-argument
// constructor leaves dom at -1; the default constructor zeroes everything.
class procInfo
{
    public:
        procInfo() : rank(0), dom(0), val(0.0f) {}
        procInfo(int r, float v) : rank(r), dom(-1), val(v) {}
        procInfo(int r, int d, float v) : rank(r), dom(d), val(v) {}

        int rank;
        int dom;
        float val;

        // Ascending by val.
        static bool cmp(procInfo x, procInfo y) {return y.val > x.val;}
        // Descending by val.
        static bool rcmp(procInfo x, procInfo y) {return x.val > y.val;}
};

// Prints a procInfo as "(rank dom val)".
inline ostream&
operator<<(ostream &out, const procInfo &p)
{
    out << "(" << p.rank << " " << p.dom << " " << p.val << ")";
    return out;
}

//DRP
// ****************************************************************************
// Method:  avtChowderICAlgorithm::DoRankCentricBalancing2
//
// Purpose: Cost-based rebalancing. Runs at most 10 rounds; in each round:
//            1. collect "busy" (rank, block) pairs: blocks whose advection
//               cost exceeds busyThresh on ranks whose total cost exceeds
//               busyThresh;
//            2. collect "lazy" ranks: total cost below the smallest busy
//               block cost, or, if there are none and more than two busy
//               entries exist, below the average busy block cost;
//            3. pair the i-th cheapest lazy rank with the i-th most expensive
//               busy block and call AssignBlock(lazyRank, busyBlock);
//            4. rebuild the rank cost table.
//          Stops early when either list is empty.
//
// Notes:   busyThresh is 0.75 * LOAD_TIME. An earlier assignment of
//          1.5 * LOAD_TIME was overwritten before use and has been removed.
//          NOTE(review): the original comment said the threshold must be
//          greater than the load time, which 0.75 * LOAD_TIME is not --
//          confirm which value is intended.
//
// ****************************************************************************

void
avtChowderICAlgorithm::DoRankCentricBalancing2()
{
    vector<rankInfo2> rinfo;

    BuildRankInfo2(rinfo);
    if (printStuff) rankInfo2::printIt(rinfo);

    //lazyThresh: Not sure here.... maybe some fraction of least busy?
    float busyThresh = 0.75 * LOAD_TIME;
    float lazyThresh = 0.0;

    if (printRank0Stuff) cout<<"BEGIN REBALANCE....."<<endl;
    for (int cnt = 0; cnt < 10; cnt++)
    {
        upBlk1SW.start();
        vector<procInfo> busy, lazy;
        //Find a busy block that is N x cost of IO.
        for (int p = 0; p < nProcs; p++)
        {
            if (rinfo[p].t_cost > busyThresh)
            {
                const int nBlocks = (int)rinfo[p].blocks.size();
                for (int d = 0; d < nBlocks; d++)
                    if (rinfo[p].blocks[d].it_cost > busyThresh)
                    {
                        //if (rank == 0) cout<<" ++BUSY "<<p<<" i: "<<d<<" d: "<<rinfo[p].blocks[d].dom<<endl;
                        busy.push_back(procInfo(p, rinfo[p].blocks[d].dom, rinfo[p].blocks[d].it_cost));
                    }
            }
        }
        upBlk1SW.stop();
        if (busy.empty())
            break;

        upBlk2SW.start();
        // Most expensive block first; the last entry is the cheapest busy one.
        sort(busy.begin(), busy.end(), procInfo::rcmp);
        if (printRank0Stuff) cout<<"BUSY= "<<busy<<endl;
        lazyThresh = busy[busy.size()-1].val;
        for (int p = 0; p < nProcs; p++)
            if (rinfo[p].t_cost < lazyThresh)
                lazy.push_back(procInfo(p, rinfo[p].t_cost));
        if (printRank0Stuff) cout<<"** LT0= "<<lazyThresh<<endl;
        upBlk2SW.stop();

        //Didn't find anyone.... loosen the threshold...
        if (lazy.empty())
        {
            upBlk3SW.start();
            int nb = busy.size();
            if (nb > 2)
            {
                //Set the cutoff to be avg busy val.
                float sum = 0.0;
                for (int i = 0; i < nb; i++)
                    sum += busy[i].val;
                lazyThresh = sum / (float)nb;

                if (printRank0Stuff) cout<<"** LT1= "<<lazyThresh<<endl;
                for (int p = 0; p < nProcs; p++)
                    if (rinfo[p].t_cost < lazyThresh)
                        lazy.push_back(procInfo(p, rinfo[p].t_cost));
            }
            upBlk3SW.stop();
        }

        if (lazy.empty())
            break;

        // Cheapest rank first, so it is paired with the most expensive block.
        sort(lazy.begin(), lazy.end(), procInfo::cmp);

        if (printRank0Stuff) cout<<"LT= "<<lazyThresh<<" BUSY: "<<busy<<" LAZY: "<<lazy<<endl;
        int n = min(busy.size(), lazy.size());
        for (int i = 0; i < n; i++)
        {
            AssignBlock(lazy[i].rank, busy[i].dom);
            if(printRank0Stuff)cout<<"   "<<lazy[i].rank<<" loads "<<busy[i].dom<<endl;
        }

        BuildRankInfo2(rinfo);
        if (printStuff) rankInfo2::printIt(rinfo);
    }


    if (printRank0Stuff)
    {
        cout<<"ALL DONE WITH REBALNCE: "<<endl;
        BuildRankInfo2(rinfo);
        if (printStuff) rankInfo2::printIt(rinfo);
    }
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::DoRankCentricBalancing
//
// Purpose: Share-based rebalancing. Runs at most 6 rounds; in each round the
//          ranks selected by rankInfo::getThresh as busy and as lazy
//          (both against the even share 1/nProcs) are paired up, and each
//          lazy rank is assigned one randomly chosen domain of its busy
//          partner. Stops early when either list is empty.
//
// Notes:   A busy rank with an empty domain list is skipped; previously this
//          led to randomIndex(0), i.e. a modulo by zero.
//          The unused domInfo table is no longer built.
//
// ****************************************************************************

void
avtChowderICAlgorithm::DoRankCentricBalancing()
{
    vector<rankInfo> rinfo;

    float perfectBal = 1.0/(float)nProcs;
    float busyThresh = perfectBal * 1.0;
    float lazyThresh = perfectBal * 1.0;
    BuildRankInfo(rinfo);

    //This is a rank centric balancing.
    int cnt = 0;
    while (true)
    {
        vector<int> busy, lazy;
        if (printStuff) rankInfo::printIt(rinfo);
        busy = rankInfo::getThresh(rinfo, busyThresh, true, true);
        lazy = rankInfo::getThresh(rinfo, lazyThresh, false, true);
        if (printRank0Stuff) cout<<"busy: "<<busy<<" lazy: "<<lazy<<" ("<<busyThresh<<" "<<lazyThresh<<")"<<endl;

        if (lazy.empty() || busy.empty())
            break;

        int n = min(lazy.size(), busy.size());
        for (int i = 0; i < n; i++)
        {
            //Pick a random domain. Probably want to do a PDF based selection.
            //need to determine which block is causing the busy, and how to best assign.
            //NOTE: Ditto on the send. When sending seed, need to send according to a PDF so that
            //things are evenly distributed.
            //Also, don't assign blocks if you'll blow the cache.
            const int nDoms = (int)rinfo[busy[i]].doms.size();
            if (nDoms == 0)
                continue;   // nothing to hand over from this rank
            int di = randomIndex(nDoms);
            AssignBlock(lazy[i], rinfo[busy[i]].doms[di]);
        }
        BuildRankInfo(rinfo);
        if (printStuff) rankInfo::printIt(rinfo);

        cnt++;
        if (cnt > 5)
            break;
    }
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::DoBlockCentricBalancing
//
// Purpose: Block-driven rebalancing. Runs at most 101 rounds; in each round
//          the blocks that computeDomainMcNeedy reports as needing extra
//          copies are sorted by how many copies they need (most first) and
//          handed, one each, to the ranks rankInfo::getThresh selects as
//          lazy against the even share 1/nProcs. Stops early when there
//          are no candidate blocks or no lazy ranks.
//
// ****************************************************************************

void
avtChowderICAlgorithm::DoBlockCentricBalancing()
{
    vector<domInfo> dinfo;
    vector<rankInfo> rinfo;
    BuildRankInfo(rinfo);
    BuildDomainInfo(dinfo);

    if (printStuff) domInfo::printIt(dinfo);
    const float perfectBal = 1.0/(float)nProcs;
    float lazyThresh = perfectBal * 1.0;

    for (int pass = 0; pass <= 100; pass++)
    {
        BuildRankInfo(rinfo);
        BuildDomainInfo(dinfo);

        // Collect (block, copies needed) for every block needing duplication.
        vector<int> blockDup = computeDomainMcNeedy(dinfo, blockAssignments);
        vector<pair<int, int> > candidateBlocks;
        for (int b = 0; b < blockDup.size(); b++)
        {
            if (blockDup[b] <= 0)
                continue;
            candidateBlocks.push_back(pair<int,int>(b, blockDup[b]));
        }

        //sort them so that blocks needing most duplication will be first.
        sort(candidateBlocks.begin(), candidateBlocks.end(), biggestDup);

        if(printRank0Stuff) cout<<"blocks to dup= "<<candidateBlocks<<endl;

        vector<int> lazy = rankInfo::getThresh(rinfo, lazyThresh, false, true);
        if (candidateBlocks.empty() || lazy.empty())
            break;

        const int nPairs = min(lazy.size(), candidateBlocks.size());
        for (int k = 0; k < nPairs; k++)
            AssignBlock(lazy[k], candidateBlocks[k].first);

        BuildRankInfo(rinfo);
        BuildDomainInfo(dinfo);
    }

    if (printStuff) domInfo::printIt(dinfo);
    if (printStuff) rankInfo::printIt(rinfo);
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::UpdateBalanceWorkload
//
// Purpose: Dispatch to the balancing scheme selected by workBalMethod:
//          REGULAR -> UpdateBlockAssignments(), PYTHON -> PythonBalance().
//          Any other value does nothing.
//
// ****************************************************************************

void
avtChowderICAlgorithm::UpdateBalanceWorkload()
{
    if (workBalMethod == REGULAR)
    {
        UpdateBlockAssignments();
        return;
    }

    if (workBalMethod == PYTHON)
        PythonBalance();
}

void
avtChowderICAlgorithm::PythonBalance()
{
    int arrSz = 0;
    int *balArray = NULL;
    
    if (rank == 0)
    {
        //Write the block popularity table.
        ofstream fp("./blockPop.dat", ios::out);
        fp<<LOAD_TIME<<" "<<ADVECT_TIME<<endl;
        for (int i = 0; i < numDomains; i++)
            fp<<i<<" "<<i<<" "<<blockPopularity[i]<<endl;
        fp.close();
        
        system("python ./pythonCode/balance.py ./blockPop.dat ./blockBal.dat");
        
        vector<vector<int> > balData(numDomains);
        ifstream balF("./blockBal.dat", ios::in);
        int r, numD, d;
        arrSz = 0;
        for (int i = 0; i < nProcs; i++)
        {
            balF >> r;
            balF >> numD;
            balData[r].resize(numD);
            for (int i = 0; i < numD; i++)
            {
                balF >> d;
                balData[r][i] = d;
                arrSz++;
            }
        }
        balF.close();

        cout<<"BALANCE: "<<endl;
        for (int i = 0; i < nProcs; i++)
            cout<<i<<" "<<balData[i]<<endl;

        //Fill out the array.
        arrSz += nProcs;
        balArray = new int[arrSz];
        
        int cnt = 0;
        for (int i = 0; i < nProcs; i++)
        {
            int n = balData[i].size();
            balArray[cnt++] = n;
            for (int j = 0; j < n; j++)
                balArray[cnt++] = balData[i][j];
        }
    }
    
    SumIntAcrossAllProcessors(arrSz);
    if (rank != 0)
    {
        balArray = new int[arrSz];
        for (int i = 0; i < arrSz; i++)
            balArray[i] = 0;
    }
    MPI_Allreduce(MPI_IN_PLACE, balArray, arrSz, MPI_INT, MPI_SUM, VISIT_MPI_COMM);

    int cnt = 0;
    for (int i = 0; i < nProcs; i++)
    {
        int n = balArray[cnt++];
        for (int j = 0; j < n; j++)
        {
            int d = balArray[cnt++];
            //if (rank==0) cout<<"AssignBlock("<<i<<" "<<d<<")"<<endl;
            AssignBlock(i, d);
        }
    }
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::UpdateBlockAssignments
//
// Purpose: Run the rank-centric cost balancer (DoRankCentricBalancing2).
//          When printRank0Stuff is set, also print the rank table before
//          and after, the resulting block-to-rank assignments, and each
//          rank's share of the work together with its ratio to the even
//          share 1/nProcs.
//
// ****************************************************************************

void
avtChowderICAlgorithm::UpdateBlockAssignments()
{
    vector<domInfo> dinfo;
    vector<rankInfo> rinfo;

    if (printRank0Stuff)
    {
        BuildDomainInfo(dinfo);
        BuildRankInfo(rinfo);
        cout<<"BEGIN"<<endl;
        if (printStuff) rankInfo::printIt(rinfo);
    }

    DoRankCentricBalancing2();
    //DoRankCentricBalancing();
    //DoBlockCentricBalancing();

    // Everything below is diagnostic output only.
    if (!printRank0Stuff)
        return;

    cout<<"REBALANCE"<<endl;
    BuildRankInfo(rinfo);
    rankInfo::printIt(rinfo);

    vector<float> balance;
    ComputeBalance(balance);

    cout<<"Block to Rank Assignments: "<<endl;
    for (int blk = 0; blk < numDomains; blk++)
        cout<<"block_"<<blk<<" :ranks: "<<blockAssignments[blk]<<endl;

    cout<<"Rank Balance: "<<endl;
    const float perfectBal = 1.0f/(float)nProcs;
    for (int p = 0; p < nProcs; p++)
    {
        const float ratio = balance[p]/perfectBal;
        cout<<p<<" "<<balance[p]<<" <"<<ratio<<">"<<endl;
    }
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::ComputeBalance
//
// Purpose: Compute each rank's share of the total work. A rank's raw load is
//          the sum of blockPopularity[d] over every domain d assigned to it
//          in blockAssignments; the shares are the raw loads divided by
//          their sum.
//
// Arguments:
//   balance   Output. Reset to nProcs entries. When the total load is not
//             positive the raw (unnormalized) loads are left in place.
//
// Notes:   - The output is reset with assign(). resize(nProcs, 0.0f) leaves
//            the contents of an already-sized vector untouched, so a reused
//            vector would have accumulated on top of stale values.
//          - Loads are gathered in one pass over blockAssignments rather than
//            one full scan per rank. Per-rank accumulation order (ascending
//            domain id) is unchanged. Rank ids outside [0, nProcs) are
//            ignored, as before.
//
// ****************************************************************************

void
avtChowderICAlgorithm::ComputeBalance(vector<float> &balance)
{
    balance.assign(nProcs, 0.0f);

    for (int j = 0; j < numDomains; j++)
    {
        const int nOwners = (int)blockAssignments[j].size();
        for (int k = 0; k < nOwners; k++)
        {
            const int r = blockAssignments[j][k];
            if (r >= 0 && r < nProcs)
                balance[r] += blockPopularity[j];
        }
    }

    float bt = 0.0;
    for (int i = 0; i < nProcs; i++)
        bt += balance[i];

    if (bt > 0.0)
    {
        for (int i = 0; i < nProcs; i++)
            balance[i] /= bt;
    }
}

void
avtChowderICAlgorithm::AssignBlock(int r, int b)
{
    //ldBlkSW.start();
    vector<int>::iterator it;
    bool loadIt = false;
    it = find(blockAssignments[b].begin(), blockAssignments[b].end(), r);
    if (it == blockAssignments[b].end())
    {
        blockAssignments[b].push_back(r);
        loadIt = true;

	map<int, set<int> >::iterator mit = rankInfo2Helper.find(r);
	if (mit == rankInfo2Helper.end())
	{
	    set<int> s;
	    s.insert(b);
	    rankInfo2Helper[r] = s;
	}
	else
	    rankInfo2Helper[r].insert(b);
    }

    //if (rank==0) cout<<"RANK "<<r<<" loads "<<b<<" ***************************"<<endl;

    //If it's me, then mark the block for lazy-loading.
    if (r == rank && loadIt)
    {
        lazyLoadBlocks.insert(b);
    }

    //ldBlkSW.stop();
}

// Returns the next rand() value scaled into [0, 1].
static float random_1()
{
    const float draw = static_cast<float>(rand());
    return draw / static_cast<float>(RAND_MAX);
}

// Returns a pseudo-random index in [0, sz) drawn from rand().
//
// For sz <= 0 there is no valid index; 0 is returned without calling rand(),
// instead of evaluating rand() % 0 (undefined behavior, typically a crash).
// Callers must still make sure the container they index is not empty.
static int randomIndex(int sz)
{
    if (sz <= 0)
        return 0;

    return rand() % sz;
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::DomainToRank2
//
// Purpose: Pick, uniformly at random (via randomIndex), one of the ranks
//          listed in blockAssignments for the block's domain.
//
// Arguments:
//   blk    Block whose domain id selects the assignment list.
//
// Returns: The chosen rank.
//
// ****************************************************************************

int
avtChowderICAlgorithm::DomainToRank2(BlockIDType &blk)
{
    const vector<int> &owners = blockAssignments[blk.domain];

    //Pick a random recipient.
    //if (rank == 0) cout<<"Sending "<<blk<<" to "<<r<<" of "<<sz<<endl;
    return owners[randomIndex(owners.size())];
}

// sort() predicate over (rank, load) pairs: ascending by load (second member).
static bool
candidateCmp(const pair<int,int> &a, const pair<int,int> &b)
{
    return b.second > a.second;
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::DomainToRank3
//
// Purpose: Among the ranks listed in blockAssignments for the block's
//          domain, return the one with the smallest entry in icLoad.
//
// Arguments:
//   blk     Block whose domain id selects the assignment list.
//   icLoad  Load value per rank, indexed by rank id.
//
// Returns: The selected rank. With a single listed rank that rank is
//          returned without consulting icLoad.
//
// ****************************************************************************

int
avtChowderICAlgorithm::DomainToRank3(BlockIDType &blk, const vector<int> &icLoad)
{
    const vector<int> &owners = blockAssignments[blk.domain];
    const int nOwners = owners.size();
    if (nOwners == 1)
        return owners[0];

    // Pair every candidate rank with its load.
    vector<pair<int,int> > candidateRanks;
    candidateRanks.reserve(nOwners);
    for (int i = 0; i < nOwners; i++)
    {
        const int r = owners[i];
        candidateRanks.push_back(make_pair(r, icLoad[r]));
    }

    //Sort them so the least amount of work is first.
    sort(candidateRanks.begin(), candidateRanks.end(), candidateCmp);
    //cout<<"Dom2R: "<<candidateRanks<<" sending to "<<candidateRanks[0].first<<endl;

    return candidateRanks.front().first;
}

// ****************************************************************************
// Method:  avtChowderICAlgorithm::DomainToRank4
//
// Purpose: Among the ranks listed in blockAssignments[theDom], return the
//          one with the smallest entry in icLoad, after subtracting
//          ownerOffset from the load of theRank (which biases the choice
//          towards that rank).
//
// Arguments:
//   theDom   Domain id selecting the assignment list.
//   theRank  Rank whose load is reduced by ownerOffset before comparing.
//   icLoad   Load value per rank, indexed by rank id.
//
// Returns: The selected rank. With a single listed rank that rank is
//          returned without consulting icLoad.
//
// ****************************************************************************

int
avtChowderICAlgorithm::DomainToRank4(int theDom, int theRank, const vector<int> &icLoad)
{
    const vector<int> &owners = blockAssignments[theDom];
    const int nOwners = owners.size();
    if (nOwners == 1)
        return owners[0];

    // Pair every candidate rank with its (possibly discounted) load.
    vector<pair<int,int> > candidateRanks;
    candidateRanks.reserve(nOwners);
    for (int i = 0; i < nOwners; i++)
    {
        const int r = owners[i];
        int load = icLoad[r];
        if (r == theRank)
            load = icLoad[r]-ownerOffset;
        candidateRanks.push_back(make_pair(r, load));
    }

    //Sort them so the least amount of work is first.
    sort(candidateRanks.begin(), candidateRanks.end(), candidateCmp);
    //cout<<"Dom2R: "<<candidateRanks<<" sending to "<<candidateRanks[0].first<<endl;

    return candidateRanks.front().first;
}


// Visits every element of an nx*ny*nz float array (k varies fastest) and
// assigns it to itself, so the array contents are left unchanged.
// NOTE(review): the self-assignment looks like a placeholder; the intended
// value is not visible here.
static void
setArray(float *ptr, int nx, int ny, int nz)
{
    for (int i = 0; i < nx; i++)
    {
        for (int j = 0; j < ny; j++)
        {
            // Offset of element (i, j, 0).
            const int rowBase = i*ny*nz + j*nz;
            for (int k = 0; k < nz; k++)
                ptr[rowBase + k] = ptr[rowBase + k];
        }
    }
}

// Creates a float array named nm with n tuples and adds it to the cell data
// of rg.
//
// The local reference is dropped with Delete() before returning, so the
// returned pointer is non-owning.
// NOTE(review): this relies on AddArray() keeping its own reference, so the
// pointer stays valid only while rg's cell data holds the array -- confirm.
static vtkFloatArray *
mkArr(const char *nm, int n, vtkRectilinearGrid *rg)
{
    vtkFloatArray *cellArr = vtkFloatArray::New();
    cellArr->SetName(nm);
    cellArr->SetNumberOfTuples(n);

    rg->GetCellData()->AddArray(cellArr);
    cellArr->Delete();

    return cellArr;
}

void
avtChowderICAlgorithm::DumpBlockStatsData(const vector<int> &actualIterations)
{
    if (rank != 0)
        return;

    int nx, ny, nz;

    //assume regular....
    if (1)
    {
        float v = pow((float)numDomains, 0.333333);
        nx = (int)v + 1;
        ny = (int)v + 1;
        nz = (int)v + 1;
    }
    else
    {
        nx = 4;
        ny = 2;
        nz = 4;
    }

    int res[3] = {nx+1, ny+1, nz+1};
    vtkRectilinearGrid *rg = vtkRectilinearGrid::New();
    rg->SetDimensions(res);

    vtkFloatArray *x = vtkFloatArray::New();
    vtkFloatArray *y = vtkFloatArray::New();
    vtkFloatArray *z = vtkFloatArray::New();

    x->SetNumberOfTuples(res[0]);
    y->SetNumberOfTuples(res[1]);
    z->SetNumberOfTuples(res[2]);

    float x0 = 0.0, x1 = 1.0;
    float y0 = 0.0, y1 = 1.0;
    float z0 = 0.0, z1 = 1.0;
    float dx = (x1-x0) / (float)(res[0]-1);
    float dy = (y1-y0) / (float)(res[1]-1);
    float dz = (z1-z0) / (float)(res[2]-1);

    for (int i = 0; i < res[0]; i++)
        x->SetTuple1(i, x0+i*dx);
    x->SetTuple1(res[0]-1, x1);
    for (int i = 0; i < res[1]; i++)
        y->SetTuple1(i, y0+i*dy);
    y->SetTuple1(res[1]-1, y1);
    for (int i = 0; i < res[2]; i++)
        z->SetTuple1(i, z0+i*dz);
    z->SetTuple1(res[2]-1, z1);

    rg->SetXCoordinates(x);
    rg->SetYCoordinates(y);
    rg->SetZCoordinates(z);

    vtkFloatArray *doms = mkArr("doms", numDomains, rg);
    vtkFloatArray *predicted = mkArr("predicted", numDomains, rg);
    //vtkFloatArray *predicted2 = mkArr("predicted2", numDomains, rg);
    //vtkFloatArray *diff_predicted = mkArr("diff_predicted", numDomains, rg);
    vtkFloatArray *actual = mkArr("actual", numDomains, rg);
    vtkFloatArray *diff = mkArr("diff", numDomains, rg);
    //vtkFloatArray *diff2 = mkArr("diff2", numDomains, rg);
    //vtkFloatArray *diffN = mkArr("diffN", numDomains, rg);
    vtkFloatArray *abs_diff = mkArr("error", numDomains, rg);
    //vtkFloatArray *abs_diff = mkArr("abs_diff", numDomains, rg);
    //vtkFloatArray *abs_diff2 = mkArr("abs_diff2", numDomains, rg);
    vtkFloatArray *dom_dup = mkArr("dom_duplication", numDomains, rg);

    float sumAct = 0;
    float sumPop = 0;
    
    sumErr = 0;
    maxErr = -1.0f;
    for(int i = 0; i < numDomains; i++)
    {
        sumAct += actualIterations[i];
        sumPop += blockPopularity[i];
    }
    for(int i = 0; i < numDomains; i++)
    {
        float act = actualIterations[i]/sumAct*100;
        float pop = blockPopularity[i]/sumPop*100;
        float e1 = fabs(act-pop);
        sumErr += e1;
        if (e1 > maxErr) maxErr = e1;
    }
    sumErr /= 2.0;
    maxErr /= 2.0;
    cout<<"Prediction error: "<<sumErr<<" MAX Prediction error: "<<maxErr<<endl;
    predErr = sumErr;

    int d = 0;
    for (int i = 0; i < nx; i++)
        for (int j = 0; j < ny; j++)
            for (int k = 0; k < nz; k++)
            {
                int idx = k*nx*ny + j*nz + i;
                doms->SetTuple1(idx, d);
                float pred = blockPopularity[d]/sumPop*100;
                predicted->SetTuple1(idx, pred);
                //predicted2->SetTuple1(idx, blockPopularity2[d]);
                //diff_predicted->SetTuple1(idx, blockPopularity2[d]-blockPopularity[d]);
                float actl = actualIterations[d]/sumAct*100;
                actual->SetTuple1(idx, actl);
                float dff = abs(pred-actl);
                diff->SetTuple1(idx, pred-actl);
                //diff2->SetTuple1(idx, actualIterations[d]-blockPopularity2[d]);
                //diffN->SetTuple1(idx, (actualIterations[d]-blockPopularity[d])/blockPopularity[d]);
                abs_diff->SetTuple1(idx, dff);
                //abs_diff->SetTuple1(idx, fabs(actualIterations[d]-blockPopularity[d]));
                //abs_diff2->SetTuple1(idx, fabs(actualIterations[d]-blockPopularity2[d]));
                dom_dup->SetTuple1(idx, blockAssignments[d].size());

                d++;
            }

    vtkDataSetWriter *writer = vtkDataSetWriter::New();
    writer->SetFileName("sl_dump.vtk");
    writer->SetInputData(rg);
    writer->Update();
    writer->Write();


    x->Delete();
    y->Delete();
    z->Delete();

    rg->Delete();
    writer->Delete();
}

// Key type pairing a domain number (dom) with a secondary index (sub).
// Instances order lexicographically on (dom, sub), so the type can be used
// as a key in ordered containers. The nm member takes no part in ordering.
class domainID
{
    public:
        // Domain only: the secondary index is set to -1.
        domainID(int d) : dom(d), sub(-1) {}

        // Domain plus an explicit secondary index.
        domainID(int d, int s) : dom(d), sub(s) {}

        int dom, sub;
        string nm;    // Left empty by both constructors.

        // Strict ordering: compare dom first, break ties with sub.
        bool operator< (const domainID &x) const
        {
            if (dom != x.dom)
                return dom < x.dom;
            return sub < x.sub;
        }
};

//    bool operator() (const domainID &x, const domainID &y) {return false;}
//    bool operator== ( const domainID &y) {return false;}
//    bool operator< (domainID &y) {return false;}    

/*
   int test()
   {
//typedef std::pair<int,int> domainID;
std::map<std::pair<domainID,domainID>, int> mappy1;

domainID src(3), dst(4);

std::pair<domainID, domainID> p(src, dst);
mappy1[p] = 3324;

std::map<std::pair<int,int>, int> mappy2;

mappy2[make_pair(3,4)] = 93;
}
*/

// Allocates a forward-direction streamline seeded at p, limited to
// maxTestSteps, that records sample positions only.
//
//   p   seed location
//   id  identifier handed to the new curve
//
// Returns a heap-allocated curve; the caller owns it and must delete it.
avtStreamlineIC *
avtChowderICAlgorithm::makeIC(const avtVector &p, int id)
{
    unsigned char attr = avtStateRecorderIntegralCurve::SAMPLE_POSITION;
    avtVector dir(0,0,0);
    // The start time used to be passed to the constructor uninitialized
    // (an indeterminate read). Test particles start at time zero.
    double t = 0.0;

    avtStreamlineIC *s = new avtStreamlineIC(maxTestSteps, false, 0.0, false, 0.0, attr, 
            picsFilter->solver,
            avtIntegralCurve::DIRECTION_FORWARD,
            t, p, dir, id);

#ifdef USE_IC_STATE_TRACKING
    s->InitTrk();
#endif
    return s;
}

// Tally for one destination: how many times it was reached (cnt) and the
// total number of iterations accumulated over those visits (numIters).
class nextBlock
{
    public:
        nextBlock() : cnt(0), numIters(0) {}

        // Record one more arrival that took i iterations.
        void visit(int i)
        {
            numIters += i;
            ++cnt;
        }

        int cnt, numIters;
};

// Fills pts with nPts random locations inside the bounding box of leaf s
// of domain d.
//
//   d, s  domain index and leaf index within that domain
//   nPts  number of points to generate
//   pts   output; resized to nPts and overwritten
void
avtChowderICAlgorithm::GenerateTestPts(int d, int s, int nPts, vector<avtVector> &pts)
{
    // bb is used as {xmin, xmax, ymin, ymax, zmin, zmax}.
    float bb[6];
    blockInfo[d]->GetLeafFromIndex(s)->GetBBox(bb);

    float dx=bb[1]-bb[0], dy=bb[3]-bb[2], dz=bb[5]-bb[4];
    
    // Axis with the strictly smallest extent; stays -1 when there is a tie
    // (for example a cubic block).
    int slab = -1;
    if (dx < dy && dx < dz) slab = 0;
    else if (dy < dx && dy < dz) slab = 1;
    else if (dz < dx && dz < dy) slab = 2;

    pts.resize(nPts);
    float t[3];
    for (int i = 0; i < nPts; i++)
    {
        // NOTE: t does not feed into the generated positions below. The
        // three draws are kept so the random_1() sequence, and therefore
        // the generated points, are unchanged.
        t[0] = random_1();
        t[1] = random_1();
        t[2] = random_1();
        // Guarded: with slab == -1 this used to write to t[-1].
        if (slab >= 0)
            t[slab] = 0.0f;
        
        pts[i][0] = bb[0] + random_1()*dx;
        pts[i][1] = bb[2] + random_1()*dy;
        pts[i][2] = bb[4] + random_1()*dz;
    }
}

// Collects up to maxPts (20) current locations of curves in ics that belong
// to domain d and fall inside the bounding box of leaf s. Points are
// appended to pts; pts is not cleared first.
//
// A binary search on the domain of each curve's first block locates one
// curve in domain d, then neighbours are scanned outwards from it. This
// assumes ics is ordered by that domain -- TODO confirm with callers.
void
avtChowderICAlgorithm::GenerateTestPts(int d, int s, vector<avtIntegralCurve *> &ics, vector<avtVector> &pts)
{
    DomainBlock *leaf = blockInfo[d]->GetLeafFromIndex(s);

    const int maxPts = 20;
    int lo = 0, hi = (int)ics.size()-1;

    while (lo <= hi)
    {
        int mid = lo + (hi-lo)/2;
        int dom = ics[mid]->blockList.front().domain;
        if (dom < d)
            lo = mid+1;
        else if (dom > d)
            hi = mid-1;
        else
        {
            int cnt = 0;

            // Scan forward from mid. The bound is inclusive: ics[hi] has
            // not been ruled out by the search and may belong to d. (The
            // exclusive bound used before skipped it, so a single curve
            // never produced a point.)
            for (int i = mid; i <= hi && cnt < maxPts; i++)
            {
                if (ics[i]->blockList.front().domain != d)
                    break;
                avtVector p = ics[i]->CurrentLocation();
                if (leaf->InBBox(p))
                {
                    pts.push_back(p);
                    cnt++;
                }
            }

            // Scan backward from mid-1, again with an inclusive bound.
            for (int i = mid-1; i >= lo && cnt < maxPts; i--)
            {
                if (ics[i]->blockList.front().domain != d)
                    break;
                avtVector p = ics[i]->CurrentLocation();
                if (leaf->InBBox(p))
                {
                    pts.push_back(p);
                    cnt++;
                }
            }
            break;
        }
    }
}


// Advects one test particle from every point in pts and summarises where
// the particles ended up.
//
//   d, li      domain index and leaf index of the block the points belong to
//   pts        seed locations; nothing happens when empty
//   blockData  output, indexed from 0. One 4-int record is written per
//              distinct destination block, in increasing gid order:
//                {destination gid, particles that reached it,
//                 total steps of those particles, particles counted overall}
//              Entries past the last record are left untouched.
//
// Particles that take no steps are ignored. Also adds the elapsed advection
// time to ADVECT_TIME and the step total to numTestParticlesSteps.
//
// Throws VisItException when the records would reach or exceed NVALS ints.
void
avtChowderICAlgorithm::RunTestPts(int d, int li, vector<avtVector> &pts, vector<int> &blockData)
{
    if (pts.empty())
        return;
    
    DomainBlock *blk = blockInfo[d]->GetLeafFromIndex(li);
    int start = blk->gid;

    StopWatch advT;
    // Tallies are held by value so nothing has to be freed by hand, even if
    // the exception below is thrown.
    map<int, nextBlock> destinations;
    int num = 0, sz = pts.size();
    for (int i = 0; i < sz; i++)
    {
        avtStreamlineIC *s = makeIC(pts[i]);

        //See where they go...
        advT.start();
        int iters = s->numSteps;
        AdvectParticle(s);
        iters = s->numSteps - iters;
        advT.stop();

        // A particle that terminated or left the mesh has an empty block
        // list and is attributed to the block it started in.
        int end = start;
        if (iters > 0 && !s->blockList.empty())
        {
            DomainBlock *dst = blockInfo[s->blockList.front().domain]->GetLeaf(s->CurrentLocation());
            // GetLeaf results are NULL-checked elsewhere in this file, so do
            // not dereference blindly; fall back to the starting block.
            if (dst != NULL)
                end = dst->gid;
        }
        delete s;
        if (iters == 0)
            continue;

        destinations[end].visit(iters);

        numTestParticlesSteps += iters;
        num++;
    }

    ADVECT_TIME += advT.t;

    // Validate the record count BEFORE writing. The check used to run after
    // the copy loop, by which time blockData had already been overrun. The
    // condition itself (total >= NVALS) is unchanged.
    int needed = 4*(int)destinations.size();
    if (needed >= NVALS)
    {
        char msg[512];
        sprintf(msg, "MEMORY overflow in blockData. Increase size of NVALS. index=%d NVALS=%d\n", needed, NVALS);
        EXCEPTION1(VisItException, msg);
    }

    int index = 0;
    map<int, nextBlock>::iterator it;
    for (it = destinations.begin(); it != destinations.end(); it++)
    {
        // dst, numICsToDst, totalSteps, totalNumICs
        blockData[index++] = it->first;
        blockData[index++] = it->second.cnt;
        blockData[index++] = it->second.numIters;
        blockData[index++] = num;
    }
}

#if 0
void
avtChowderICAlgorithm::RunTestPts(int d, int li, vector<avtVector> &pts, int **blockData)
{
    if (pts.empty())
        return;
    
    DomainBlock *blk = blockInfo[d]->GetLeafFromIndex(li);
    int start = blk->gid;

    StopWatch advT;
    map<int, nextBlock*> destinations;
    int num = 0, sz = pts.size();
    for (int i = 0; i < sz; i++)
    {
        avtStreamlineIC *s = makeIC(pts[i]);

        //See where they go...
        advT.start();
        int iters = ((avtStreamlineIC *)s)->numSteps;
        AdvectParticle(s);
        iters = ((avtStreamlineIC *)s)->numSteps - iters;
        advT.stop();
        int end = start;

        if (iters > 0)
        {
            if (!s->blockList.empty())
            {
                DomainBlock *dst = blockInfo[s->blockList.front().domain]->GetLeaf(s->CurrentLocation());
                end = dst->gid;
            }
            //Terminated/left mesh.
            //leave end the same, max out iters.
            else
            {
                //iters = maxTestSteps;
            }
        }
        delete s;
        if (iters == 0)
            continue;

        nextBlock *n = NULL;
        map<int, nextBlock*>::iterator it = destinations.find(end);
        if (it == destinations.end())
        {
            n = new nextBlock;
            destinations[end] = n;
        }
        else
            n = it->second;

        n->visit(iters);

        numTestParticlesSteps += iters;
        num++;
    }

    ADVECT_TIME += advT.t;

    map<int, nextBlock*>::iterator it;
    int index = 0;
    for (it = destinations.begin(); it != destinations.end(); it++)
    {
        nextBlock *n = it->second;

        // dst, totalNumICs, numICsToDst, totalNumICs, totalSteps,
        blockData[start][index++] = it->first;
        blockData[start][index++] = n->cnt;
        blockData[start][index++] = n->numIters;
        blockData[start][index++] = num;
        //if (rank == nProcs/2)cout<<start<<": --> "<<it->first<<" "<<n->cnt<<" "<<n->numIters<<" "<<num<<" index= "<<index<<endl;
        delete n;
    }
    if (index >= NVALS)
    {
        char msg[512];
        sprintf(msg, "MEMORY overflow in blockData. Increase size of NVALS. index=%d NVALS=%d\n", index, NVALS);
        EXCEPTION1(VisItException, msg);
    }
    destinations.clear();
}
#endif

// Sort predicate: orders (first, second) pairs by descending `second`.
// The `first` member is not consulted.
static bool
binCmp(const pair<int,int> &lhs, const pair<int,int> &rhs)
{
    return rhs.second < lhs.second;
}

// Builds the test points for leaf s of domain d from the current curve
// locations, reducing them when more than maxNum are found.
//
//   d, s    domain index and leaf index
//   ics     curves whose current locations are candidate points
//   maxNum  threshold above which the point set is reduced
//   pts     output; candidate points are appended, then reduced in place
//
// The reduction strategy is selected by the local switch `which`, which is
// hard-wired to 0:
//   0  shuffle and keep maxNum points
//   1  average consecutive groups down to maxNum points
//   2  bin on a 10x10x10 grid, keep one point per non-empty bin,
//      most populated bins first
//   3  discard and generate 10*maxNum random points in the leaf
//   4  keep every point
//   5  use no interior points (handled before any are gathered)
void
avtChowderICAlgorithm::HandleInteriorTestPoints(int d, int s, vector<avtIntegralCurve *> &ics,
                                                int maxNum, vector<avtVector> &pts)
{
    int which = 0;

    if (which == 5)
    {
        pts.resize(0);
        cout<<rank<<": Interior PTS: "<<pts.size()<<endl;
        return;
    }

    GenerateTestPts(d, s, ics, pts);
    if (pts.size() <= (size_t)maxNum)
        return;

    if (which == 0) //Randomize.
    {
        random_shuffle(pts.begin(), pts.end());
        pts.resize(maxNum);
    }
    else if (which == 1) //Average points.
    {
        /* RRS -- Average points instead of picking at random */
        // The first `stragglers` groups take one extra point so every
        // input point lands in exactly one group.
        vector<avtVector> avgPts(maxNum);
        int numPerAvg = pts.size()/maxNum;
        int stragglers= pts.size()%maxNum;
        int ci = 0;
        for(int aa = 0; aa < maxNum; aa++)
        {
            int avgCount = (aa<stragglers)?numPerAvg+1:numPerAvg;
            avgPts[aa]=pts[ci++]; 
            for(int bb = 1; bb < avgCount; bb++)
                avgPts[aa]+=pts[ci++];
            avgPts[aa]/=avgCount;
        }
        pts = avgPts;
    }
    else if (which == 2) //Bin, and sort.
    {
        DomainBlock *leaf = blockInfo[d]->GetLeafFromIndex(s);
        
        float bb[6];
        leaf->GetBBox(bb);
        
        // bins[i] = (index of the last point seen in bin i, point count).
        int n = 10, nTot = n*n*n;
        vector<pair<int,int> > bins(nTot, pair<int,int>(0, 0));
        
        float dx = bb[1]-bb[0], dy = bb[3]-bb[2], dz = bb[5]-bb[4];
        float dn = 1.0f/(float)(n-1);
        
        for (int k = 0; k < (int)pts.size(); k++)
        {
            if (leaf->InBBox(pts[k]))
            {
                int xi = (int)(((pts[k][0]-bb[0])/dx) / dn);
                int yi = (int)(((pts[k][1]-bb[2])/dy) / dn);
                int zi = (int)(((pts[k][2]-bb[4])/dz) / dn);
                // Flat index into the n*n*n grid. The z stride is n*n;
                // the previous zi*yi*n made distinct cells collide.
                int idx = xi + yi*n + zi*n*n;
                
                bins[idx].first = k;
                bins[idx].second++;
            }
        }
        
        sort(bins.begin(), bins.end(), binCmp);
        
        vector<avtVector> binPts;
        for (int k = 0; k < nTot; k++)
        {
            if (bins[k].second > 0)
                binPts.push_back(pts[bins[k].first]);
        }
                    
        pts = binPts;
    }
    else if (which == 3) //Random interior points.
    {
        pts.resize(0);
        GenerateTestPts(d, s, 10*maxNum, pts);
    }
    else if (which == 4)
    {
        //just use pts.
    }
    //cout<<rank<<": Interior PTS: "<<pts.size()<<endl;
}


// Estimates how much work each domain will attract and then rebalances.
//
// Steps:
//   1. Build blockInfo for all domains.
//   2. For PROB_TREE / RANDOM_WALK / AWFUL: the rank that DomainToRank
//      reports for a domain runs test particles from each of its leaves;
//      the per-leaf results are merged across ranks with MPI_MAX and
//      stored in blockInfo.
//   3. Fill blockPopularity according to popMethod.
//   4. Derive the per-step ADVECT_TIME and per-domain LOAD_TIME, then call
//      UpdateBalanceWorkload().
//
// Contains MPI collectives, so every rank must call it.
//
//   ics  the integral curves to balance for
void
avtChowderICAlgorithm::BalanceWorkload(vector<avtIntegralCurve *> &ics)
{
    DomainBlock::CreateBlockInfo(blockInfo, numDomains, picsFilter->intervalTree,
                                 subdivUniform,
                                 subdivNX, subdivNY, subdivNZ, subdivPct, skipSharedFaces);

    tstPtsSW.start();
    if (popMethod == PROB_TREE || popMethod == RANDOM_WALK || popMethod == AWFUL)
    {
        // One NVALS-int slot per leaf. Slots start at -1 so the MPI_MAX
        // merge keeps whatever the owning rank wrote.
        int totNumLeafs = DomainBlock::TotalNumLeaves(blockInfo);
        vector<int> allBlockData(totNumLeafs*NVALS, -1);

        // leafOffset[i] = number of leaves in domains [0, i). Packing and
        // unpacking both use it, so the two loops always agree on where a
        // leaf lives. When every domain has the same leaf count this equals
        // the previous i*numLeafs layout.
        vector<int> leafOffset(numDomains, 0);
        for (int i = 1; i < numDomains; i++)
            leafOffset[i] = leafOffset[i-1] + blockInfo[i-1]->NumLeafs();
        
        //Generate test seeds.
        for (int i = 0; i < numDomains; i++)
        {
            int numLeafs = blockInfo[i]->NumLeafs();

            BlockIDType d(i,0);
            int owner = DomainToRank(d);
            if (rank != owner)
                continue;

            for (int j = 0; j < numLeafs; j++)
            {
                vector<int> seedData(NVALS, -1);
                vector<avtVector> pts;
                if (subdivUniform || j > 0)
                {
                    GenerateTestPts(i, j, numTestSeeds, pts);
                    
                    tstOPtsSW.start();
                    RunTestPts(i, j, pts, seedData);
                    tstOPtsSW.stop();
                }
                else
                {
                    // Non-uniform subdivision, leaf 0: seed from the actual
                    // curve locations instead of random points.
                    HandleInteriorTestPoints(i, j, ics, numTestSeeds, pts);

                    tstIPtsSW.start();
                    RunTestPts(i, j, pts, seedData);
                    tstIPtsSW.stop();
                }
                
                int idx = (leafOffset[i] + j)*NVALS;
                for (int k = 0; k < NVALS; k++)
                    allBlockData[idx+k] = seedData[k];
            }
        }
        
        tstPtsSyncSW.start();
        MPI_Allreduce(MPI_IN_PLACE, &(allBlockData[0]), allBlockData.size(), MPI_INT, MPI_MAX, VISIT_MPI_COMM);
        tstPtsSyncSW.stop();
        
        //Fill out blockInfo.
        for (int i = 0; i < numDomains; i++)
        {
            int numLeafs = blockInfo[i]->NumLeafs();
            for (int j = 0; j < numLeafs; j++)
            {
                DomainBlock *blk = blockInfo[i]->GetLeafFromIndex(j);
                int idx = (leafOffset[i] + j)*NVALS;
                // Records are 4 ints wide; a negative gid ends the list.
                for (int k = 0; k < NVALS; k += 4)
                {
                    if (allBlockData[idx+k] < 0)
                        break;
                    DomainBlock *dstBlk = DomainBlock::GetBlockFromGID(blockInfo, allBlockData[idx+k]);
                    blk->AddBlockData(dstBlk,
                                      allBlockData[idx+k+1],
                                      allBlockData[idx+k+2],
                                      allBlockData[idx+k+3]);
                }
            }
            blockInfo[i]->UnifyData();
        }
    }
    tstPtsSW.stop();

    probSW.start();
    if (popMethod == PROB_TREE)
        ComputeBlockPopProbTree(ics, blockInfo, blockPopularity, rankPopularity);
    else if (popMethod == RANDOM_WALK)
        ComputeBlockPopRandomWalk(ics);
    else if (popMethod == AWFUL)
    {
        // Popularity = total test-particle iterations recorded per domain.
        for (int i = 0; i < numDomains; i++)
        {
            int n = 0;
            for (int j = 0; j < (int)blockInfo[i]->data.size(); j++)
                n += blockInfo[i]->data[j].numIters;
            blockPopularity[i] = n;
        }
    }
    else if (popMethod == PERFECT)
        for (int i = 0; i < numDomains; i++)
            blockPopularity[i] = perfectBlockPopularity[i];
    probSW.stop();

    if (printRank0Stuff)
    {
        cout<<"Block Popularity"<<endl;
        float t = 0.0f;
        for(int i = 0; i < numDomains; i++)
            t += blockPopularity[i];

        // 128 bytes holds the formatted line for any int and any pair of
        // float values; the previous 32-byte buffer could be overrun by a
        // large "%.1f" value.
        char str[128];
        for(int i = 0; i < numDomains; i++)
        {
            sprintf(str, "%3d: (%.3f) %.1f", i, blockPopularity[i]/t, blockPopularity[i]);
            cout<<str<<endl;
        }
    }
    
    //Compute some stats, and do the balancing...
    int nTestParticles = 0;
    SumFloatAcrossAllProcessors(ADVECT_TIME);
    if (popMethod == PROB_TREE || popMethod == RANDOM_WALK)
    {
        SumIntAcrossAllProcessors(numTestParticlesSteps);
        nTestParticles = numTestParticlesSteps;
    }
    else
        nTestParticles = ics.size();
    
    // Skip the normalisation when nothing was counted, so ADVECT_TIME does
    // not turn into inf/NaN before it reaches the balancer.
    if (nTestParticles > 0)
        ADVECT_TIME /= (float)nTestParticles;

    LOAD_TIME = picsFilter->timeForAllInitialIO;
    SumFloatAcrossAllProcessors(LOAD_TIME);
    LOAD_TIME /= (float)numDomains;

    if (rank == 0) cout<<"ADV/IO time= "<<ADVECT_TIME<<" "<<LOAD_TIME<<endl;

    upBlkSW.start();
    UpdateBalanceWorkload();
    upBlkSW.stop();
    
    //DumpPythonCode(ics);
}

#if 0
//FAST AND WRONG
void
avtChowderICAlgorithm::BalanceWorkload(vector<avtIntegralCurve *> &ics)
{
    DomainBlock::CreateBlockInfo(blockInfo, numDomains, picsFilter->intervalTree,
                                 subdivUniform,
                                 subdivNX, subdivNY, subdivNZ, subdivPct, skipSharedFaces);

    int totNumLeafs = DomainBlock::TotalNumLeaves(blockInfo);
    //int **blockData = new int*[totNumLeafs];
    blockData = new int*[totNumLeafs];
    for (int i = 0; i < totNumLeafs; i++)
    {
        blockData[i] = new int[NVALS];
        for (int j = 0; j < NVALS; j++)
            blockData[i][j] = -1;
    }

    int bdSz = totNumLeafs*NVALS;
    int *gblBlockData = new int[bdSz];

    for (int i = 0; i < bdSz; i++) gblBlockData[i] = -1;

    tstPtsSW.start();
    //Generate test seeds.
    int tot = 0;
    for (int i = 0; i < numDomains; i++)
    {
        BlockIDType d(i,0);
        int owner = DomainToRank(d);
        if (owner != rank)
            continue;

        int numLeafs = blockInfo[i]->NumLeafs();
        for (int j = 0; j < numLeafs; j++)
        {
            vector<avtVector> pts;
            if (subdivUniform || j > 0)
                GenerateTestPts(i, j, numTestSeeds, pts);
            else
                HandleInteriorTestPoints(i, j, ics, numTestSeeds, pts);
            
            RunTestPts(i, j, pts, blockData);
            tot += numTestSeeds;

            int idx = i*numLeafs*NVALS + (j*NVALS);
            for (int k = 0; k < NVALS; k++)
                gblBlockData[idx+k] = blockData[i*numLeafs+j][k];
        }
    }

    MPI_Allreduce(MPI_IN_PLACE, gblBlockData, bdSz, MPI_INT, MPI_MAX, VISIT_MPI_COMM);
    //Copy values back into structure.
    for (int i = 0; i < numDomains; i++)
    {
        int numLeafs = blockInfo[i]->NumLeafs();
        for (int j = 0; j < numLeafs; j++)
        {
            for (int k = 0; k < NVALS; k++)
                blockData[i*numLeafs+j][k] = gblBlockData[i*numLeafs*NVALS + (j*NVALS) + k];
        }
    }
    tstPtsSW.stop();

    /*
    //Make sure data are identical:
    if (rank == 0)
    {
        for (int i = 0; i < numDomains; i++)
        {
            int numLeafs = blockInfo[i]->NumLeafs();
            for (int j = 0; j < numLeafs; j++)
            {
                for (int k = 0; k < NVALS; k++)
                    if (blockData[i*numLeafs+j][k] != gblBlockData[i*numLeafs*NVALS + (j*NVALS) + k])
                    {
                        cout<<"ERROR: values dont match!!!!!! "<<i<<" "<<j<<" "<<k<<endl;
                        EXCEPTION1(VisItException, "MEOW: things no matchy!");
                    }
            }
        }
    }
    */


    //Push blockData into blockInfo.
    float prob;
    for (int i = 0; i < numDomains; i++)
    {
        int numLeafs = blockInfo[i]->NumLeafs();
        for (int j = 0; j < numLeafs; j++)
        {
            DomainBlock *blk = blockInfo[i]->GetLeafFromIndex(j);
            int id = blk->gid;
            for (int k = 0; k < NVALS; k +=4)
            {
                if (blockData[id][k] < 0)
                    break;
                //id, numICs, numIters, totalNumICs.
                DomainBlock *dstBlk = DomainBlock::GetBlockFromGID(blockInfo, blockData[id][k]);
                if (blk->gid == 162)
                    cout<<"COPY: "<<blk->gid<<" "<<dstBlk->gid<<" "<<blockData[id][k+1]<<" "<<blockData[id][k+2]<<" "<<blockData[id][k+3]<<endl;
                blk->AddBlockData(dstBlk,
                        blockData[id][k+1],
                        blockData[id][k+2],
                        blockData[id][k+3]);
            }
        }
        blockInfo[i]->UnifyData();
    }
    if (rank == 0)
    {
        cout<<"SANITY CHECK"<<endl;
        DomainBlock::Dump(blockInfo[6], cout, 2);
    }

    probSW.start();
    if (popMethod == PROB_TREE)
        ComputeBlockPopProbTree(ics, blockInfo, blockPopularity, rankPopularity);
    else if (popMethod == RANDOM_WALK)
        ComputeBlockPopRandomWalk(ics);
    probSW.stop();

    if (printRank0Stuff)
    {
        cout<<"Block Popularity"<<endl;
        float t = 0.0f;
        for(int i = 0; i < numDomains; i++)
            t += blockPopularity[i];

        char str[32];
        for(int i = 0; i < numDomains; i++)
        {
            sprintf(str, "%3d: (%.3f) %.1f", i, blockPopularity[i]/t, blockPopularity[i]);
            cout<<str<<endl;
        }
    }

    //Compute some stats, and do the balancing...
    SumIntAcrossAllProcessors(numTestParticlesSteps);
    SumFloatAcrossAllProcessors(ADVECT_TIME);
    ADVECT_TIME /= (float)numTestParticlesSteps;
    
    LOAD_TIME = picsFilter->timeForAllInitialIO;
    SumFloatAcrossAllProcessors(LOAD_TIME);
    LOAD_TIME /= (float)numDomains;

    //Time exchanging data for load via network.
    //TODO...

    //This is what I was testing with...
    /*
       LOAD_TIME = 10.0;
       ADVECT_TIME = 0.01;
       */

    upBlkSW.start();
    UpdateBalanceWorkload();
    upBlkSW.stop();
    
    //DumpPythonCode(ics);

    for (int i = 0; i < totNumLeafs; i++)
        delete [] blockData[i];
    delete [] blockData;
}
#endif

// Serial random-walk popularity estimate.
//
// For every curve in ics, five walks are started from the leaf of BI that
// contains the curve's current location. Each hop picks an outgoing entry
// of the current block via GetDataIdxFromPct(random_1()), charges that
// entry's avgIt to the block's domain in blkPop and to this rank's slot in
// rnkPop, and moves to the entry's target block. A walk ends when the
// 1000-step budget is used up, when no entry is returned, or when a hop
// lands in the same domain it came from.
//
//   ics     curves to start walks from
//   BI      per-domain block trees
//   blkPop  in/out: accumulated steps per domain
//   rnkPop  in/out: accumulated steps per rank (only [rank] is touched)
void
avtChowderICAlgorithm::ComputeBlockPopRandomWalkSER(vector<avtStreamlineIC *> &ics,
                                                    vector<DomainBlock *> &BI,
                                                    vector<float> &blkPop,
                                                    vector<float> &rnkPop)
{
    const float stepBudget = 1000;
    const int walksPerSeed = 5;
    const int numSeeds = ics.size();

    for (int seed = 0; seed < numSeeds; seed++)
    {
        avtStreamlineIC *curve = ics[seed];
        DomainBlock *origin = BI[curve->blockList.front().domain]->GetLeaf(curve->CurrentLocation());
        if (origin == NULL)
            continue;

        for (int walk = 0; walk < walksPerSeed; walk++)
        {
            DomainBlock *cur = origin;
            int lastDom = cur->dom;
            float remaining = stepBudget;

            while (remaining > 0.0f)
            {
                int pick = cur->GetDataIdxFromPct(random_1());
                if (pick < 0)
                    break;

                float hop = cur->data[pick].avgIt;
                blkPop[cur->dom] += hop;
                rnkPop[rank] += hop;
                remaining -= hop;

                cur = cur->data[pick].blk;
                //Staying in the same domain means we hit a sink.
                if (cur->dom == lastDom)
                    break;
                lastDom = cur->dom;
            }
        }
    }
}

// Parallel random-walk popularity estimate.
//
// Works on at most 25000 curves (a shuffled subset when more are given).
// Rank r handles curves r, r+nProcs, ... and starts five walks from each,
// exactly as in ComputeBlockPopRandomWalkSER, with the step budget taken
// from the first curve's maxSteps. Results are accumulated in
// blockPopularity / rankPopularity and summed over all ranks.
//
// Contains MPI collectives, so every rank must call it, including when
// ics_ is empty.
//
//   ics_  curves to start walks from
void
avtChowderICAlgorithm::ComputeBlockPopRandomWalk(vector<avtIntegralCurve *> &ics_)
{
    vector<avtIntegralCurve *> ics;
    const int numICs = ics_.size();
    const int maxNum = 25000;
    if (numICs > maxNum)
    {
        // Keep a random subset of maxNum curves.
        vector<int> idx(numICs);
        for (int i = 0; i < numICs; i++)
            idx[i] = i;
        random_shuffle(idx.begin(), idx.end());
        ics.resize(maxNum);
        for (int i = 0; i < maxNum; i++)
            ics[i] = ics_[idx[i]];
    }
    else
        ics = ics_;

    // Only look at ics[0] when it exists; indexing an empty vector is
    // undefined. With no curves the loop below does nothing, but the
    // reductions still have to run.
    float maxSteps = 0.0f;
    if (!ics.empty())
        maxSteps = (float)((avtStreamlineIC *)ics[0])->maxSteps;

    int n = ics.size();
    int N = 5;
    for (int i = rank; i < n; i += nProcs)
    {
        avtIntegralCurve *ic = ics[i];
        DomainBlock *blk0 = blockInfo[ic->blockList.front().domain]->GetLeaf(ic->CurrentLocation());
        if (blk0 == NULL)
            continue;

        for (int j = 0; j < N; j++)
        {
            DomainBlock *blk = blk0;
            float s = maxSteps;
            int domPrev = blk->dom;
            while (s > 0.0f)
            {
                int idx = blk->GetDataIdxFromPct(random_1());
                if (idx < 0)
                    break;
                
                float stepsTaken = blk->data[idx].avgIt;
                blockPopularity[blk->dom] += stepsTaken;
                rankPopularity[rank] += stepsTaken;
                s -= stepsTaken;
                blk = blk->data[idx].blk;
                //If we stay in the same domain, then we hit a sink.
                if (blk->dom == domPrev)
                    break;
                domPrev = blk->dom;
            }
        }
    }

    //for (int i = 0; i < numDomains; i++)
    //blockPopularity[i] /= N;

    MPI_Allreduce(MPI_IN_PLACE, &(blockPopularity[0]), numDomains, MPI_FLOAT, MPI_SUM, VISIT_MPI_COMM);
    MPI_Allreduce(MPI_IN_PLACE, &(rankPopularity[0]), nProcs, MPI_FLOAT, MPI_SUM, VISIT_MPI_COMM);
}

// Serial probability-tree popularity estimate.
//
// Counts how many curves currently sit in each leaf block, then pushes
// those counts through the transition data stored on the blocks. For
// every outgoing entry of a block, ceil(count * pct) curves are moved to
// the entry's target, and moved * min(avgIt, remaining steps) is charged
// to the source domain in BP and to this rank's slot in RP. Moved curves
// are queued at their target with the remaining step budget, unless the
// target is in the same domain or the budget is exhausted.
// The step budget is fixed at 1000.
//
//   ics  curves to count
//   BI   per-domain block trees; every domain is assumed to have the same
//        number of leaves as BI[0]
//   BP   in/out: accumulated steps per domain
//   RP   in/out: accumulated steps per rank (only [rank] is touched)
void
avtChowderICAlgorithm::ComputeBlockPopProbTreeSER(vector<avtStreamlineIC *> &ics,
                                                  vector<DomainBlock *> &BI,
                                                  vector<float> &BP, vector<float> &RP)
{
    int numLeafs = BI[0]->NumLeafs();
    int totalBlocks = numLeafs*numDomains;
    
    // Vectors instead of new[]: the toCount array used to be leaked.
    // toCount stays all zero in this serial variant.
    vector<int> icCount(totalBlocks, 0), toCount(totalBlocks, 0);

    int n = ics.size();
    for(int i = 0; i < n; i++)
    {
        avtStreamlineIC *ic = ics[i];
        DomainBlock *blk = BI[ic->blockList.front().domain]->GetLeaf(ic->CurrentLocation());
        if (blk != NULL)
            icCount[blk->gid]++;
    }

    int maxSteps = 1000;

    // Work list of (block gid, (#ics, remaining steps)).
    vector<pair<int, pair<int, int> > >nodes;
    for (int i = 0; i < totalBlocks; i++)
    {
        if (!icCount[i])
            continue;
        float avgIter = 1.*toCount[i]/icCount[i];
        nodes.push_back(make_pair(i, make_pair(icCount[i], (int)(maxSteps-avgIter))));
    }

    // Process entries in FIFO order with a cursor. Erasing the front of a
    // vector on every step made this loop quadratic; the visiting order is
    // the same as before.
    for (size_t head = 0; head < nodes.size(); head++)
    {
        // Copy the fields out first: push_back below may reallocate.
        int start = nodes[head].first;
        int inIcs = nodes[head].second.first;
        int inIt  = nodes[head].second.second;

        DomainBlock *from = DomainBlock::GetBlockFromGID(BI, start);

        for(int b = 0; inIcs > 0 && b < (int)from->data.size(); b++)
        {
            float nProb = from->data[b].pct;

            int to = from->data[b].blk->gid;
            int moving = (int)ceil(inIcs*nProb);

            inIcs -= moving;

            // Never charge more steps than the curves have left.
            float mult = from->data[b].avgIt;
            if(mult > inIt)
                mult = inIt;
            
            BP[from->dom] += moving*mult;
            RP[rank] += moving*mult;
            
            int its = (int)(inIt-mult);
            if(from->dom == from->data[b].blk->dom || its <= 0)
                continue;

            nodes.push_back(make_pair(to, make_pair(moving, its)));
        }
    }
}


// Parallel probability-tree popularity estimate.
//
// Counts how many curves currently sit in each leaf block (rank r counts
// curves r, r+nProcs, ...; the counts are then summed over all ranks).
// Rank r then takes blocks r, r+nProcs, ... as roots and pushes their
// counts through the transition data stored on the blocks, exactly as in
// ComputeBlockPopProbTreeSER, with the step budget taken from the first
// curve's maxSteps. BP and RP are summed over all ranks at the end.
//
// The local switch preAdvectICs (hard-wired to false) selects a variant
// that first advects a copy of every curve on the rank DomainToRank
// reports for its block, and seeds the tree from where the copies end up.
//
// Contains MPI collectives, so every rank must call it, including when
// ics is empty.
//
//   ics  curves to count
//   BI   per-domain block trees; every domain is assumed to have the same
//        number of leaves as BI[0]
//   BP   in/out: accumulated steps per domain
//   RP   in/out: accumulated steps per rank
void
avtChowderICAlgorithm::ComputeBlockPopProbTree(vector<avtIntegralCurve *> &ics,
                                               vector<DomainBlock *> &BI,
                                               vector<float> &BP, vector<float> &RP)
{
    int numLeafs = BI[0]->NumLeafs();
    int totalBlocks = numLeafs*numDomains;
    
    vector<int> icCount(totalBlocks, 0), toCount(totalBlocks, 0);
    int n = ics.size();
    bool preAdvectICs = false;
    
    if (preAdvectICs)
    {
        for (int i = 0; i < n; i++)
        {
            if (rank != DomainToRank(ics[i]->blockList.front()))
                continue;

            avtStreamlineIC *s = makeIC(ics[i]->CurrentLocation());
            picsFilter->FindCandidateBlocks(s);
            GetDomain(s);
            
            int d = s->blockList.front().domain;
            int ap = AdvectParticle(s);
            BP[d] += ap;
            if (!s->status.Terminated())
            {
                d = s->blockList.front().domain;
                DomainBlock *blkN = BI[d]->GetLeaf(s->CurrentLocation());
                // GetLeaf can return NULL (see the check in the other
                // branch); skip the curve rather than dereference it.
                if (blkN != NULL)
                {
                    icCount[blkN->gid]++;
                    toCount[blkN->gid] += ap;
                }
            }
            // The temporary curve used to be leaked.
            delete s;
        }

        MPI_Allreduce(MPI_IN_PLACE, &(toCount[0]), totalBlocks, MPI_INT, MPI_SUM, VISIT_MPI_COMM);
        // BP is deliberately not reduced here. It is reduced once at the
        // end of this function; reducing it here as well made the
        // pre-advection steps count nProcs times.
    }
    else
    {
        for(int i = rank; i < n; i+=nProcs)
        {
            DomainBlock *blk = BI[ics[i]->blockList.front().domain]->GetLeaf(ics[i]->CurrentLocation());
            if (blk != NULL)
                icCount[blk->gid]++;
        }
    }
    MPI_Allreduce(MPI_IN_PLACE, &(icCount[0]), totalBlocks, MPI_INT, MPI_SUM, VISIT_MPI_COMM);

    // Only look at ics[0] when it exists; indexing an empty vector is
    // undefined. With no curves every count is zero and the budget is not
    // used, but the reductions below still have to run.
    int maxSteps = 0;
    if (!ics.empty())
        maxSteps = ((avtStreamlineIC *)ics[0])->maxSteps;

    // Work list of (block gid, (#ics, remaining steps)).
    vector<pair<int, pair<int, int> > >nodes;
    for(int i = rank; i < totalBlocks; i+=nProcs)
    {
        if(!icCount[i])
            continue;
        float avgIter = 1.*toCount[i]/icCount[i];
        nodes.push_back(make_pair(i, make_pair(icCount[i], (int)(maxSteps-avgIter))));
    }

    // Process entries in FIFO order with a cursor. Erasing the front of a
    // vector on every step made this loop quadratic; the visiting order is
    // the same as before.
    for (size_t head = 0; head < nodes.size(); head++)
    {
        // Copy the fields out first: push_back below may reallocate.
        int start = nodes[head].first;
        int inIcs = nodes[head].second.first;
        int inIt  = nodes[head].second.second;

        DomainBlock *from = DomainBlock::GetBlockFromGID(BI, start);

        for(int b = 0; inIcs > 0 && b < (int)from->data.size(); b++)
        {
            float nProb = from->data[b].pct;

            int to = from->data[b].blk->gid;
            int moving = (int)ceil(inIcs*nProb);

            inIcs -= moving;

            // Never charge more steps than the curves have left.
            float mult = from->data[b].avgIt;
            if(mult > inIt)
                mult = inIt;
            
            BP[from->dom] += moving*mult;
            RP[rank] += moving*mult;
            
            int its = (int)(inIt-mult);
            if(from->dom == from->data[b].blk->dom || its <= 0)
                continue;

            nodes.push_back(make_pair(to, make_pair(moving, its)));
        }
    }
    MPI_Allreduce(MPI_IN_PLACE, &(BP[0]), numDomains, MPI_FLOAT, MPI_SUM, VISIT_MPI_COMM);
    MPI_Allreduce(MPI_IN_PLACE, &(RP[0]), nProcs, MPI_FLOAT, MPI_SUM, VISIT_MPI_COMM);
}

// Writes the parent class's statistics to 'os' and then, on rank 0 only, a
// load-balance report: per-rank advection / IO time, the ranks each domain is
// assigned to, per-rank integration step counts and, when doBalance is set,
// a predicted-vs-actual step comparison for every domain.
//
//   os : stream that receives the report.
void
avtChowderICAlgorithm::ReportStatistics(ostream &os)
{
    avtParICAlgorithm::ReportStatistics(os);

    // Everything below is emitted by rank 0 alone.
    if (rank != 0)
        return;

    char line[128];

    os<<"Balance Report: *******************************************"<<endl;

    // Totals across ranks; each rank's time is shown next to its share of these.
    float advectSum = 0.0, ioSum = 0.0;
    for (int r = 0; r < nProcs; r++)
    {
        advectSum += allAdvectTime[r];
        ioSum += allIOTime[r];
    }

    os<<"Rank  T_a   T_IO   (% total)"<<endl;
    for (int r = 0; r < nProcs; r++)
    {
        sprintf(line, "R_%02d %6.4f %6.4f (%4.2f %4.2f)", r,
                allAdvectTime[r], allIOTime[r],
                allAdvectTime[r]/advectSum, allIOTime[r]/ioSum);
        os<<line<<endl;
    }
    os<<endl;

    // For every rank, list the domains whose assignment list contains it.
    os<<"Block Assignments: R_i, {d0, d1, ...}"<<endl;
    for (int r = 0; r < nProcs; r++)
    {
        os<<"R_"<<r<<": {";
        for (int d = 0; d < numDomains; d++)
        {
            for (int k = 0; k < blockAssignments[d].size(); k++)
            {
                if (blockAssignments[d][k] == r)
                    os<<d<<" ";
            }
        }
        os<<"}"<<endl;
    }
    os<<endl;

    // Integration steps per rank, with each rank's fraction of the total.
    int stepSum = 0;
    for (int r = 0; r < nProcs; r++)
        stepSum += allRankIntegrateSteps[r];

    os<<"Rank   #Steps  %Steps"<<endl;
    for (int r = 0; r < nProcs; r++)
    {
        sprintf(line, "R_%03d: %6d  %4.2f", r,
                allRankIntegrateSteps[r],
                (float)allRankIntegrateSteps[r]/(float)stepSum);
        os<<line<<endl;
    }
    os<<endl;

    // The remaining sections only apply when balancing was enabled.
    if (!doBalance)
        return;

    // Totals of predicted (rounded) and actual steps over all domains.
    int predSum = 0, actSum = 0;
    for (int d = 0; d < numDomains; d++)
    {
        predSum += (int)(blockPopularity[d] + 0.5);
        actSum += allDomIntegrateSteps[d];
    }

    os<<"Dom     Pred  (%)    Act  (%)   :    Diff   err (<0 under predict)"<<endl;
    for (int d = 0; d < numDomains; d++)
    {
        float diff = blockPopularity[d]-allDomIntegrateSteps[d];

        // err is a signed ratio: positive = predicted/actual when the
        // prediction is too high, negative = -(actual/predicted) when it is
        // too low, 0 on an exact (truncated) match, and +/-999 when the
        // denominator would not be positive.
        const bool overPredicted =
            blockPopularity[d] > (float)(allDomIntegrateSteps[d]);
        float err;
        if (overPredicted && allDomIntegrateSteps[d] > 0)
            err = (float)blockPopularity[d] / (float)allDomIntegrateSteps[d];
        else if (overPredicted)
            err = 999.0;
        else if ((int)blockPopularity[d] == allDomIntegrateSteps[d])
            err = 0.0;
        else if (blockPopularity[d] > 0)
            err = - (float)allDomIntegrateSteps[d] / (float)blockPopularity[d];
        else
            err = -999.0;

        sprintf(line, "D_%02d: %7d %3.2f %7d %3.2f : %8d %7.3f",
                d,
                (int)blockPopularity[d], blockPopularity[d]/(float)predSum,
                allDomIntegrateSteps[d], allDomIntegrateSteps[d]/(float)actSum,
                (int)diff,  err);
        os<<line<<endl;
    }
    os<<endl;

    // Domains that are assigned to more than one rank.
    os<<"Domain Duplication:"<<endl;
    for (int d = 0; d < numDomains; d++)
    {
        if (blockAssignments[d].size() <= 1)
            continue;
        os<<d<<" "<<blockAssignments[d].size()<<" "<<blockAssignments[d]<<endl;
    }
    os<<endl;

    os<<"AvgTime (load, step) ("<<LOAD_TIME<<", "<<ADVECT_TIME<<") load/step= "<<LOAD_TIME/ADVECT_TIME<<endl;
    os<<endl;

    /*
    if (rank==0)
        DomainBlock::Dump(blockInfo, cout, 0);
    */
}

// Writes "pyCode.py" (rank 0 only): a NumPy script that rebuilds the
// leaf-to-leaf transition matrix A from each leaf's 'data' entries, a
// leaf-GID -> domain table, the per-leaf seed counts S for 'ics', and then
// accumulates the product of S with repeated powers of A into a per-domain
// list named Bal.
//
//   ics : integral curves whose current locations are binned into leaves to
//         form the seed vector.
//
// If the output file cannot be opened a message is printed to cout and
// nothing is written, matching DumpPythonBalanceCode().  The emitted script
// uses the Python 2 'print' statement.
void
avtChowderICAlgorithm::DumpPythonCode(std::vector<avtIntegralCurve *> &ics)
{
    if (rank != 0)
        return;

    FILE *fp = fopen("pyCode.py", "w");
    if (fp == NULL)
    {
        // Previously the NULL handle was passed straight to fprintf.
        cout<<"FAILED to dump out PythonCode."<<endl;
        return;
    }

    int totNumLeafs = DomainBlock::TotalNumLeaves(blockInfo);

    fprintf(fp, "import numpy as np\n");
    fprintf(fp, "np.set_printoptions(precision=2)\n");
    fprintf(fp, "nIters = 10\n");
    fprintf(fp, "A = np.zeros(shape=(%d,%d))\n", totNumLeafs, totNumLeafs);

    // One matrix entry per (leaf, neighbor) pair: row = source leaf GID,
    // column = destination leaf GID, value = that entry's 'pct'.
    for (int i = 0; i < totNumLeafs; i++)
    {
        DomainBlock *blk = DomainBlock::GetBlockFromGID(blockInfo, i);
        for (int j = 0; j < blk->data.size(); j++)
        {
            fprintf(fp, "A[%d][%d] = %f\n", i, blk->data[j].blk->gid, blk->data[j].pct);
        }
    }
    fprintf(fp, "\n");

    // GID -> domain map.
    fprintf(fp, "blkMap = [-1]*%d\n", totNumLeafs);
    for (int i = 0; i < totNumLeafs; i++)
    {
        DomainBlock *blk = DomainBlock::GetBlockFromGID(blockInfo, i);
        fprintf(fp, "blkMap[%d] = %d\n", i, blk->dom);
    }
    fprintf(fp, "\n");

    // Seed vector: count the curves whose current location falls in each
    // leaf.  Every top-level block is queried for every curve, so a location
    // reported by more than one block is counted once per block.
    fprintf(fp, "S = np.zeros(shape=(1,%d))\n", totNumLeafs);
    vector<int> seedCnt(totNumLeafs, 0);
    for (int i = 0; i < ics.size(); i++)
    {
        avtVector p = ics[i]->CurrentLocation();
        for (int j = 0; j < blockInfo.size(); j++)
        {
            DomainBlock *blk = blockInfo[j]->GetLeaf(p);
            if (blk)
                seedCnt[blk->gid]++;
        }
    }
    for (int i = 0; i < totNumLeafs; i++)
        fprintf(fp, "S[0][%d] = %d\n", i, seedCnt[i]);
    fprintf(fp, "\n\n");

    // Python side: raise A to a power, then weight by the seed vector.
    fprintf(fp, "X = np.dot(A,A)\n");
    fprintf(fp, "for i in range(nIters):\n");
    fprintf(fp, " X = np.dot(X,A)\n");
    fprintf(fp, "print X\n\n");
    fprintf(fp, "W = np.dot(S,X)\n");

    // Fold the per-leaf weights into per-domain totals.
    fprintf(fp, "\n\n");
    fprintf(fp, "Bal = [0]*%d\n", numDomains);
    for (int i = 0; i < totNumLeafs; i++)
    {
        fprintf(fp, "Bal[blkMap[%d]] = Bal[blkMap[%d]] + W[0][%d]\n", i, i, i);
    }
    fprintf(fp, "\n\n");
    fprintf(fp, "print Bal\n");

    fclose(fp);
}

void
avtChowderICAlgorithm::DumpPythonBalanceCode()
{
    vector<int> tmp(numDomains);
    for (int i = 0; i < numDomains; i++)
        tmp[i] = domIntegrateSteps[i];
    MPI_Allreduce(MPI_IN_PLACE, &tmp[0], numDomains, MPI_INT, MPI_SUM, VISIT_MPI_COMM);
    if (rank != 0)
        return;
    FILE *fp = fopen("./pyBalanceCode.py", "w");
    if (fp == NULL)
    {
        cout<<"FAILED to dump out PythonBalanceCode."<<endl;
        return;
    }
    fprintf(fp, "popWL = []\n");
    for (int i = 0; i < numDomains; i++)
        fprintf(fp, "popWL.append(Rank(%d, %d, %d))\n", i, i, (int)(blockPopularity[i]+0.5f));

    fprintf(fp, "\n\n");
    fprintf(fp, "wl = []\n");
    for (int i = 0; i < numDomains; i++)
        fprintf(fp, "wl.append(Rank(%d, %d, %d))\n", i, i, tmp[i]);
    fclose(fp);
}

// Collects end-of-run statistics: reduces the per-domain and per-rank
// integration step counts onto rank 0, optionally prints a predicted-vs-actual
// block comparison to stdout (rank 0, at most 8 domains), combines each rank's
// advection and IO timer values into allAdvectTime / allIOTime, and, when
// doBalance is set, hands the reduced per-domain counts to
// DumpBlockStatsData().  The region under "#if 0" is disabled.
void
avtChowderICAlgorithm::DumpStats()
{
    //collect some information.
    // Both reductions use root 0, so the all* arrays receive the sums there.
    MPI_Reduce(&domIntegrateSteps[0], &allDomIntegrateSteps[0], numDomains,
               MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);
    MPI_Reduce(&rankIntegrateSteps[0], &allRankIntegrateSteps[0], nProcs,
               MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);
    /*
    cout<<"ACTUAL STEPS: "<<rank<<" "<<domIntegrateSteps<<endl;
    if (rank==0) cout<<"REDUCE: "<<allDomIntegrateSteps<<endl;
    */

    //Unify blockInfo2 stuff.
    /* 
    int totNumLeafs = DomainBlock::TotalNumLeaves(blockInfo2);
    vector<int> leafData(NVALS);
    for (int i = 0; i < totNumLeafs; i++)
    {
        for (int j = 0; j < NVALS; j++) leafData[j] = 0;
        
        int idx = 0;
        for (int j = 0; j < blockInfo2[i]->data.size(); j++)
        {
            leafData[idx++] = blockInfo2[i]->data[j].blk->gid;
            leafData[idx++] = blockInfo2[i]->data[j].numICs;
            leafData[idx++] = blockInfo2[i]->data[j].numIters;
            leafData[idx++] = blockInfo2[i]->data[j].totalNumICs;
        }
        MPI_Reduce(MPI_IN_PLACE, &leafData[0], NVALS, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);

        blockInfo2[i]->data.resize(0);
        for (int j = 0; j < NVALS; j+=3)
        {
            if (leafData[j] == 0) break;
            blockInfo2[i]->AddBlockData(NULL, leafData[j+0], leafData[j+1], leafData[j+2]);
        }
    }
    */

    Barrier();
    // Console comparison is limited to rank 0 and runs with at most 8 domains.
    if (rank == 0 && numDomains <= 8)
    {
        //DomainBlock::Dump(blockInfo[0], cout, 2, true);
        //DomainBlock::Dump(blockInfo[0], cout, 2);
        cout<<"BLOCK COMPARE: "<<endl;
        // pt = total predicted, at = total actual.  Both are ints, so each
        // "+=" converts the running sum back to int.
        // NOTE(review): if blockPopularity holds floats, fractional parts are
        // truncated at every step here -- confirm that is intended.
        int pt = 0, at = 0;
        for (int i = 0; i < numDomains; i++)
        {
            pt += blockPopularity[i];
            at += allDomIntegrateSteps[i];
        }
        for (int i = 0; i < numDomains; i++)
        {
            // Shares of the respective totals, in percent; the value printed
            // in parentheses after P is half their absolute difference.
            float act = allDomIntegrateSteps[i]/(float)(at)*100.0;
            float pop = blockPopularity[i]/(float)(pt)*100;
            printf("%d: P: %.2f (%5.2f) ACT: %.2f (%8d %8d)\n", i,
                   (float)blockPopularity[i]/(float)pt, fabs(pop-act)/2.0,
                   (float)allDomIntegrateSteps[i]/(float)at,
                   (int)blockPopularity[i], (int)allDomIntegrateSteps[i]);
        }

        /*
        cout<<endl;
        cout<<"RANK COMPARE: "<<endl;
        pt = 0, at = 0;
        for (int i = 0; i < nProcs; i++)
        {
            pt += rankPopularity[i];
            at += allRankIntegrateSteps[i];
        }
        for (int i = 0; i < nProcs; i++)
            printf("%d: %.2f actual: %.2f (%d %d)\n", i, 
                   (float)rankPopularity[i]/(float)pt,
                   (float)allRankIntegrateSteps[i]/(float)at,
                   (int)rankPopularity[i], (int)allRankIntegrateSteps[i]);
        */
    }

    // Each rank fills only its own slot of an otherwise zero vector; the
    // Sum*AcrossAllProcessors calls then combine the vectors into the all*
    // arrays (presumably an element-wise sum over ranks, which would leave
    // every rank's value in its own slot -- confirm against the helper).
    vector<float> advectTime(nProcs, 0.0f), IOTime(nProcs, 0.0f);
    advectTime[rank] = visitTimer->LookupTimer("AdvectParticle()");
    IOTime[rank] = picsFilter->timeForAllInitialIO + visitTimer->LookupTimer("GetDomain()");

    SumFloatArrayAcrossAllProcessors(&advectTime[0], &allAdvectTime[0], nProcs);
    SumFloatArrayAcrossAllProcessors(&IOTime[0], &allIOTime[0], nProcs);

    // NOTE(review): this runs on every rank, but allDomIntegrateSteps was
    // filled by a root-0 MPI_Reduce above; confirm DumpBlockStatsData only
    // relies on the data on rank 0.
    if (doBalance)
        DumpBlockStatsData(allDomIntegrateSteps);

#if 0

    vector<int> resultsDom(numDomains);
    MPI_Reduce(&domIntegrateSteps[0], &resultsDom[0], numDomains, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);
    vector<int> resultsRank(nProcs);
    MPI_Reduce(&rankIntegrateSteps[0], &resultsRank[0], nProcs, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);

    if (rank != 0)
        return;

    float totalIO = visitTimer->LookupTimer("Reading dataset") + visitTimer->LookupTimer("GetDomain()");
    cout<<"TIME: ADVECT: "<<visitTimer->LookupTimer("AdvectParticle()")<<" IO: "<<totalIO<<endl;
    cout<<"Block Assignments: R_i, {d0, d1, ...}"<<endl;
    for (int i = 0; i < nProcs; i++)
    {
        cout<<i<<": {";
        for (int j = 0; j < numDomains; j++)
            for (int k = 0; k < blockAssignments[j].size(); k++)
                if (blockAssignments[j][k] == i)
                    cout<<"R_"<<i<<" ";
        cout<<"}"<<endl;
    }
    cout<<"Load Balance: R, #Steps, %ofSteps"<<endl;
    float tot = 0.0;
    for (int i = 0; i < nProcs; i++)
        tot += resultsRank[i];
    for (int i = 0; i < nProcs; i++)
        cout<<i<<": "<<resultsRank[i], (float)resultsRank[i]/tot;

    if (doBalance)
    {
        cout<<"Avg DomLoad, Step "<<LOAD_TIME<<" "<<ADVECT_TIME<<" ratio= "<<LOAD_TIME/ADVECT_TIME<<endl;
        cout<<"Domain Load D : Pred Act : Diff"<<endl;
        for (int i = 0; i < numDomains; i++)
        {
            float diff = blockPopularity[i]-resultsDom[i];
            cout<<i<<": "<<blockPopularity[i]<<" "<<resultsDom[i]<<" : "<<diff<<" err: "<<diff/resultsDom[i]<<endl;
        }
        cout<<"Domain Duplication:"<<endl;
        for (int i = 0; i < numDomains; i++)
        {
            if (blockAssignments[i].size() > 1)
                cout<<i<<" "<<blockAssignments[i].size()<<" "<<blockAssignments[i]<<endl;
        }

        DumpBlockStatsData(resultsDom);
    }
#endif
}


void
avtChowderICAlgorithm::CommTerm()
{
    if (numTerminated == 0)
        return;

    totalNumICs -= numTerminated;

    //Tell everyone else the good news.
    vector<int> msg(2);
    msg[0] = avtChowderICAlgorithm::TERMINATE;
    msg[1] = numTerminated;
    for (int i = 0; i < nProcs; i++)
        if (i != rank)
            SendMsg(i, msg);
    numTermMsgs++;
    numTerminated = 0;
    //cout<<rank<<" CommTerm("<<numTerm<<") total= "<<totalNumICs<<endl;
}

void
avtChowderICAlgorithm::CommReq()
{
    if (!doStealing || ranksWithMyBlocks.empty())
        return;

    //We've already requested work.
    if (numICReqPosted > 0 || ranksWithMyBlocks.empty())
        return;

    if (stealThreshold > 0)
    {
        int avgLoad = totalNumICs/nProcs;
        if (avgLoad < stealThreshold)
            return;
    }

    int idx = randomIndex(ranksWithMyBlocks.size());
    vector<int> msg(2);
    msg[0] = avtChowderICAlgorithm::REQUEST;
    msg[1] = 1;

    SendMsg(ranksWithMyBlocks[idx], msg);
    numICRequests++;
    numICReqPosted++;
    //debug1<<"NO WORK. REQUEST work from: "<<ranksWithMyBlocks[idx]<<" of "<<ranksWithMyBlocks<<endl;

    //cout<<rank<<" has no work. REQUEST work from: "<<ranksWithMyBlocks[idx]<<" of "<<ranksWithMyBlocks<<endl;

#if 0
    //Beg for some work.
    int n = maxICReq;
    if (maxICReq < 0 || (maxICReq > 0 && maxICReq > ranksWithMyBlocks.size()))
    {
        n = ranksWithMyBlocks.size();
        random_shuffle(ranksWithMyBlocks.begin(), ranksWithMyBlocks.end());
    }

    vector<int> msg(2), victims(n);
    msg[0] = avtChowderICAlgorithm::REQUEST;
    msg[1] = n;

    for (int i = 0; i < n; i++)
    {
        SendMsg(ranksWithMyBlocks[i], msg);
        numICRequests++;
        numICReqPosted++;
        victims[i] = ranksWithMyBlocks[i];
    }

    cout<<rank<<" has no work. REQUEST work from: "<<victims<<" of "<<ranksWithMyBlocks<<endl;
#endif

    /*
       if (rank>=0) cout<<rank<<" has no work. REQUEST work from: "<<victims<<" of "<<ranksWithMyBlocks<<endl;
       if (numICReqPosted > 1)
       if (rank==0) cout<<rank<<" has multiple requests out: "<<numICReqPosted<<endl;
       */
}

    void
avtChowderICAlgorithm::CommICs(list<avtIntegralCurve *> &l, int dstRank)
{
    // Routes every curve in 'l' to a rank.  The destination is 'dstRank'
    // unless that is -1, in which case DomainToRank2() chooses it from the
    // curve's first block.  Curves destined for this rank are appended to
    // activeICs; the others are grouped by destination and sent with one
    // SendICs() call per destination rank.
    //
    //   l       : curves to distribute (the list itself is not modified).
    //   dstRank : fixed destination rank, or -1 to pick one per curve.

    //cout<<rank<<": sending "<<l.size()<<endl;

    map<int, vector<avtIntegralCurve *> > outgoing;
    set<int> receivers;

    for (list<avtIntegralCurve*>::iterator cur = l.begin(); cur != l.end(); cur++)
    {
        avtIntegralCurve *ic = *cur;

        //int domRank = DomainToRank((*s)->blockList.front());

        int target = dstRank;
        if (target == -1)
            target = DomainToRank2(ic->blockList.front());

        receivers.insert(target);

        if (target == rank)
        {
            // Stays here: queue it for local advection.
            activeICs.push_back(ic);
            continue;
        }

        // operator[] creates the (empty) bucket the first time a rank is seen.
        outgoing[target].push_back(ic);
    }

    // Ship each bucket to its rank.
    map<int, vector<avtIntegralCurve *> >::iterator bucket;
    for (bucket = outgoing.begin(); bucket != outgoing.end(); bucket++)
        SendICs(bucket->first, bucket->second);
    
    //if (!l.empty()) ICLOG<<" Sending: "<<l.size()<<" to "<<receivers<<endl;
}

void
avtChowderICAlgorithm::ProcessMessages(std::vector<MsgCommData> &msgs)
{
    for (int i = 0; i < msgs.size(); i++)
    {
        int fromRank = msgs[i].rank;
        vector<int> &msg = msgs[i].message;
        int msgType = msg[0];

        if (msgType == avtChowderICAlgorithm::TERMINATE)
        {
            totalNumICs -= msg[1];
            //ICLOG<<" Recv Term: "<<msg[1]<<" from "<<fromRank<<endl;
        }

        else if (msgType == avtChowderICAlgorithm::REQUEST)
        {
            list<avtIntegralCurve*> ics;
            set<int> reqDoms = rankToBlockMap.find(fromRank)->second;
            int msgNumReqs = msg[1];
            int maxSend = maxStealIC;
            int myLoad = activeICs.size();
            int avgLoad = totalNumICs/nProcs;

            if (maxSend == -1)
                maxSend = activeICs.size() / 2;
            else if (maxSend == -2)
            {
                if (myLoad > avgLoad)
                    maxSend = myLoad-avgLoad;
                else
                    maxSend = 0;
            }

            if (maxSend > 0)
            {
                FindICsInDoms(ics, activeICs, reqDoms, maxSend);
                FindICsInDoms(ics, inactiveICs, reqDoms, maxSend);
            }

            if (maxSend > 0 && !ics.empty())
            {
                vector<avtIntegralCurve *> icsV;
                icsV.insert(icsV.end(), ics.begin(), ics.end());
                SendICs(fromRank, icsV);
                //ICLOG<<" Recv REQ from "<<fromRank<<" Sending WORK "<<ics.size()<<" from doms: "<<reqDoms<<" (myOldLoad= "<<myLoad<<" newLoad= "<<activeICs.size()<<" avgLoad= "<<avgLoad<<")"<<endl;
                numStolen += ics.size();
            }
            else
            {
                //No work to share, send a NO_THANKS.
                vector<int> msg(1);
                msg[0] = avtChowderICAlgorithm::NO_THANKS;
                SendMsg(fromRank, msg);
                /*
                if (maxSend <= 0)
                    ICLOG<<" Recv REQ from "<<fromRank<<" sending NO_THANKS. maxSend==0 (myLoad= "<<myLoad<<" avgLoad= "<<avgLoad<<")"<<endl;
                else if (ics.empty())
                    ICLOG<<" Recv REQ from "<<fromRank<<" sending NO_THANKS. NOWORK ("<<myLoad<<" "<<avgLoad<<")"<<endl;
                */
            }
        }

        else if (msgType == avtChowderICAlgorithm::NO_THANKS)
        {
            numICReqPosted = 0;
            //ICLOG<<" Recv NO_THANKS from "<<fromRank<<endl;
        }
    }
}

bool
avtChowderICAlgorithm::CheckMessages()
{
    vector<MsgCommData> msgs;
    RecvMsg(msgs);
    ProcessMessages(msgs);
    return true;
}

// Moves curves from the front of 'lIn' to the back of 'lOut' while their
// first block's domain is in 'reqDoms', stopping as soon as 'lOut' holds
// 'maxSend' curves.  Curves that were examined but not taken are re-appended
// to the end of 'lIn' in their original relative order; curves that were
// never examined keep their place at the front.
//
//   lOut    : receives the selected curves; its current size counts toward
//             the limit, so repeated calls share one budget.
//   lIn     : list to take curves from.
//   reqDoms : domains a curve must be in to be selected.
//   maxSend : upper bound on the size of lOut.
//
// The limit is tested before a curve is removed from 'lIn'.  Testing it only
// afterwards allowed a call made with lOut already full to add one more curve.
void
avtChowderICAlgorithm::FindICsInDoms(list<avtIntegralCurve *> &lOut,
        list<avtIntegralCurve *> &lIn,
        set<int> &reqDoms,
        int maxSend)
{
    int cnt = lOut.size();

    list<avtIntegralCurve*> skipped;
    while (cnt < maxSend && !lIn.empty())
    {
        avtIntegralCurve *ic = lIn.front();
        lIn.pop_front();
        int d = ic->blockList.front().domain;

        if (reqDoms.find(d) != reqDoms.end())
        {
            lOut.push_back(ic);
            cnt++;
        }
        else
            skipped.push_back(ic);
    }

    // Hand the rejected curves back without copying the list nodes.
    lIn.splice(lIn.end(), skipped);
}

// Returns the sum of the elements of 'a', accumulated front to back starting
// from T(0).  An empty vector yields 0.
template <typename T>
static T
sumArray(const vector<T> &a)
{
    T total = 0;
    for (typename vector<T>::const_iterator it = a.begin(); it != a.end(); ++it)
        total += *it;
    return total;
}

// Computes statistics of the per-process value 'v' over all processes and
// stores each one through its output pointer; a NULL pointer skips that
// statistic together with the parallel call that would produce it.
//
//   v    : this process's value.
//   vm   : receives the result of UnifyMinimumValue(v), if non-NULL.
//   vM   : receives the result of UnifyMaximumValue(v), if non-NULL.
//   vAvg : receives the summed value divided by PAR_Size(), if non-NULL.
//
// NOTE(review): the helpers look like collective operations, in which case
// every process must pass the same set of non-NULL pointers -- confirm.
static void
computeStats(float v, float *vm, float *vM, float *vAvg)
{
    if (vm != NULL)
        *vm =  UnifyMinimumValue(v);

    if (vM != NULL)
        *vM =  UnifyMaximumValue(v);

    if (vAvg == NULL)
        return;

    // Sum in place, then divide by the number of processes.
    *vAvg = v;
    SumFloatAcrossAllProcessors(*vAvg);
    *vAvg /= (float)PAR_Size();
}

#endif

