/*****************************************************************************
 *
 * Copyright (c) 2000 - 2012, Lawrence Livermore National Security, LLC
 * Produced at the Lawrence Livermore National Laboratory
 * LLNL-CODE-442911
 * All rights reserved.
 *
 * This file is  part of VisIt. For  details, see https://visit.llnl.gov/.  The
 * full copyright notice is contained in the file COPYRIGHT located at the root
 * of the VisIt distribution or at http://www.llnl.gov/visit/copyright.html.
 *
 * Redistribution  and  use  in  source  and  binary  forms,  with  or  without
 * modification, are permitted provided that the following conditions are met:
 *
 *  - Redistributions of  source code must  retain the above  copyright notice,
 *    this list of conditions and the disclaimer below.
 *  - Redistributions in binary form must reproduce the above copyright notice,
 *    this  list of  conditions  and  the  disclaimer (as noted below)  in  the
 *    documentation and/or other materials provided with the distribution.
 *  - Neither the name of  the LLNS/LLNL nor the names of  its contributors may
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT  HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR  IMPLIED WARRANTIES, INCLUDING,  BUT NOT  LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND  FITNESS FOR A PARTICULAR  PURPOSE
 * ARE  DISCLAIMED. IN  NO EVENT  SHALL LAWRENCE  LIVERMORE NATIONAL  SECURITY,
 * LLC, THE  U.S.  DEPARTMENT OF  ENERGY  OR  CONTRIBUTORS BE  LIABLE  FOR  ANY
 * DIRECT,  INDIRECT,   INCIDENTAL,   SPECIAL,   EXEMPLARY,  OR   CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT  LIMITED TO, PROCUREMENT OF  SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF  USE, DATA, OR PROFITS; OR  BUSINESS INTERRUPTION) HOWEVER
 * CAUSED  AND  ON  ANY  THEORY  OF  LIABILITY,  WHETHER  IN  CONTRACT,  STRICT
 * LIABILITY, OR TORT  (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY  WAY
 * OUT OF THE  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 *****************************************************************************/

// ************************************************************************* //
//                              avtAChowderICAlgorithm.C                      //
// ************************************************************************* //

#include <avtAChowderICAlgorithm.h>
#include <vtkRectilinearGrid.h>
#include <vtkFloatArray.h>
#include <vtkCellData.h>
#include <vtkDataSetWriter.h>
#include <TimingsManager.h>
#include <avtParallel.h>
#include <DebugStream.h>
#include <VisItStreamUtil.h>
#include <Block.h>

using namespace std;

// Forward declarations of file-local random helpers. Being static, they must
// be defined later in this translation unit (definitions not shown here).
static int randomIndex(int sz);
static float random_1();

#ifdef PARALLEL

// Definitions of the class's static integer codes.
// NOTE(review): presumably message-type identifiers for the termination and
// IC-request messages exchanged between ranks -- confirm against the code
// that builds and processes those messages.
int avtAChowderICAlgorithm::TERMINATE = 1;
int avtAChowderICAlgorithm::REQUEST = 2;

// ****************************************************************************
//  Method: avtAChowderICAlgorithm::avtAChowderICAlgorithm
//
//  Purpose:
//      avtAChowderICAlgorithm constructor.
//
//  Programmer: Dave Pugmire
//  Creation:   January 27, 2009
//
//  Modifications:
//
//    Hank Childs, Sun Jun  6 12:21:30 CDT 2010
//    Remove reference to avtStreamlineFilter, add reference to avtPICSFilter.
//
// ****************************************************************************

avtAChowderICAlgorithm::avtAChowderICAlgorithm(avtPICSFilter *picsFilter, int count)
    : avtParICAlgorithm(picsFilter)
{
    // Fixed seed for the C library generator so every run draws the same
    // sequence of random numbers.
    srand(0);

    // Upper bound on the number of ICs advected between two communication
    // phases (see the inner loop of RunAlgorithm).
    maxCount = count;

    // Test-seeding / probability parameters.
    numTestSeeds   = 20;
    maxTestSteps   = 1000;
    minProbability = 0.05;
    popMethod      = PROB_TREE;

    // Load balancing and block subdivision parameters.
    doBalance     = true;
    subdivUniform = false;
    subdivNX      = 2;
    subdivNY      = 2;
    subdivNZ      = 2;
    subdivPct     = 0.10;

    //DRP. MAKE SURE WE CHANGE THIS.
    picsFilter->cacheQLen = 800;

    // Cost model constants used when rank information is built.
    LOAD_TIME   = 10.0;
    ADVECT_TIME = 0.0;

    // Per-domain and per-rank bookkeeping, all zero-initialized.
    numTestParticlesSteps = 0;
    blockPopularity.resize(numDomains, 0.0f);
    allDomIntegrateSteps.resize(numDomains, 0);
    allAdvectTime.resize(nProcs, 0.0f);
    allIOTime.resize(nProcs, 0.0f);
    allRankIntegrateSteps.resize(nProcs, 0);

    // Diagnostic printing is only enabled for runs with at most this many
    // ranks.
    const int parMin = 1;
    const bool smallRun = (PAR_Size() <= parMin);
    printStuff      = smallRun;
    printRank0Stuff = (PAR_Rank() == 0 && smallRun);

    // Statistics counters.
    numBlocksDuplicated = 0;
    numICRequests       = 0;
    numICReqPosted      = 0;
    numTermMsgs         = 0;
    numTerminated       = 0;
}

// ****************************************************************************
//  Method: avtAChowderICAlgorithm::~avtAChowderICAlgorithm
//
//  Purpose:
//      avtAChowderICAlgorithm destructor.
//
//  Programmer: Dave Pugmire
//  Creation:   January 27, 2009
//
// ****************************************************************************

avtAChowderICAlgorithm::~avtAChowderICAlgorithm()
{
    // Intentionally empty: this destructor performs no explicit cleanup.
}

// ****************************************************************************
//  Method: avtAChowderICAlgorithm::Initialize
//
//  Purpose:
//      Initialization.
//
//  Programmer: Dave Pugmire
//  Creation:   January 27, 2009
//
//  Modifications:
//
//   Dave Pugmire, Mon Mar 23 18:33:10 EDT 2009
//   Make changes for point decomposed domain databases.
//
//   Hank Childs, Fri Apr  3 16:26:24 PDT 2009
//   Change parallelization strategy, since it was loading up on the last
//   processor and we want it to be more spread out.
//
//   Hank Childs, Sun Jun  6 12:21:30 CDT 2010
//   Change name of method called to AddIntegralCurves.
//
// ****************************************************************************

void
avtAChowderICAlgorithm::Initialize(vector<avtIntegralCurve *> &seedPts)
{
    runSW.start();
    initSW.start();

    // Start from 64 receive slots; when there are fewer than 64 ranks use
    // nProcs-1 instead.
    const int defaultRecvs = 64;
    const int numRecvs = (defaultRecvs > nProcs) ? nProcs-1 : defaultRecvs;

    const int msgSz = 2;
    avtParICAlgorithm::InitializeBuffers(seedPts, msgSz, numRecvs, numRecvs);

    // NVALS = values per entry * (neighbors * subdivisions).
    const int nVals = 4;      //(dst, numICs, numIters, totalICsFromSrc)
    const int nNeighbors = 7; //6 neighboring blocks, plus self.
    int nSubdiv = std::max(subdivNX, std::max(subdivNY, subdivNZ));
    if (!subdivUniform)
        nSubdiv += 1;
    NVALS = nVals * (nNeighbors*nSubdiv);

    // Every domain starts out assigned to its statically determined rank.
    blockAssignments.resize(numDomains);
    for (int dom = 0; dom < numDomains; dom++)
    {
        BlockIDType blk(dom,0);
        blockAssignments[dom].push_back(DomainToRank(blk));
    }

    // Optionally rebalance, then distribute the seeds.
    if (doBalance)
        BalanceWorkload(seedPts);
    AddIntegralCurves(seedPts);
    initSW.stop();
    runSW.stop();

    // Invert blockAssignments into a rank -> {domains} map.
    rankToBlockMap.clear();
    for (int dom = 0; dom < numDomains; dom++)
    {
        const size_t nOwners = blockAssignments[dom].size();
        for (size_t k = 0; k < nOwners; k++)
            rankToBlockMap[blockAssignments[dom][k]].insert(dom);
    }

    // Collect every other rank that is assigned a block which this rank
    // either owns statically or has in its lazy-load set.
    set<int> sharingRanks;
    for (int dom = 0; dom < numDomains; dom++)
    {
        BlockIDType blk(dom,0);
        const bool staticallyMine = (DomainToRank(blk) == rank);
        const bool lazilyMine = (lazyLoadBlocks.find(dom) != lazyLoadBlocks.end());
        if (!staticallyMine && !lazilyMine)
            continue;

        const size_t nOwners = blockAssignments[dom].size();
        for (size_t k = 0; k < nOwners; k++)
        {
            const int owner = blockAssignments[dom][k];
            if (owner != rank)
                sharingRanks.insert(owner);
        }
    }

    ranksWithMyBlocks.insert(ranksWithMyBlocks.end(),
                             sharingRanks.begin(), sharingRanks.end());
}

// ****************************************************************************
//  Method: avtAChowderICAlgorithm::AddIntegralCurves
//
//  Purpose:
//      Add streamlines
//
//  Programmer: Dave Pugmire
//  Creation:   December 3, 2009
//
//  Modifications:
//
//   Hank Childs, Thu Jun  3 10:22:16 PDT 2010
//   Use new name "GetCurrentLocation".
//
//   Hank Childs, Fri Jun  4 19:58:30 CDT 2010
//   Use avtStreamlines, not avtStreamlineWrappers.
//
//   Hank Childs, Sun Jun  6 12:21:30 CDT 2010
//   Rename method to AddIntegralCurves.
//
// ****************************************************************************

void
avtAChowderICAlgorithm::AddIntegralCurves(vector<avtIntegralCurve *> &ics)
{
    // Distributes the seed ICs among the ranks assigned to each seed's
    // starting domain.
    //
    // For every domain, the seeds that start in it are numbered in the order
    // they appear in 'ics' and split into contiguous index ranges, one range
    // per rank listed in blockAssignments[domain] (the last rank absorbs the
    // remainder). This rank keeps the ICs whose index falls inside its range
    // and deletes all others.
    //
    // An IC with an empty block list, or whose first block has a domain
    // outside [0, numDomains), cannot be attributed to any domain; it is
    // deleted on every rank instead of being used as an array index.
    //
    // NOTE: pointers left in 'ics' for deleted curves are dangling once this
    // method returns; callers must not dereference them.
    addICsSW.doBarrier = true;
    addICsSW.start();

    // Resolve each seed's starting domain once (-1 == unusable) and count
    // the seeds per domain.
    const size_t nICs = ics.size();
    vector<int> seedDomain(nICs, -1);
    vector<int> seedBlockCounts(numDomains, 0);
    for (size_t i = 0; i < nICs; i++)
    {
        if (ics[i]->blockList.empty())
            continue;
        const int dom = ics[i]->blockList.front().domain;
        if (dom < 0 || dom >= numDomains)
            continue;
        seedDomain[i] = dom;
        seedBlockCounts[dom] ++;
    }

    // Split each domain's seeds into one [first, second) range per rank.
    vector<vector<pair<int,int> > > seedBlockIDRanges(numDomains);
    for (int i = 0; i < numDomains; i++)
    {
        const int nSeeds = seedBlockCounts[i];
        const int nRanks = blockAssignments[i].size();
        // No seeds: nothing to split. No ranks: nobody can take them, and
        // dividing by nRanks would be a division by zero.
        if (nSeeds == 0 || nRanks == 0)
            continue;

        const int nPer = nSeeds/nRanks;
        int i0 = 0;
        for (int j = 0; j < nRanks; j++)
        {
            // The last rank takes whatever is left over.
            const int i1 = (j == nRanks-1) ? nSeeds : i0+nPer;
            seedBlockIDRanges[i].push_back(pair<int,int>(i0, i1));
            i0 = i1;
        }
    }
    if (printRank0Stuff)
    {
        cout<<"Seed Block Counts: "<<seedBlockCounts<<endl;
        cout<<"Seed Block Ranges: [";
        for (int i = 0; i < numDomains; i++)
            cout<<seedBlockIDRanges[i]<<", ";
        cout<<"]"<<endl;
    }
    
    //Get the ICs that I own.
    //DRP. This attempts to balance things up front, and will cause IO to happen.
    vector<int> seedBlockCounter(numDomains, 0);
    for (size_t i = 0; i < nICs; i++)
    {
        avtIntegralCurve *ic = ics[i];
        const int dom = seedDomain[i];

        bool myBlock = false;
        if (dom >= 0)
        {
            // Index of this seed among the seeds starting in 'dom'.
            const int seedIdx = seedBlockCounter[dom];
            seedBlockCounter[dom] ++;

            // Only the first entry for this rank is considered.
            const size_t nOwners = blockAssignments[dom].size();
            for (size_t j = 0; j < nOwners; j++)
                if (blockAssignments[dom][j] == rank)
                {
                    const pair<int,int> &range = seedBlockIDRanges[dom][j];
                    myBlock = (seedIdx >= range.first && seedIdx < range.second);
                    break;
                }
        }

        if (myBlock)
        {
            ic->originatingRank = rank;
            activeICs.push_back(ic);

#ifdef USE_IC_STATE_TRACKING
            ic->InitTrk();
#endif
        }
        else
            delete ic;
    }
    if (DebugStream::Level1())
    {
        debug1<<"My ICcount= "<<activeICs.size()<<endl;
        debug1<<"I own: [";
        for (int i = 0; i < numDomains; i++)
        {
            BlockIDType d(i,0);
            if (OwnDomain(d))
                debug1<<i<<" ";
        }
        debug1<<"]\n";
    }

    // Global number of live ICs: local count summed over all ranks.
    totalNumICs = activeICs.size();
    SumIntAcrossAllProcessors(totalNumICs);

    addICsSW.stop();
}

// ****************************************************************************
// Method:  avtAChowderICAlgorithm::PreRunAlgorithm
//
// Purpose:
//
// Programmer:  Dave Pugmire
// Creation:    March 21, 2012
//
// ****************************************************************************

void
avtAChowderICAlgorithm::PreRunAlgorithm()
{
    // Delegates to the PICS filter to initialize its locators.
    // NOTE(review): presumably invoked by the framework before RunAlgorithm
    // -- confirm against the base class.
    picsFilter->InitializeLocators();
}

// ****************************************************************************
//  Method: avtAChowderICAlgorithm::RunAlgorithm
//
//  Purpose:
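//      Execute the parallel integral curve algorithm: advect active ICs in
//      batches of at most maxCount, handling communication between batches,
//      until HandleCommunication reports that all ICs are done.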
//
//  Programmer: Dave Pugmire
//  Creation:   January 27, 2009
//
//  Modifications:
//
//   Dave Pugmire, Mon Mar 23 18:33:10 EDT 2009
//   Make changes for point decomposed domain databases.
//
//   Hank Childs, Sat Apr 11 23:18:32 CDT 2009
//   Make an explicit call to GetDomain before calling AdvectParticle.
//   If we don't make this call, AdvectParticle will call GetDomain for 
//   us.  But by calling it explicitly, it goes through the avtICAlgorithm
//   bookkeeping logic, meaning that I/O will correctly be counted as I/O,
//   instead of being rolled in with integration time.
//
//   Dave Pugmire, Thu Sep 24 13:52:59 EDT 2009
//   Change Execute to RunAlgorithm.
//
//   Dave Pugmire, Thu Dec  3 13:28:08 EST 2009
//   Move some initialization into RunAlgorithm.
//
//   Hank Childs, Fri Jun  4 19:58:30 CDT 2010
//   Use avtStreamlines, not avtStreamlineWrappers.
//
//   Hank Childs, Sun Jun  6 12:21:30 CDT 2010
//   Rename several methods that reflect the new emphasis in particle 
//   advection, as opposed to streamlines.
//
//   Hank Childs, Sat Nov 27 16:52:12 PST 2010
//   Add progress reporting.
//
//   David Camp, Mon Aug 15 09:36:04 PDT 2011
//   Pathline could have domains set to -1, which would cause them to be put
//   in the oob list and continuously process (hung process).
//
// ****************************************************************************

void
avtAChowderICAlgorithm::RunAlgorithm()
{
    debug1<<"avtPODICAlgorithm::RunAlgorithm() activeICs: "<<activeICs.size()<<" inactiveICs: "<<inactiveICs.size()<<endl;

    int timer = visitTimer->StartTimer();
    
    domIntegrateSteps.resize(numDomains, 0);
    rankIntegrateSteps.resize(nProcs, 0);

    runSW.t += picsFilter->InitialIOTime;
    runSW.start();

    commSW.start();
    bool done = HandleCommunication();
    commSW.stop();
    int round = 0;

    while (!done)
    {
        //if (PAR_Rank() == 0) cout<<"******************************* ROUND "<<round<<endl;
        int cnt = 0;
        while (cnt < maxCount && !activeICs.empty())
        {
            avtIntegralCurve *ic = activeICs.front();
            activeICs.pop_front();
            GetDomain(ic);

            //DRP: Note. AdvectParticle is tweaking the IC status. AtSpatial is being removed IF the domain is loaded.
            //we DON'T want this. We want to control where this particle goes......
            
            avtVector p0 = ic->CurrentLocation();
            double t0 = ic->CurrentTime();
            int itC = ((avtStreamlineIC *)ic)->numSteps;
            int d = ic->blockList.front().domain;
            advSW.start();
            AdvectParticle(ic);
            advSW.stop();
            itC = ((avtStreamlineIC *)ic)->numSteps - itC;
            domIntegrateSteps[d] += itC;
            rankIntegrateSteps[rank] += itC;
            /*
            if (ic->id == 0)cout<<"ic: "<<p0<<" "<<t0<<" --> ";
            if (ic->id == 0)cout<<"ic: "<<ic->CurrentLocation()<<" "<<ic->CurrentTime()<<" "<<ic->status<<" "<<ic->blockList<<endl;
            */

            if (ic->status.Terminated())
            {
                terminatedICs.push_back(ic);
                numTerminated++;
            }
            else
                inactiveICs.push_back(ic);

            CheckMessages();
            cnt++;
        }
        
        commSW.start();
        done = HandleCommunication();
        commSW.stop();
        round++;
    }

    runSW.stop();
    if (rank == 0)
        cout<<"ACHO: "<<runSW.t<<": IN: "<<initSW.t<<" IO: "<<ioSW->t<<" CM: "<<commSW.t<<endl;

    stringstream sstr, tstr;
    float maxInit = UnifyMaximumValue(initSW.t);
    float maxTst = UnifyMaximumValue(tstPtsSW.t);
    float maxProb = UnifyMaximumValue(probSW.t);
    float maxUpBlk = UnifyMaximumValue(upBlkSW.t);
    float maxLdBlk = UnifyMaximumValue(ldBlkSW.t);
    int minReq = UnifyMinimumValue(numICRequests);
    int maxReq = UnifyMaximumValue(numICRequests);
    int totReq = numICRequests;
    SumIntAcrossAllProcessors(totReq);
    
    int minTerm = UnifyMinimumValue(numTermMsgs);
    int maxTerm = UnifyMaximumValue(numTermMsgs);
    int totTerm = numTermMsgs;
    SumIntAcrossAllProcessors(numTermMsgs);

    tstr<<" IN%[S "<<maxTst/maxInit<<" P "<<maxProb/maxInit<<" U "<<maxUpBlk/maxInit<<" L "<<maxLdBlk/maxInit<<" addICs "<<addICsSW.t/initSW.t<<"]";
    
    sstr<<"subDiv: "<<subdivUniform<<" ["<<subdivNX<<" "<<subdivNY<<" "<<subdivNZ<<"] "<<subdivPct<<" ";
    sstr<<"tst: "<<numTestSeeds<<" ";
    sstr<<"prob: "<<minProbability<<" ";
    sstr<<"pop: "<<popMethod<<" ";
    sstr<<"ICReq:("<<maxICReq<<" "<<maxStealIC<<")("<<minReq<<" "<<maxReq<<"):"<<(float)totReq/(float)nProcs<<" ";
    sstr<<"Term:("<<minTerm<<" "<<maxTerm<<"):"<<(float)totTerm/(float)nProcs<<" ";
    DumpInfo(tstr.str(), sstr.str());
    
    DumpStats();
    TotalTime.value += visitTimer->StopTimer(timer, "Execute");
    
    vector<float> tmp(nProcs, 0.0), advT(nProcs, 0.0), waitT(nProcs, 0.0);
    tmp[rank] = advSW.t;
    MPI_Reduce(&tmp[0], &advT[0], nProcs, MPI_FLOAT, MPI_SUM, 0, VISIT_MPI_COMM);
    tmp[rank] = syncWaitSW.t;
    MPI_Reduce(&tmp[0], &waitT[0], nProcs, MPI_FLOAT, MPI_SUM, 0, VISIT_MPI_COMM);

    vector<int> tmpi(nProcs, 0), steps(nProcs, 0);
    tmpi[rank] = rankIntegrateSteps[rank];
    MPI_Reduce(&tmpi[0], &steps[0], nProcs, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);
    int totSteps = 0;
    for (int i = 0; i < nProcs; i++) totSteps += steps[i];

    vector<vector<int> > tmpii(nProcs), rankDomStps(nProcs);
    for (int i = 0; i < nProcs; i++) tmpii[i].resize(numDomains);
    for (int i = 0; i < numDomains; i++)
        tmpii[rank][i] = domIntegrateSteps[i];

    for (int i = 0; i < nProcs; i++)
    {
        rankDomStps[i].resize(numDomains);
        MPI_Reduce(&(tmpii[i][0]), &(rankDomStps[i][0]), numDomains, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);

        //if (rank == 0) cout<<"meow: "<<i<<" "<<rankDomStps[i]<<endl;
    }


    //if (rank == 0)
    if (false)
    {
        vector<int> totStps(numDomains, 0);
        for (int d = 0; d < numDomains; d++)
            for (int r = 0; r < nProcs; r++)
                totStps[d] += rankDomStps[r][d];
        
        cout<<"********************************************************"<<endl;
        cout<<"R:  advT waitT %step [doms] [domStp%]"<<endl;
        for (int i = 0; i < nProcs; i++)
        {
            cout<<i<<": ";
            char x[128];
            sprintf(x, "%4.3f %4.3f %4.3f", advT[i], waitT[i], (float)steps[i]/(float)totSteps);
            cout<<x<<" ";
            vector<int> myB;
            for (int d = 0; d < numDomains; d++)
                for (int r = 0; r < blockAssignments[d].size(); r++)
                    if (blockAssignments[d][r] == i)
                        myB.push_back(d);
            cout<<myB;
            vector<float> myStp;
            for (int d = 0; d < myB.size(); d++)
            {
                float n = 0.0;
                if (totStps[myB[d]] > 0)
                    n = (float)rankDomStps[i][myB[d]] / (float)totStps[myB[d]];
                
                //n = (float)totStps[myB[d]];
                //myStp.push_back(n);
                //n = (float)rankDomStps[i][myB[d]];
                myStp.push_back(n);
            }
            cout<<" "<<myStp;
            /*
            cout<<"[";
            for (int d = 0; d < myB.size(); d++)
            {
                if (d != 0) cout<<" ";
                cout<<myB[d]<<":"<<myStp[d];
            }
            cout<<"]";
            */
            cout<<endl;
        }
    }
}

// ****************************************************************************
// Method:  avtAChowderICAlgorithm::HandleCommunication
//
// Purpose: Process communication.
//
// Programmer:  Dave Pugmire
// Creation:    March 21, 2012
//
// Modifications:
//
//   Dave Pugmire, Fri Mar  8 15:49:14 EST 2013
//   Bug fix. Ensure that the same IC isn't sent to the same rank. Also, when
//   an IC is received, set the domain from the particle point.
//
// ****************************************************************************

bool
avtAChowderICAlgorithm::HandleCommunication()
{
    //cout<<rank<<" HandleCommunication("<<numTerm<<") totalICs= "<<totalNumICs<<endl;
    CheckPendingSendRequests();
    
    //Send out ICs.
    if (!inactiveICs.empty())
    {
        CommICs(inactiveICs);
        inactiveICs.clear();
    }
    //Send out terminations.
    if (activeICs.empty())
        CommTerm();
    if (totalNumICs == 0)
        return true;

    //Now, see if anything is coming my way.
    list<ICCommData> ics;
    vector<MsgCommData> msgs;
    bool blockAndWait = activeICs.empty() && (totalNumICs > 0);

    if (blockAndWait)
        CommReq();
    
    //    if (blockAndWait)
    //  cout<<rank<<": block and wait...."<<endl;
    syncWaitSW.start();
    bool b = RecvAny(&msgs, &ics, NULL, blockAndWait);
    syncWaitSW.stop();
    //if (blockAndWait && rank==0) cout<<rank<<" backFrom dead. "<<msgs.size()<<" "<<ics.size()<<" "<<b<<endl;
    if (blockAndWait && rank==0 && msgs.size()==0 && ics.size()==0) cout<<" BLOCK AND WAIT FAILED **********"<<endl;
    //    if (blockAndWait)
    //  cout<<rank<<": block and wait.... BACK FROM THE DEAD. "<<msgs.size()<<" "<<ics.size()<<endl;

    ProcessMessages(msgs);

    //Got some ICs, so reset the IC Req counter.
    if (!ics.empty())
    {
        //if (numICReqPosted>0&&rank>=0) cout<<rank<<" Received some ICs: zapReqPosts. #ics= "<<ics.size()<<endl;
        numICReqPosted = 0;
    }

    list<avtIntegralCurve *> notMine;
    list<ICCommData>::iterator s;
    for (s = ics.begin(); s != ics.end(); s++)
    {
        avtIntegralCurve *ic = (*s).ic;
        //See if I have this block.
        BlockIDType blk;
        list<BlockIDType> tmp;
        bool blockFound = false;
        while (!ic->blockList.empty())
        {
            blk = ic->blockList.front();
            ic->blockList.pop_front();
            bool mine = DomainLoaded(blk);
            if (!mine && lazyLoadBlocks.find(blk.domain) != lazyLoadBlocks.end())
            {
                //cout<<rank<<" "<<blk.domain<<": *********LAZY LOAD!!!!!!"<<endl;
                avtVector pt;
                GetDomain(blk, pt);
                numBlocksDuplicated++;
                mine = true;
            }       
            if (mine)
            {
                if (picsFilter->ICInBlock(ic, blk))
                {
                    ic->status.ClearSpatialBoundary();
                    ic->blockList.clear();
                    ic->blockList.push_back(blk);
                    blockFound = true;
                    activeICs.push_back(ic);
                    break;
                }
            }
            else
                tmp.push_back(blk);
        }

        //IC Not in my blocks.  Terminate if blockList empty, or send to
        //block owner of next block in list.
        if (!blockFound)
        {
            ic->blockList = tmp;
            if (ic->blockList.empty())
            {
                terminatedICs.push_back(ic);
                numTerminated++;
            }
            else
                notMine.push_back(ic);
        }
    }

    if (!notMine.empty())
        CommICs(notMine);

    //    if (numTerm)
    //        CommTerm(numTerm);

    //cout<<rank<<" totalNumICs= "<<totalNumICs<<endl;
    CheckPendingSendRequests();
    if (totalNumICs < 0)
        EXCEPTION1(VisItException, "Error: Number of ICs is negative. Bug in communication");
    
    return (totalNumICs == 0);

#if 0
    int numICs = inactiveICs.size() + activeICs.size();
    
    //cout<<PAR_Rank()<<" numICs= "<<numICs<<endl;
    //See if we're done.
    syncWaitSW.start();
    SumIntAcrossAllProcessors(numICs);
    syncWaitSW.stop();
    MsgCnt.value++;

    /*
    int numInactive = inactiveICs.size();
    SumIntAcrossAllProcessors(numInactive);
    if (rank == 0) cout<<"avtPODICAlgorithm::HandleCommunication() numInActives= "<<numInactive<<" total= "<<numICs<<endl;
    */
    
    //debug1<<"avtPODICAlgorithm::HandleCommunication() numICs= "<<numICs<<endl;
    if (numICs == 0)
        return true;

    // if you want this algo operate on demand (load all data blocks, uncomment
    // this friendly piece of code.
    /*
    //Force it to be POS.
    activeICs.insert(activeICs.end(), inactiveICs.begin(), inactiveICs.end());
    inactiveICs.clear();
    return false;
    */

    //Tell everyone how many ICs are coming their way.
    int *icCounts = new int[nProcs], *allCounts = new int[nProcs];
    for (int i = 0; i < nProcs; i++)
        icCounts[i] = 0;
    
    list<avtIntegralCurve*>::iterator s;
    map<int, vector<avtIntegralCurve *> > sendICs;
    map<int, vector<avtIntegralCurve *> >::iterator it;
    list<avtIntegralCurve*> tmp;
    for (s = inactiveICs.begin(); s != inactiveICs.end(); s++)
    {
        //int domRank = DomainToRank((*s)->blockList.front());
        int domRank = DomainToRank2((*s)->blockList.front());
        //cout<<"Sending ("<<(*s)->blockList.front()<<") r= "<<rank<<" --> r= "<<domRank<<endl;
        if (domRank == rank)
        {
            activeICs.push_back(*s);
            continue;
        }
        
        icCounts[domRank]++;
            
        //Add to sending map.
        it = sendICs.find(domRank);
        if (it == sendICs.end())
        {
            vector<avtIntegralCurve *> v;
            v.push_back(*s);
            sendICs[domRank] = v;
        }
        else
            it->second.push_back(*s);
    }
    inactiveICs.clear();
    
    SumIntArrayAcrossAllProcessors(icCounts, allCounts, nProcs);
    bool anyToSend = false;
    for (int i = 0; i < nProcs && !anyToSend; i++)
        anyToSend = (allCounts[i] > 0);
    
    int incomingCnt = allCounts[rank];

    /*
    if (rank == 0)
    {
        cout<<"HandleComm: "<<numICs<<" [";
        for (int i = 0; i < nProcs; i++)
            cout<<allCounts[i]<<" ";
        cout<<"]"<<endl;
    }
    */
    
    //Send out my ICs.
    for (it = sendICs.begin(); it != sendICs.end(); it++)
        SendICs(it->first, it->second);

    //Wait till I get all my ICs.
    while (incomingCnt > 0)
    {
        list<ICCommData> ics;
        list<ICCommData>::iterator s;

        RecvAny(NULL, &ics, NULL, true);
        for (s = ics.begin(); s != ics.end(); s++)
        {
            avtIntegralCurve *ic = (*s).ic;

            //See if I have this block.
            BlockIDType blk;
            list<BlockIDType> tmp;
            bool blockFound = false;
            while (!ic->blockList.empty())
            {
                blk = ic->blockList.front();
                ic->blockList.pop_front();
                bool mine = DomainLoaded(blk);
                if (!mine && lazyLoadBlocks.find(blk.domain) != lazyLoadBlocks.end())
                {
                    //cout<<rank<<" "<<blk.domain<<": **************************************************LAZY LOAD!!!!!!"<<endl;
                    avtVector pt;
                    GetDomain(blk, pt);
                    numBlocksDuplicated++;
                    mine = true;
                }
                if (mine)
                {
                    if (picsFilter->ICInBlock(ic, blk))
                    {
                        ic->status.ClearSpatialBoundary();
                        ic->blockList.clear();
                        ic->blockList.push_back(blk);
                        blockFound = true;
                        activeICs.push_back(ic);
                        break;
                    }
                }
                else
                    tmp.push_back(blk);
            }

            //IC Not in my blocks.  Terminate if blockList empty, or send to
            //block owner of next block in list.
            if (!blockFound)
            {
                ic->blockList = tmp;
                if (ic->blockList.empty())
                    terminatedICs.push_back(ic);
                else
                    inactiveICs.push_back(ic);
            }
        }
        
        incomingCnt -= ics.size();
        CheckPendingSendRequests();
    }
    
    CheckPendingSendRequests(); 
    delete [] icCounts;
    delete [] allCounts;
    
    return false;
#endif
}


/*
void
avtAChowderICAlgorithm::RunAlgorithm()
{
    cout<<rank<<": "<<__LINE__<<endl;
    while (1)
    {
        cout<<rank<<": "<<__LINE__<<endl;
        ActivateICs();
        cout<<rank<<": "<<__LINE__<<endl;
        if (activeICs.empty())
            break;

        cout<<rank<<": "<<__LINE__<<endl;
        while (!activeICs.empty())
        {
            avtIntegralCurve *ic = activeICs.front();
            activeICs.pop_front();
            GetDomain(ic);
            cout<<rank<<": "<<__LINE__<<endl;
            do
            {
                AdvectParticle(ic);
                cout<<rank<<": "<<__LINE__<<" "<<ic->id<<" "<<ic->status<<endl;
            }
            while (ic->status.Integrateable() &&
                    DomainLoaded(ic->blockList.front()));

            cout<<rank<<": "<<__LINE__<<endl;

            if (ic->status.EncounteredSpatialBoundary())
                inactiveICs.push_back(ic);
            else
                terminatedICs.push_back(ic);
            cout<<rank<<": "<<__LINE__<<endl;
        }
    }
}
*/

// Plain record of four values (num, dom, prob, ri) with a stream printer.
class node
{
public:
    // Default: num = 0, dom = -1 (no domain), prob = 0, ri = 0.
    node() : dom(-1), ri(0), num(0), prob(0.0) {}

    // Note the argument order: (num, dom, prob, ri).
    node(int n, int d, float p, int i) : dom(d), ri(i), num(n), prob(p) {}

    int dom, ri, num;
    float prob;
};

// Prints a node as "(num dom prob ri)".
inline std::ostream&
operator<<(std::ostream &out, const node &n)
{
    out<<"("<<n.num<<" "<<n.dom<<" "<<n.prob<<" "<<n.ri<<")";
    return out;
}

// Record of a domain id (d) with two float values (p, it), plus helpers to
// search and order lists of such records.
class blockStat
{
public:
    // Default: d = -1 (no domain), p = 0, it = 0.
    blockStat() : d(-1), p(0.0f), it(0.0f) {}
    blockStat(int _d, float _p, float _it) : d(_d), p(_p), it(_it) {}

    int d;
    float p, it;

    // Finds the first entry whose p is >= r. On success writes that entry's
    // domain to nextDom and its 'it' value (truncated to int) to stepsTaken
    // and returns true. Returns false, leaving both outputs untouched, when
    // no entry qualifies.
    static bool GetEntry(float r, std::vector<blockStat> &blockStats, int &nextDom, int &stepsTaken)
    {
        std::vector<blockStat>::const_iterator entry = blockStats.begin();
        for ( ; entry != blockStats.end(); ++entry)
        {
            if (r <= entry->p)
            {
                nextDom = entry->d;
                stepsTaken = static_cast<int>(entry->it);
                return true;
            }
        }
        return false;
    }

    // Strict ordering by ascending p.
    static bool cmp(blockStat x, blockStat y)
    {
        return x.p < y.p;
    }
};

// Prints a blockStat as "(d p it)".
inline std::ostream&
operator<<(std::ostream &out, const blockStat &b)
{
    out<<"("<<b.d<<" "<<b.p<<" "<<b.it<<")";
    return out;
}


void
avtAChowderICAlgorithm::BuildDomainInfo(std::vector<domInfo> &di)
{
    di.resize(0);
    float dt = 0.0;
    for (int i = 0; i < numDomains; i++)
        dt += blockPopularity[i];
    
    for (int i = 0; i < numDomains; i++)
        di.push_back(domInfo(i, blockPopularity[i]/dt));
}

void
avtAChowderICAlgorithm::BuildRankInfo2(std::vector<rankInfo2> &r)
{
    r.resize(0);
    r.resize(nProcs);
    
    for (int p = 0; p < nProcs; p++)
    {
        r[p].rank = p;
        r[p].iters = 0.0f;
        r[p].it_cost = 0.0f;
        r[p].io_cost = 0.0f;
        
        for (int d = 0; d < numDomains; d++)
            for (int i = 0; i < blockAssignments[d].size(); i++)
                if (blockAssignments[d][i] == p)
                {
                    rankInfo2::blockInfo bi;
                    bi.iters = blockPopularity[d]/(float)blockAssignments[d].size();
                    bi.it_cost = bi.iters*ADVECT_TIME;
                    bi.dom = d;
                    if (i == 0)
                        bi.loadCost = 0.0f;
                    else
                        bi.loadCost = LOAD_TIME;

                    r[p].blocks.push_back(bi);
                    r[p].iters += bi.iters;

                    r[p].it_cost += bi.it_cost;
                    r[p].io_cost += bi.loadCost;
                }
        
        r[p].t_cost = r[p].it_cost + r[p].io_cost;
        sort(r[p].blocks.begin(), r[p].blocks.end(), rankInfo2::d_cmp);
    }
}

void
avtAChowderICAlgorithm::BuildRankInfo(std::vector<rankInfo> &ri)
{
    vector<float> balance;
    ComputeBalance(balance);
    
    ri.resize(0);
    for (int p = 0; p < nProcs; p++)
    {
        vector<int> doms;
        for (int d = 0; d < numDomains; d++)
            for (int i = 0; i < blockAssignments[d].size(); i++)
                if (blockAssignments[d][i] == p)
                    doms.push_back(d);
        
        ri.push_back(rankInfo(p, balance[p], doms));
    }

    /*
    if (rank == 0)
    {
        cout<<"blockAssignments:"<<endl;
        for (int i=0; i < numDomains; i++)
            cout<<i<<" rankies= "<<blockAssignments[i]<<endl;
        rankInfo::printIt(ri);
    }
    */
}

// Sort predicate for (domain, duplication count) pairs: true when x needs
// strictly more duplication than y, so sorting puts the neediest block first.
static bool biggestDup(pair<int,int> x, pair<int,int> y)
{
    return x.second > y.second;
}

// Computes, per domain, how many additional copies it should receive.
//
// A domain's popularity is divided by the "perfect" share (1/numDomains) and
// rounded to the nearest integer to give a target number of owners; the
// number of ranks already holding the domain is subtracted from positive
// targets and the result is clamped at zero.
//
// Arguments:
//   dinfo            : per-domain info; only the 'pop' field is read.
//   blockAssignments : per-domain list of ranks currently holding the domain.
//
// Returns: a vector with one entry per domain giving the extra copies needed.
static vector<int>
computeDomainMcNeedy(vector<domInfo> &dinfo, vector<vector<int> > &blockAssignments)
{
    const int numDomains = dinfo.size();
    const float perfectBal = 1.0/(float)numDomains;

    vector<int> blockDup(numDomains);
    for (int dom = 0; dom < numDomains; dom++)
    {
        int needed = (int)(dinfo[dom].pop/perfectBal + 0.5);
        if (needed > 0)
            needed -= blockAssignments[dom].size();

        blockDup[dom] = (needed < 0) ? 0 : needed;
    }

    return blockDup;
}

// Small record pairing a rank (and optionally a domain) with a cost value,
// used when ranking busy blocks and lazy ranks.
class procInfo
{
public:
    // Zeroed record.
    procInfo() : rank(0), dom(0), val(0.0f) {}
    // Rank-only record; dom is set to -1.
    procInfo(int r, float v) : rank(r), dom(-1), val(v) {}
    // Record for a specific (rank, domain) pair.
    procInfo(int r, int d, float v) : rank(r), dom(d), val(v) {}

    int rank;
    int dom;
    float val;

    // Ascending order by val.
    static bool cmp(procInfo x, procInfo y) {return x.val < y.val;}
    // Descending order by val.
    static bool rcmp(procInfo x, procInfo y) {return x.val > y.val;}
};
// Streams a procInfo as "(rank dom val)".
inline ostream&
operator<<(ostream &out, const procInfo &p)
{
    return out<<"("<<p.rank<<" "<<p.dom<<" "<<p.val<<")";
}

//DRP
// Cost-based, rank-centric rebalancing.
//
// Repeats up to 10 rounds. In each round:
//   1. "Busy" entries are collected: every (rank, domain) whose rank has a
//      total cost above busyThresh and whose per-domain advection cost is
//      itself above busyThresh (1.5 * LOAD_TIME, i.e. duplicating the domain
//      must be worth more than one load).
//   2. "Lazy" ranks are those whose total cost is below the cost of the least
//      busy entry. If none qualify and there are more than two busy entries,
//      the cutoff is relaxed to the average busy cost.
//   3. The i-th laziest rank is assigned the domain of the i-th busiest
//      entry via AssignBlock, and the cost table is rebuilt.
// The loop stops early when there is nothing busy or nobody lazy.
void
avtAChowderICAlgorithm::DoRankCentricBalancing2()
{
    vector<rankInfo2> rinfo;

    // Build the cost table once; AssignBlock is the only thing below that
    // invalidates it, and it is rebuilt right after each round of assignments.
    BuildRankInfo2(rinfo);
    if (printStuff) rankInfo2::printIt(rinfo);

    //busyThresh: must be > load time.
    //lazyThresh: Not sure here.... maybe some fraction of least busy?
    float busyThresh = 1.5 * LOAD_TIME;
    float lazyThresh = 0.0;

    if (printRank0Stuff) cout<<"BEGIN REBALANCE....."<<endl;
    int cnt = 0;
    while (cnt < 10)
    {
        vector<procInfo> busy, lazy;
        //Find busy blocks: advection cost exceeds busyThresh on a rank that
        //is itself over the threshold.
        for (int p = 0; p < nProcs; p++)
        {
            if (rinfo[p].t_cost > busyThresh)
            {
                for (size_t d = 0; d < rinfo[p].blocks.size(); d++)
                    if (rinfo[p].blocks[d].it_cost > busyThresh)
                    {
                        busy.push_back(procInfo(p, rinfo[p].blocks[d].dom, rinfo[p].blocks[d].it_cost));
                    }
            }
        }
        if (busy.empty())
            break;

        // Busiest first; the last entry is therefore the least busy one.
        sort(busy.begin(), busy.end(), procInfo::rcmp);
        if (printRank0Stuff) cout<<"BUSY= "<<busy<<endl;
        lazyThresh = busy[busy.size()-1].val;
        for (int p = 0; p < nProcs; p++)
            if (rinfo[p].t_cost < lazyThresh)
                lazy.push_back(procInfo(p, rinfo[p].t_cost));
        if (printRank0Stuff) cout<<"** LT0= "<<lazyThresh<<endl;

        //Didn't find anyone.... loosen the threshold...
        if (lazy.empty())
        {
            int nb = busy.size();
            if (nb > 2)
            {
                //Set the cutoff to be avg busy val.
                float sum = 0.0;
                for (int i = 0; i < nb; i++)
                    sum += busy[i].val;
                lazyThresh = sum / (float)nb;

                /*
                //or, the mid point...
                lazyThresh = busy[nb/2].val;
                */

                if (printRank0Stuff) cout<<"** LT1= "<<lazyThresh<<endl;
                for (int p = 0; p < nProcs; p++)
                    if (rinfo[p].t_cost < lazyThresh)
                        lazy.push_back(procInfo(p, rinfo[p].t_cost));
            }
        }

        if (lazy.empty())
            break;

        // Laziest first, so it is paired with the busiest block.
        sort(lazy.begin(), lazy.end(), procInfo::cmp);

        if (printRank0Stuff) cout<<"LT= "<<lazyThresh<<" BUSY: "<<busy<<" LAZY: "<<lazy<<endl;
        int n = min(busy.size(), lazy.size());
        for (int i = 0; i < n; i++)
        {
            AssignBlock(lazy[i].rank, busy[i].dom);
            if(printRank0Stuff)cout<<"   "<<lazy[i].rank<<" loads "<<busy[i].dom<<endl;
        }

        BuildRankInfo2(rinfo);
        if (printStuff) rankInfo2::printIt(rinfo);

        cnt++;
    }

    // rinfo is already current here: every exit path from the loop leaves it
    // rebuilt after the last AssignBlock call, so no further rebuild is needed.
    if(printRank0Stuff)cout<<"ALL DONE WITH REBALNCE: "<<endl;
    if (printStuff) rankInfo2::printIt(rinfo);
}

void
avtAChowderICAlgorithm::DoRankCentricBalancing()
{
    vector<domInfo> dinfo;
    vector<rankInfo> rinfo;
    
    float perfectBal = 1.0/(float)nProcs;
    float busyThresh = perfectBal * 1.0;
    float lazyThresh = perfectBal * 1.0;
    BuildRankInfo(rinfo);
    BuildDomainInfo(dinfo);

    //This is a rank centric balancing.
    int cnt = 0;
    while (true)
    {
        vector<int> busy, lazy;
        if (printStuff) rankInfo::printIt(rinfo);
        busy = rankInfo::getThresh(rinfo, busyThresh, true, true);
        lazy = rankInfo::getThresh(rinfo, lazyThresh, false, true);
        if (printRank0Stuff) cout<<"busy: "<<busy<<" lazy: "<<lazy<<" ("<<busyThresh<<" "<<lazyThresh<<")"<<endl;
        
        if (lazy.empty() || busy.empty())
            break;

        int n = min(lazy.size(), busy.size());
        for (int i = 0; i < n; i++)
        {
            //Pick a random domain. Probably want to do a PDF based selection.
            //need to determine which block is causing the busy, and how to best assign.
            //NOTE: Ditto on the send. When sending seed, need to send according to a PDF so that
            //things are evenely distributed.
            //Also, don't assign blocks if you'll blow the cache.
            int di = randomIndex(rinfo[busy[i]].doms.size());
            AssignBlock(lazy[i], rinfo[busy[i]].doms[di]);
        }
        BuildRankInfo(rinfo);
        if (printStuff) rankInfo::printIt(rinfo);
        
        cnt++;
        if (cnt > 5)
            break;
    }
}

// Block-centric rebalancing.
//
// Runs at most 101 rounds. Each round recomputes the rank and domain tables,
// asks computeDomainMcNeedy how many extra copies each domain wants, sorts
// the domains that want at least one by descending need, and hands the i-th
// neediest domain to the i-th lazy rank (as reported by rankInfo::getThresh
// against the perfect share 1/nProcs). Stops when no domain needs duplication
// or no rank is lazy.
void
avtAChowderICAlgorithm::DoBlockCentricBalancing()
{
    vector<domInfo> dinfo;
    vector<rankInfo> rinfo;
    BuildRankInfo(rinfo);
    BuildDomainInfo(dinfo);

    if (printStuff) domInfo::printIt(dinfo);
    float perfectBal = 1.0/(float)nProcs;
    float lazyThresh = perfectBal * 1.0;

    for (int round = 0; round <= 100; round++)
    {
        BuildRankInfo(rinfo);
        BuildDomainInfo(dinfo);

        // Keep only the domains that still want more copies.
        vector<int> blockDup = computeDomainMcNeedy(dinfo, blockAssignments);
        vector<pair<int, int> > candidateBlocks;
        for (size_t dom = 0; dom < blockDup.size(); dom++)
        {
            if (blockDup[dom] > 0)
                candidateBlocks.push_back(pair<int,int>(dom, blockDup[dom]));
        }

        //sort them so that blocks needing most duplication will be first.
        sort(candidateBlocks.begin(), candidateBlocks.end(), biggestDup);

        if(printRank0Stuff) cout<<"blocks to dup= "<<candidateBlocks<<endl;

        vector<int> lazy = rankInfo::getThresh(rinfo, lazyThresh, false, true);
        if (candidateBlocks.empty() || lazy.empty())
            break;

        const int numPairs = min(lazy.size(), candidateBlocks.size());
        for (int k = 0; k < numPairs; k++)
            AssignBlock(lazy[k], candidateBlocks[k].first);

        BuildRankInfo(rinfo);
        BuildDomainInfo(dinfo);
    }

    if (printStuff) domInfo::printIt(dinfo);
    if (printStuff) rankInfo::printIt(rinfo);
}

// Entry point for rebalancing: runs the active balancing strategy
// (DoRankCentricBalancing2) and, when rank-0 printing is enabled, reports the
// resulting block-to-rank assignments and per-rank balance.
void
avtAChowderICAlgorithm::UpdateBlockAssignments()
{
    vector<domInfo> dinfo;
    vector<rankInfo> rinfo;

    BuildDomainInfo(dinfo);
    BuildRankInfo(rinfo);

    /*
    sort(dinfo.begin(), dinfo.end(), domInfo::rcmp);
    sort(rinfo.begin(), rinfo.end(), rankInfo::cmp);
    rankInfo::printIt(rinfo);
    domInfo::printIt(dinfo);
    */

    if (printRank0Stuff) cout<<"BEGIN"<<endl;
    if (printStuff) rankInfo::printIt(rinfo);

    // Alternative strategies kept for experimentation:
    //DoRankCentricBalancing();
    //DoBlockCentricBalancing();
    DoRankCentricBalancing2();

    if (printRank0Stuff) cout<<"REBALANCE"<<endl;
    BuildRankInfo(rinfo);
    if (printStuff) rankInfo::printIt(rinfo);

    vector<float> balance;
    ComputeBalance(balance);

    // Everything below is reporting only.
    if (!printRank0Stuff)
        return;

    cout<<"Block to Rank Assignments: "<<endl;
    for (int dom = 0; dom < numDomains; dom++)
        cout<<"block_"<<dom<<" :ranks: "<<blockAssignments[dom]<<endl;

    cout<<"Rank Balance: "<<endl;
    float perfectBal = 1.0f/(float)nProcs;
    for (int proc = 0; proc < nProcs; proc++)
        cout<<proc<<" "<<balance[proc]<<" <"<<balance[proc]/perfectBal<<">"<<endl;
}

// Computes each rank's share of the total predicted work.
//
// A rank's raw load is the sum of blockPopularity over every domain that
// lists it in blockAssignments (a duplicated domain counts fully for each
// holder). The loads are then normalized by their total so they sum to 1;
// when the total is not positive the raw (zero) loads are left as they are.
//
// Arguments:
//   balance : output; overwritten with nProcs entries, indexed by rank.
void
avtAChowderICAlgorithm::ComputeBalance(vector<float> &balance)
{
    // assign(), not resize(): resize() only initializes newly added elements,
    // so a vector that already held values would keep accumulating onto them.
    balance.assign(nProcs, 0.0f);

    // Single pass over the assignments. For any given rank the contributions
    // are added in the same (domain, slot) order as a per-rank scan would use.
    for (int j = 0; j < numDomains; j++)
    {
        const vector<int> &owners = blockAssignments[j];
        for (size_t k = 0; k < owners.size(); k++)
        {
            const int r = owners[k];
            if (r >= 0 && r < nProcs)
                balance[r] += blockPopularity[j];
        }
    }

    float bt = 0.0;
    for (int i = 0; i < nProcs; i++)
        bt += balance[i];

    if (bt > 0.0)
    {
        for (int i = 0; i < nProcs; i++)
            balance[i] /= bt;
    }
}

void
avtAChowderICAlgorithm::AssignBlock(int r, int b)
{
    ldBlkSW.start();

    vector<int>::iterator it;
    it = find(blockAssignments[b].begin(), blockAssignments[b].end(), r);
    if (it == blockAssignments[b].end())
    {
        blockAssignments[b].push_back(r);
    }

    //if (rank==0) cout<<"RANK "<<r<<" loads "<<b<<" ************************************************"<<endl;

    //If it's me, then mark the block for lazy-loading.
    if (r == rank)
    {
        lazyLoadBlocks.insert(b);
        //cout<<rank<<" WILL LAZY LOAD DOM= "<<b<<" *************************************************"<<endl;
        /*
        avtVector pt;
        BlockIDType blk(b,0);
        GetDomain(blk, pt);
        numBlocksDuplicated++;
        */
    }
    
    ldBlkSW.stop();
}

// Returns a pseudo-random float in [0, 1] drawn from rand().
static float random_1()
{
    const float draw = (float)rand();
    return draw/(float)RAND_MAX;
}

// Returns a pseudo-random index in [0, sz) drawn from rand().
//
// Arguments:
//   sz : number of candidates to choose among.
//
// Returns: 0 when sz <= 1 (there is at most one possible index; for an empty
// range the caller must still check emptiness before dereferencing),
// otherwise an approximately uniformly distributed index below sz.
static int randomIndex(int sz)
{
    if (sz <= 1)
        return 0;

    // rand()/(RAND_MAX+1) lies in [0,1), so the scaled value lies in [0,sz).
    // This replaces building and shuffling a whole index vector just to read
    // its first element.
    const double unit = (double)rand() / ((double)RAND_MAX + 1.0);
    return (int)(unit * (double)sz);
}

// Picks, at random, one of the ranks currently assigned to the block's
// domain and returns it as the recipient.
int
avtAChowderICAlgorithm::DomainToRank2(BlockIDType &blk)
{
    const vector<int> &owners = blockAssignments[blk.domain];

    //Pick a random recipient among the holders of this domain.
    const int pick = randomIndex(owners.size());
    return owners[pick];
}

// Walks an nx*ny*nz array (k fastest, i slowest) and assigns every element
// to itself.
// NOTE(review): as written this leaves the array contents unchanged; it looks
// like a placeholder -- confirm the intended right-hand side.
static void
setArray(float *ptr, int nx, int ny, int nz)
{
    for (int i = 0; i < nx; i++)
    {
        for (int j = 0; j < ny; j++)
        {
            const int rowStart = i*ny*nz + j*nz;
            for (int k = 0; k < nz; k++)
                ptr[rowStart + k] = ptr[rowStart + k];
        }
    }
}

// Creates a named float array with n tuples and attaches it to the grid's
// cell data.
//
// Arguments:
//   nm : array name.
//   n  : number of tuples to allocate.
//   rg : grid whose cell data receives the array.
//
// Returns: the array pointer. The local reference is released before
// returning, so the pointer is presumably kept alive by the grid's cell data
// (standard VTK reference counting -- confirm); callers must not Delete() it.
static vtkFloatArray *
mkArr(const char *nm, int n, vtkRectilinearGrid *rg)
{
    vtkFloatArray *cellArr = vtkFloatArray::New();
    cellArr->SetNumberOfTuples(n);
    cellArr->SetName(nm);

    rg->GetCellData()->AddArray(cellArr);
    cellArr->Delete();

    return cellArr;
}

void
avtAChowderICAlgorithm::DumpBlockStatsData(const vector<int> &actualIterations)
{
    if (rank != 0)
        return;
    
    int nx, ny, nz;

    //assume regular....
    if (1)
    {
        float v = pow((float)numDomains, 0.333333);
        nx = (int)v + 1;
        ny = (int)v + 1;
        nz = (int)v + 1;
    }
    else
    {
        nx = 4;
        ny = 2;
        nz = 4;
    }
    cout<<"Nxyz= "<<nx<<" "<<ny<<" "<<nz<<" numDomains= "<<numDomains<<endl;


    int res[3] = {nx+1, ny+1, nz+1};
    vtkRectilinearGrid *rg = vtkRectilinearGrid::New();
    rg->SetDimensions(res);
    
    vtkFloatArray *x = vtkFloatArray::New();
    vtkFloatArray *y = vtkFloatArray::New();
    vtkFloatArray *z = vtkFloatArray::New();

    x->SetNumberOfTuples(res[0]);
    y->SetNumberOfTuples(res[1]);
    z->SetNumberOfTuples(res[2]);

    float x0 = 0.0, x1 = 1.0;
    float y0 = 0.0, y1 = 1.0;
    float z0 = 0.0, z1 = 1.0;
    float dx = (x1-x0) / (float)(res[0]-1);
    float dy = (y1-y0) / (float)(res[1]-1);
    float dz = (z1-z0) / (float)(res[2]-1);
    
    for (int i = 0; i < res[0]; i++)
        x->SetTuple1(i, x0+i*dx);
    x->SetTuple1(res[0]-1, x1);
    for (int i = 0; i < res[1]; i++)
        y->SetTuple1(i, y0+i*dy);
    y->SetTuple1(res[1]-1, y1);
    for (int i = 0; i < res[2]; i++)
        z->SetTuple1(i, z0+i*dz);
    z->SetTuple1(res[2]-1, z1);

    rg->SetXCoordinates(x);
    rg->SetYCoordinates(y);
    rg->SetZCoordinates(z);

    vtkFloatArray *doms = mkArr("doms", numDomains, rg);
    vtkFloatArray *predicted = mkArr("predicted", numDomains, rg);
    //vtkFloatArray *predicted2 = mkArr("predicted2", numDomains, rg);
    //vtkFloatArray *diff_predicted = mkArr("diff_predicted", numDomains, rg);
    vtkFloatArray *actual = mkArr("actual", numDomains, rg);
    vtkFloatArray *diff = mkArr("diff", numDomains, rg);
    //vtkFloatArray *diff2 = mkArr("diff2", numDomains, rg);
    //vtkFloatArray *diffN = mkArr("diffN", numDomains, rg);
    vtkFloatArray *abs_diff = mkArr("abs_diff", numDomains, rg);
    //vtkFloatArray *abs_diff2 = mkArr("abs_diff2", numDomains, rg);
    vtkFloatArray *dom_dup = mkArr("dom_duplication", numDomains, rg);

    int d = 0;
    for (int i = 0; i < nx; i++)
        for (int j = 0; j < ny; j++)
            for (int k = 0; k < nz; k++)
            {
                int idx = k*nx*ny + j*nz + i;
                doms->SetTuple1(idx, d);
                predicted->SetTuple1(idx, blockPopularity[d]);
                //predicted2->SetTuple1(idx, blockPopularity2[d]);
                //diff_predicted->SetTuple1(idx, blockPopularity2[d]-blockPopularity[d]);
                actual->SetTuple1(idx, actualIterations[d]);
                diff->SetTuple1(idx, actualIterations[d]-blockPopularity[d]);
                //diff2->SetTuple1(idx, actualIterations[d]-blockPopularity2[d]);
                //diffN->SetTuple1(idx, (actualIterations[d]-blockPopularity[d])/blockPopularity[d]);
                abs_diff->SetTuple1(idx, fabs(actualIterations[d]-blockPopularity[d]));
                //abs_diff2->SetTuple1(idx, fabs(actualIterations[d]-blockPopularity2[d]));
                dom_dup->SetTuple1(idx, blockAssignments[d].size());

                d++;
            }

    vtkDataSetWriter *writer = vtkDataSetWriter::New();
    writer->SetFileName("sl_dump.vtk");
    writer->SetInputData(rg);
    writer->Update();
    writer->Write();


    x->Delete();
    y->Delete();
    z->Delete();

    rg->Delete();
    writer->Delete();
}

// Identifies a domain and, optionally, a sub-block within it. Ordered
// lexicographically by (dom, sub) so it can serve as an ordered-container key.
class domainID
{
public:
    // Whole-domain id; sub is set to -1.
    domainID(int d) : dom(d), sub(-1) {}
    domainID(int d, int s) : dom(d), sub(s) {}

    int dom, sub;
    string nm;

    // Compare by domain first, then by sub-block. nm does not participate.
    bool operator< (const domainID &x) const
    {
        if (dom != x.dom)
            return dom < x.dom;
        return sub < x.sub;
    }
};

//    bool operator() (const domainID &x, const domainID &y) {return false;}
//    bool operator== ( const domainID &y) {return false;}
//    bool operator< (domainID &y) {return false;}    

/*
int test()
{
    //typedef std::pair<int,int> domainID;
    std::map<std::pair<domainID,domainID>, int> mappy1;

    domainID src(3), dst(4);
    
    std::pair<domainID, domainID> p(src, dst);
    mappy1[p] = 3324;

    std::map<std::pair<int,int>, int> mappy2;

    mappy2[make_pair(3,4)] = 93;
}
*/

// Creates a forward-integrating test streamline seeded at p.
//
// The curve records sample positions only, is limited to maxTestSteps steps,
// has distance and time termination disabled, and uses the filter's solver.
//
// Arguments:
//   p : seed location.
//
// Returns: a heap-allocated curve; the caller owns it and must delete it.
avtStreamlineIC *
avtAChowderICAlgorithm::makeIC(const avtVector &p)
{
    unsigned char attr = avtStateRecorderIntegralCurve::SAMPLE_POSITION;
    avtVector dir(0,0,0);
    // Start time handed to the curve. It was previously passed uninitialized;
    // 0.0 is used as the start time for test particles.
    // NOTE(review): confirm 0.0 is the right start time for time-varying data.
    double t = 0.0;

    avtStreamlineIC *s = new avtStreamlineIC(maxTestSteps, false, 0.0, false, 0.0, attr,
                                             picsFilter->solver,
                                             avtIntegralCurve::DIRECTION_FORWARD,
                                             t, p, dir, 0);
    return s;
}

// Tally for one destination block: how many test particles ended there and
// how many iterations they took in total.
class nextBlock
{
public:
    nextBlock() : cnt(0), numIters(0) {}

    // Record one particle that took i iterations.
    void visit(int i)
    {
        ++cnt;
        numIters += i;
    }

    int cnt, numIters;
};

// Appends nPts pseudo-random points drawn inside the bounding box of leaf s
// of domain d.
//
// Arguments:
//   d    : domain index into blockInfo.
//   s    : leaf index within that domain.
//   nPts : number of points to generate.
//   pts  : output; the new points are appended.
void
avtAChowderICAlgorithm::GenerateTestPts(int d, int s, int nPts, vector<avtVector> &pts)
{
    float bb[6];
    blockInfo[d]->GetLeafFromIndex(s)->GetBBox(bb);

    const float dx=bb[1]-bb[0], dy=bb[3]-bb[2], dz=bb[5]-bb[4];

    for (int i = 0; i < nPts; i++)
    {
        // Draw the coordinates in separate statements: the evaluation order
        // of function arguments is unspecified, so drawing them inside the
        // constructor call would make the x/y/z sequence compiler dependent.
        const float x = bb[0] + random_1()*dx;
        const float y = bb[2] + random_1()*dy;
        const float z = bb[4] + random_1()*dz;
        pts.push_back(avtVector(x, y, z));
    }
}

void
avtAChowderICAlgorithm::GenerateTestPts(int d, vector<avtIntegralCurve *> &ics, vector<avtVector> &pts)
{
    for (int i = 0; i < ics.size(); i++)
        if (ics[i]->blockList.front().domain == d)
            pts.push_back(ics[i]->CurrentLocation());
}

void
avtAChowderICAlgorithm::RunTestPts(int d, int s, vector<avtVector> &pts, int **blockData)
{
    DomainBlock *blk = blockInfo[d]->GetLeafFromIndex(s);
    int start = blk->gid;
    
    StopWatch advT;
    map<int, nextBlock*> destinations;
    int num = 0, sz = pts.size();
    for (int i = 0; i < sz; i++)
    {
        avtStreamlineIC *s = makeIC(pts[i]);

        //See where they go...
        advT.start();
        int iters = AdvectParticle(s);
        advT.stop();
        int end = start;

        if (iters > 0)
        {
            if (!s->blockList.empty())
            {
                DomainBlock *dst = blockInfo[s->blockList.front().domain]->GetLeaf(s->CurrentLocation());
                end = dst->gid;
            }
            //Terminated/left mesh.
            //leave end the same, max out iters.
            else
            {
                //iters = maxTestSteps;
            }
        }
        delete s;
        if (iters == 0)
            continue;

        nextBlock *n = NULL;
        map<int, nextBlock*>::iterator it = destinations.find(end);
        if (it == destinations.end())
        {
            n = new nextBlock;
            destinations[end] = n;
        }
        else
            n = it->second;

        n->visit(iters);
        numTestParticlesSteps += iters;
        num++;
    }
    
    ADVECT_TIME += advT.t;
    
    map<int, nextBlock*>::iterator it;
    int index = 0;
    for (it = destinations.begin(); it != destinations.end(); it++)
    {
        nextBlock *n = it->second;

        // dst, totalNumICs, numICsToDst, totalNumICs, totalSteps,
        blockData[start][index++] = it->first;
        blockData[start][index++] = n->cnt;
        blockData[start][index++] = n->numIters;
        blockData[start][index++] = num;
        //if (rank == 0)cout<<start<<": --> "<<it->first<<" "<<num<<" index= "<<index<<endl;
        delete n;
    }
    if (index >= NVALS)
    {
        char msg[512];
        sprintf(msg, "MEMORY overflow in blockData. Increase size of NVALS. index=%d NVALS=%d\n", index, NVALS);
        EXCEPTION1(VisItException, msg);
    }
    destinations.clear();
}

// Samples the flow with test particles, predicts per-domain workload and
// rebalances the block assignments.
//
// Steps:
//   1. Build the per-domain block (leaf) structure.
//   2. On the owning rank of each domain, generate test seeds per leaf and
//      advect them (RunTestPts), producing a row of destination statistics
//      per leaf.
//   3. Combine all rows across ranks with an MPI_MAX all-reduce (unfilled
//      entries are -1) and push them into the block structure.
//   4. Compute blockPopularity with the configured method.
//   5. Derive average LOAD_TIME (per domain) and ADVECT_TIME (per step) and
//      call UpdateBlockAssignments.
//
// Arguments:
//   ics : the real seed curves; used for sampling and popularity estimation.
void
avtAChowderICAlgorithm::BalanceWorkload(vector<avtIntegralCurve *> &ics)
{
    DomainBlock::CreateBlockInfo(blockInfo, numDomains, picsFilter->intervalTree,
                                 subdivUniform,
                                 subdivNX, subdivNY, subdivNZ, subdivPct);

    int totNumLeafs = DomainBlock::TotalNumLeaves(blockInfo);
    int **blockData = new int*[totNumLeafs];
    for (int i = 0; i < totNumLeafs; i++)
    {
        blockData[i] = new int[NVALS];
        for (int j = 0; j < NVALS; j++)
            blockData[i][j] = -1;
    }
    // Flat copy of blockData used as the all-reduce buffer.
    int bdSz = totNumLeafs*NVALS;
    int *gblBlockData = new int[bdSz];

    for (int i = 0; i < bdSz; i++) gblBlockData[i] = -1;

    tstPtsSW.start();
    //Generate test seeds.
    for (int i = 0; i < numDomains; i++)
    {
        BlockIDType d(i,0);
        int owner = DomainToRank(d);
        int numLeafs = blockInfo[i]->NumLeafs();

        if (owner == rank)
        {
            for (int j = 0; j < numLeafs; j++)
            {
                vector<avtVector> pts;
                if (!subdivUniform && j == 0)
                {
                    //put more samples in the interior block.
                    //DRP... BUG. Too many points causes problems... Not sure why.
                    //GenerateTestPts(i, j, 1*numTestSeeds, pts);
                    GenerateTestPts(i, ics, pts);
                    if (pts.size() > numTestSeeds)
                    {
                        random_shuffle(pts.begin(), pts.end());
                        pts.resize(numTestSeeds);
                    }
                }
                else
                    GenerateTestPts(i, j, numTestSeeds, pts);

                RunTestPts(i, j, pts, blockData);

                int idx = i*numLeafs*NVALS + (j*NVALS);
                for (int k = 0; k < NVALS; k++)
                    gblBlockData[idx+k] = blockData[i*numLeafs+j][k];
            }
        }

        /*
        //Share results....
        for (int k = 0; k < numLeafs; k++)
            MPI_Bcast(&(blockData[i*numLeafs+k][0]), NVALS, MPI_INT, owner, VISIT_MPI_COMM);
        */

    }

    // Rows not owned by a rank are all -1 there, so MAX picks the owner's data.
    MPI_Allreduce(MPI_IN_PLACE, gblBlockData, bdSz, MPI_INT, MPI_MAX, VISIT_MPI_COMM);
    //Copy values back into structure.
    for (int i = 0; i < numDomains; i++)
    {
        int numLeafs = blockInfo[i]->NumLeafs();
        for (int j = 0; j < numLeafs; j++)
        {
            for (int k = 0; k < NVALS; k++)
                blockData[i*numLeafs+j][k] = gblBlockData[i*numLeafs*NVALS + (j*NVALS) + k];
        }
    }
    tstPtsSW.stop();

    // The flat buffer is not needed past this point; release it.
    delete [] gblBlockData;
    gblBlockData = NULL;

    //Push blockData into blockInfo.
    for (int i = 0; i < numDomains; i++)
    {
        int numLeafs = blockInfo[i]->NumLeafs();
        for (int j = 0; j < numLeafs; j++)
        {
            DomainBlock *blk = blockInfo[i]->GetLeafFromIndex(j);
            int id = blk->gid;
            // Rows hold 4-int records and are terminated by a negative id.
            for (int k = 0; k < NVALS; k +=4)
            {
                if (blockData[id][k] < 0)
                    break;
                //id, numICs, numIters, totalNumICs.
                DomainBlock *dstBlk = DomainBlock::GetBlockFromGID(blockInfo, blockData[id][k]);
                blk->AddBlockData(dstBlk,
                                  blockData[id][k+1],
                                  blockData[id][k+2],
                                  blockData[id][k+3]);
            }
        }
        blockInfo[i]->UnifyData();
    }

    probSW.start();
    if (popMethod == PROB_TREE)
        ComputeBlockPopProbTree(ics);
    else if (popMethod == RANDOM_WALK)
        ComputeBlockPopRandomWalk(ics);
    probSW.stop();

    if (printRank0Stuff)
    {
        cout<<"Block Popularity"<<endl;
        float t = 0.0f;
        for(int i = 0; i < numDomains; i++)
            t += blockPopularity[i];

        // Sized generously: "%.1f" of a large float can need dozens of digits.
        char str[128];
        for(int i = 0; i < numDomains; i++)
        {
            sprintf(str, "%3d: (%.3f) %.1f", i, blockPopularity[i]/t, blockPopularity[i]);
            cout<<str<<endl;
        }
    }

    //Compute some stats, and do the balancing...
    SumIntAcrossAllProcessors(numTestParticlesSteps);
    SumFloatAcrossAllProcessors(ADVECT_TIME);
    LOAD_TIME = picsFilter->timeForAllInitialIO;
    SumFloatAcrossAllProcessors(LOAD_TIME);
    LOAD_TIME /= (float)numDomains;
    ADVECT_TIME /= (float)numTestParticlesSteps;

    //Time exchanging data for load via network.
    //TODO...

    //This is what I was testing with...
    /*
    LOAD_TIME = 10.0;
    ADVECT_TIME = 0.01;
    */

    upBlkSW.start();
    UpdateBlockAssignments();
    upBlkSW.stop();

    //DumpPythonCode(ics);

    for (int i = 0; i < totNumLeafs; i++)
        delete [] blockData[i];
    delete [] blockData;
}

// Estimates blockPopularity by random walks over the block transition data.
//
// At most 25000 curves are used (a random subset when there are more). The
// curves are split round-robin across ranks. For each curve five walks are
// started in the leaf containing its current location; each walk repeatedly
// picks a transition at random (weighted via GetDataIdxFromPct), charges the
// transition's average iteration count to the current block's domain and
// moves on, until the step budget (the first curve's maxSteps) is used up,
// no transition is available, or the walk stays in the same domain (a sink).
// The per-rank sums are combined with an all-reduce.
//
// Arguments:
//   ics_ : the seed curves.
void
avtAChowderICAlgorithm::ComputeBlockPopRandomWalk(vector<avtIntegralCurve *> &ics_)
{
    vector<avtIntegralCurve *> ics;
    int numICs = ics_.size();
    int maxNum = 25000;
    if (numICs > maxNum)
    {
        // Too many curves: keep a random subset of maxNum.
        vector<int> idx(numICs);
        for (int i = 0; i < numICs; i++)
            idx[i] = i;
        random_shuffle(idx.begin(), idx.end());
        ics.resize(maxNum);
        for (int i = 0; i < maxNum; i++)
            ics[i] = ics_[idx[i]];
    }
    else
        ics = ics_;

    // With no curves there is nothing to walk, but the collective below must
    // still be entered, so do not dereference ics[0] and do not return early.
    float maxSteps = 0.0f;
    if (!ics.empty())
        maxSteps = (float)((avtStreamlineIC *)ics[0])->maxSteps;

    const int numSamples = ics.size();
    const int walksPerCurve = 5;
    for (int i = rank; i < numSamples; i += nProcs)
    {
        avtIntegralCurve *ic = ics[i];
        DomainBlock *blk0 = DomainBlock::GetLeaf(blockInfo, ic->CurrentLocation());
        // NOTE(review): assumes GetLeaf may return NULL for a location
        // outside every block, as the per-domain GetLeaf does -- confirm.
        if (blk0 == NULL)
            continue;

        for (int w = 0; w < walksPerCurve; w++)
        {
            DomainBlock *blk = blk0;
            float s = maxSteps;
            int domPrev = blk->dom;
            while (s > 0.0f)
            {
                int idx = blk->GetDataIdxFromPct(random_1());
                if (idx < 0)
                    break;

                float stepsTaken = blk->data[idx].avgIt;
                blockPopularity[blk->dom] += stepsTaken;
                s -= stepsTaken;
                blk = blk->data[idx].blk;
                //If we stay in the same domain, then we hit a sink.
                if (blk->dom == domPrev)
                    break;
                domPrev = blk->dom;
            }
        }
    }

    MPI_Allreduce(MPI_IN_PLACE, &(blockPopularity[0]), numDomains, MPI_FLOAT, MPI_SUM, VISIT_MPI_COMM);
}

// Estimates blockPopularity by expanding a probability tree over the block
// transition data.
//
// At most 25000 curves are used (a random subset when there are more), split
// round-robin across ranks. Each curve contributes a root node
// (leaf gid, probability 1, remaining steps = first curve's maxSteps). Nodes
// are processed first-in first-out: a node adds probability * expected
// iterations of its leaf to the leaf's domain, and, while the remaining step
// budget is at least the leaf's expected iterations, spawns one child per
// outgoing transition whose cumulative probability is >= minProbability.
// The per-rank sums are combined with an all-reduce.
//
// Leaf gids are assumed to be domain*numLeafs + leafIndex with the same leaf
// count in every domain (taken from domain 0).
//
// Arguments:
//   ics_ : the seed curves.
void
avtAChowderICAlgorithm::ComputeBlockPopProbTree(vector<avtIntegralCurve *> &ics_)
{
    vector<avtIntegralCurve *> ics;
    int numICs = ics_.size();
    int maxNum = 25000;
    if (numICs > maxNum)
    {
        // Too many curves: keep a random subset of maxNum.
        vector<int> idx(numICs);
        for (int i = 0; i < numICs; i++)
            idx[i] = i;
        random_shuffle(idx.begin(), idx.end());
        ics.resize(maxNum);
        for (int i = 0; i < maxNum; i++)
            ics[i] = ics_[idx[i]];
    }
    else
        ics = ics_;

    // Expected iterations spent in each leaf: sum over transitions of
    // probability * average iterations.
    int numLeafs = blockInfo[0]->NumLeafs();
    vector<float> expectedIterations(numDomains*numLeafs);

    for (int i = 0; i < numDomains; i++)
        for (int j = 0; j < numLeafs; j++)
        {
            expectedIterations[i*numLeafs+j] = 0.0f;
            DomainBlock *blk = blockInfo[i]->GetLeafFromIndex(j);
            for (size_t k = 0; k < blk->data.size(); k++)
                expectedIterations[i*numLeafs+j] += (blk->data[k].pct*blk->data[k].avgIt);
        }

    // (leaf gid, (probability of reaching it, remaining step budget))
    vector<pair<int, pair<float, float> > > nodes;

    // With no curves there are no roots, but the collective below must still
    // be entered, so do not dereference ics[0] and do not return early.
    int numStepsToUse = 0;
    if (!ics.empty())
        numStepsToUse = ((avtStreamlineIC *)ics[0])->maxSteps;

    int n = ics.size();
    for (int i = rank; i < n; i += nProcs)
    {
        avtStreamlineIC *ic = (avtStreamlineIC *)ics[i];
        if(ic->blockList.empty())
            continue;

        avtVector p = ic->CurrentLocation();
        BlockIDType b = ic->blockList.front();
        DomainBlock *blk = blockInfo[b.domain]->GetLeaf(p);
        // GetLeaf can come back empty when the point is in no leaf.
        if (blk == NULL)
            continue;

        int sd = blk->gid;
        float ri = numStepsToUse;
        nodes.push_back(make_pair(sd, make_pair(1.0f, ri)));
    }

    //find global expected iterations
    // 'head' is the front of the FIFO. Popping by advancing a cursor instead
    // of erasing element 0 avoids shifting the whole vector on every pop.
    size_t head = 0;
    while (head < nodes.size())
    {
        // Copy the node out before any push_back can reallocate the vector.
        const int sd = nodes[head].first;
        const float sp = nodes[head].second.first;
        const float ri = nodes[head].second.second;
        head++;

        // Reclaim the consumed prefix once it is at least half the buffer.
        if (head >= 1024 && head*2 >= nodes.size())
        {
            nodes.erase(nodes.begin(), nodes.begin() + head);
            head = 0;
        }

        // if we haven't advected enough, let's add next level to tree
        if (expectedIterations[sd] <= ri)
        {
            DomainBlock *blk = DomainBlock::GetBlockFromGID(blockInfo, sd);
            for (size_t i = 0; i < blk->data.size(); i++)
            {
                int nd = blk->data[i].blk->gid;
                float pd = blk->data[i].pct;

                if (sp*pd >= minProbability)
                    nodes.push_back(make_pair(nd, make_pair(sp*pd, ri-expectedIterations[sd])));
            }
        }

        blockPopularity[sd/numLeafs] += sp*expectedIterations[sd];
    }

    MPI_Allreduce(MPI_IN_PLACE, &(blockPopularity[0]), numDomains, MPI_FLOAT, MPI_SUM, VISIT_MPI_COMM);
}

// Writes the load-balance report to os (rank 0 only, after the base-class
// report): per-rank advection and I/O time with their fractions of the total,
// the domains held by each rank, per-rank integration steps, and -- when
// balancing is enabled -- predicted versus actual steps per domain, the
// duplicated domains and the average load/step times.
//
// The per-domain "err" column is predicted/actual when over-predicting
// (999 if actual is 0), -actual/predicted when under-predicting (-999 if
// predicted is 0) and 0 when the truncated prediction equals the actual.
//
// NOTE(review): the final DomainBlock::Dump goes to cout, not os.
void
avtAChowderICAlgorithm::ReportStatistics(ostream &os)
{
    avtParICAlgorithm::ReportStatistics(os);

    if (rank != 0)
        return;

    os<<"Balance Report: *******************************************"<<endl;
    char line[128];

    // ---- Timing per rank ----
    float ta = 0.0, tio = 0.0;
    for (int r = 0; r < nProcs; r++)
    {
        ta += allAdvectTime[r];
        tio += allIOTime[r];
    }
    os<<"Rank  T_a   T_IO   (% total)"<<endl;
    for (int r = 0; r < nProcs; r++)
    {
        sprintf(line, "R_%02d %6.4f %6.4f (%4.2f %4.2f)", r,
                allAdvectTime[r], allIOTime[r],
                allAdvectTime[r]/ta, allIOTime[r]/tio);
        os<<line<<endl;
    }
    os<<endl;

    // ---- Domains held by each rank ----
    os<<"Block Assignments: R_i, {d0, d1, ...}"<<endl;
    for (int r = 0; r < nProcs; r++)
    {
        os<<"R_"<<r<<": {";
        for (int dom = 0; dom < numDomains; dom++)
        {
            for (size_t k = 0; k < blockAssignments[dom].size(); k++)
            {
                if (blockAssignments[dom][k] == r)
                    os<<dom<<" ";
            }
        }
        os<<"}"<<endl;
    }
    os<<endl;

    // ---- Integration steps per rank ----
    int tot = 0;
    for (int r = 0; r < nProcs; r++)
        tot += allRankIntegrateSteps[r];
    os<<"Rank   #Steps  %Steps"<<endl;
    for (int r = 0; r < nProcs; r++)
    {
        sprintf(line, "R_%03d: %6d  %4.2f", r,
                allRankIntegrateSteps[r],
                (float)allRankIntegrateSteps[r]/(float)tot);
        os<<line<<endl;
    }
    os<<endl;

    if (doBalance)
    {
        // ---- Predicted vs. actual steps per domain ----
        int predTotal = 0, actTotal = 0;
        for (int dom = 0; dom < numDomains; dom++)
        {
            predTotal += (int)(blockPopularity[dom] + 0.5);
            actTotal += allDomIntegrateSteps[dom];
        }
        os<<"Dom     Pred  (%)    Act  (%)   :    Diff   err (<0 under predict)"<<endl;
        for (int dom = 0; dom < numDomains; dom++)
        {
            const float pred = blockPopularity[dom];
            const int act = allDomIntegrateSteps[dom];
            const float diff = pred - act;

            float err;
            if (pred > (float)act) //over predict
            {
                if (act > 0)
                    err = pred / (float)act;
                else
                    err = 999.0f;
            }
            else if ((int)pred == act) //Spot on!
                err = 0.0f;
            else if (pred > 0)
                err = - (float)act / pred;
            else
                err = -999.0f;

            sprintf(line, "D_%02d: %7d %3.2f %7d %3.2f : %8d %7.3f",
                    dom,
                    (int)pred, pred/(float)predTotal,
                    act, act/(float)actTotal,
                    (int)diff,  err);
            os<<line<<endl;
        }
        os<<endl;

        os<<"Domain Duplication:"<<endl;
        for (int dom = 0; dom < numDomains; dom++)
        {
            if (blockAssignments[dom].size() > 1)
                os<<dom<<" "<<blockAssignments[dom].size()<<" "<<blockAssignments[dom]<<endl;
        }
        os<<endl;
        os<<"AvgTime (load, step) ("<<LOAD_TIME<<", "<<ADVECT_TIME<<") load/step= "<<LOAD_TIME/ADVECT_TIME<<endl;
        os<<endl;
    }

    // Only rank 0 reaches this point (see the early return above).
    DomainBlock::Dump(blockInfo, cout, 0);
}

// Writes "pyCode.py" (rank 0 only): a numpy script that rebuilds the leaf
// transition matrix A, the seed-count vector S and a gid-to-domain map, then
// propagates S through repeated powers of A and sums the result per domain.
// Intended as an offline cross-check of the popularity prediction.
//
// If the file cannot be opened a message is printed and nothing is written.
//
// Arguments:
//   ics : seed curves; their current locations are binned into leaves.
void
avtAChowderICAlgorithm::DumpPythonCode(std::vector<avtIntegralCurve *> &ics)
{
    if (rank != 0)
        return;

    int totNumLeafs = DomainBlock::TotalNumLeaves(blockInfo);

    FILE *fp = fopen("pyCode.py", "w");
    // fopen returns NULL on failure; writing through it would crash.
    if (fp == NULL)
    {
        cout<<"DumpPythonCode: unable to open pyCode.py for writing."<<endl;
        return;
    }

    fprintf(fp, "import numpy as np\n");
    fprintf(fp, "np.set_printoptions(precision=2)\n");
    fprintf(fp, "nIters = 10\n");
    fprintf(fp, "A = np.zeros(shape=(%d,%d))\n", totNumLeafs, totNumLeafs);

    // Transition probabilities: A[src][dst] = pct.
    for (int i = 0; i < totNumLeafs; i++)
    {
        DomainBlock *blk = DomainBlock::GetBlockFromGID(blockInfo, i);
        for (size_t j = 0; j < blk->data.size(); j++)
        {
            fprintf(fp, "A[%d][%d] = %f\n", i, blk->data[j].blk->gid, blk->data[j].pct);
        }
    }
    fprintf(fp, "\n");
    //create GID to domain map.
    fprintf(fp, "blkMap = [-1]*%d\n", totNumLeafs);
    for (int i = 0; i < totNumLeafs; i++)
    {
        DomainBlock *blk = DomainBlock::GetBlockFromGID(blockInfo, i);
        fprintf(fp, "blkMap[%d] = %d\n", i, blk->dom);
    }
    fprintf(fp, "\n");

    // Seed counts per leaf; a seed is counted in every domain whose leaf
    // lookup succeeds for its location.
    fprintf(fp, "S = np.zeros(shape=(1,%d))\n", totNumLeafs);
    vector<int> seedCnt(totNumLeafs, 0);
    for (size_t i = 0; i < ics.size(); i++)
    {
        avtVector p = ics[i]->CurrentLocation();
        for (size_t j = 0; j < blockInfo.size(); j++)
        {
            DomainBlock *blk = blockInfo[j]->GetLeaf(p);
            if (blk)
                seedCnt[blk->gid]++;
        }
    }
    for (int i = 0; i < totNumLeafs; i++)
        fprintf(fp, "S[0][%d] = %d\n", i, seedCnt[i]);
    fprintf(fp, "\n\n");
    fprintf(fp, "X = np.dot(A,A)\n");
    fprintf(fp, "for i in range(nIters):\n");
    fprintf(fp, " X = np.dot(X,A)\n");
    fprintf(fp, "print X\n\n");
    fprintf(fp, "W = np.dot(S,X)\n");

    fprintf(fp, "\n\n");
    fprintf(fp, "Bal = [0]*%d\n", numDomains);
    for (int i = 0; i < totNumLeafs; i++)
    {
        fprintf(fp, "Bal[blkMap[%d]] = Bal[blkMap[%d]] + W[0][%d]\n", i, i, i);
    }
    fprintf(fp, "\n\n");
    fprintf(fp, "print Bal\n");

    fclose(fp);
}

// Collects end-of-run statistics: per-domain and per-rank integration step
// counts are sum-reduced onto rank 0, per-rank advection and I/O timings are
// summed across processors, and when doBalance is set the domain step totals
// are handed to DumpBlockStatsData(). The "#if 0" region at the bottom is
// compiled out.
void
avtAChowderICAlgorithm::DumpStats()
{
    //collect some information.
    // Root of both reductions is rank 0, so the all* step arrays are only
    // filled in there.
    MPI_Reduce(&domIntegrateSteps[0], &allDomIntegrateSteps[0], numDomains, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);
    MPI_Reduce(&rankIntegrateSteps[0], &allRankIntegrateSteps[0], nProcs, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);

    // Each rank writes only its own slot; every other entry stays 0.
    vector<float> advectTime(nProcs, 0.0f), IOTime(nProcs, 0.0f);
    advectTime[rank] = visitTimer->LookupTimer("AdvectParticle()");
    IOTime[rank] = picsFilter->timeForAllInitialIO + visitTimer->LookupTimer("GetDomain()");
    
    // Presumably an element-wise sum over all ranks, which would leave one
    // timing per rank in the all* arrays -- confirm against the helper.
    SumFloatArrayAcrossAllProcessors(&advectTime[0], &allAdvectTime[0], nProcs);
    SumFloatArrayAcrossAllProcessors(&IOTime[0], &allIOTime[0], nProcs);

    // NOTE(review): this call is not restricted to rank 0 even though the
    // MPI_Reduce above only delivers totals to rank 0 -- confirm that
    // DumpBlockStatsData copes with that.
    if (doBalance)
        DumpBlockStatsData(allDomIntegrateSteps);

// Everything from here to the matching #endif is disabled: an older
// console report of block assignments, load balance and domain load.
#if 0

    vector<int> resultsDom(numDomains);
    MPI_Reduce(&domIntegrateSteps[0], &resultsDom[0], numDomains, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);
    vector<int> resultsRank(nProcs);
    MPI_Reduce(&rankIntegrateSteps[0], &resultsRank[0], nProcs, MPI_INT, MPI_SUM, 0, VISIT_MPI_COMM);

    // Only rank 0 prints the report.
    if (rank != 0)
        return;
    
    float totalIO = visitTimer->LookupTimer("Reading dataset") + visitTimer->LookupTimer("GetDomain()");
    cout<<"TIME: ADVECT: "<<visitTimer->LookupTimer("AdvectParticle()")<<" IO: "<<totalIO<<endl;
    cout<<"Block Assignments: R_i, {d0, d1, ...}"<<endl;
    for (int i = 0; i < nProcs; i++)
    {
        cout<<i<<": {";
        // NOTE(review): the heading promises domain ids, but the rank index i
        // is what gets printed for every match -- j may have been intended.
        for (int j = 0; j < numDomains; j++)
            for (int k = 0; k < blockAssignments[j].size(); k++)
                if (blockAssignments[j][k] == i)
                    cout<<"R_"<<i<<" ";
        cout<<"}"<<endl;
    }
    cout<<"Load Balance: R, #Steps, %ofSteps"<<endl;
    float tot = 0.0;
    for (int i = 0; i < nProcs; i++)
        tot += resultsRank[i];
    // NOTE(review): the comma below makes the percentage a discarded
    // expression, so it is never written to cout; no newline is emitted either.
    for (int i = 0; i < nProcs; i++)
        cout<<i<<": "<<resultsRank[i], (float)resultsRank[i]/tot;
    
    if (doBalance)
    {
        cout<<"Avg DomLoad, Step "<<LOAD_TIME<<" "<<ADVECT_TIME<<" ratio= "<<LOAD_TIME/ADVECT_TIME<<endl;
        cout<<"Domain Load D : Pred Act : Diff"<<endl;
        for (int i = 0; i < numDomains; i++)
        {
            // Predicted (blockPopularity) minus actual (reduced step count).
            float diff = blockPopularity[i]-resultsDom[i];
            cout<<i<<": "<<blockPopularity[i]<<" "<<resultsDom[i]<<" : "<<diff<<" err: "<<diff/resultsDom[i]<<endl;
        }
        cout<<"Domain Duplication:"<<endl;
        for (int i = 0; i < numDomains; i++)
        {
            // Only domains with more than one entry in blockAssignments are listed.
            if (blockAssignments[i].size() > 1)
                cout<<i<<" "<<blockAssignments[i].size()<<" "<<blockAssignments[i]<<endl;
        }

        DumpBlockStatsData(resultsDom);
    }
#endif
}

void
avtAChowderICAlgorithm::CommTerm()
{
    if (numTerminated == 0)
        return;
    
    totalNumICs -= numTerminated;

    //Tell everyone else the good news.
    vector<int> msg(2);
    msg[0] = avtAChowderICAlgorithm::TERMINATE;
    msg[1] = numTerminated;
    for (int i = 0; i < nProcs; i++)
        if (i != rank)
            SendMsg(i, msg);
    numTermMsgs++;
    numTerminated = 0;
    //cout<<rank<<" CommTerm("<<numTerm<<") total= "<<totalNumICs<<endl;
}

// Asks other ranks for work by sending them a {REQUEST, n} message, where n
// is the number of ranks being asked in this round.
//
// Nothing is sent when stealing is switched off (maxStealIC == 0 or
// maxICReq == 0), when ranksWithMyBlocks is empty, or while earlier requests
// are still counted in numICReqPosted.
//
// maxICReq > 0 caps the number of ranks asked; a negative maxICReq means
// "ask every rank in ranksWithMyBlocks".
//
// Changes from the previous version:
//  - The old guard "numICReqPosted >= maxICReq" is gone. With a signed
//    counter it was always true for a negative maxICReq, so the "ask
//    everyone" path could never run; for positive limits it was already
//    implied by the numICReqPosted > 0 check.
//  - The candidate list is shuffled before the first n entries are taken.
//    Before, the shuffle only happened when every rank was selected, so a
//    partial selection always hit the same leading ranks.
//  - The write-only "victims" vector and its commented-out debug output
//    were removed.
void
avtAChowderICAlgorithm::CommReq()
{
    if (maxStealIC == 0 || maxICReq == 0 || ranksWithMyBlocks.empty())
        return;

    // Never stack a new round of requests on top of outstanding ones.
    if (numICReqPosted > 0)
        return;

    //Beg for some work.
    const int numCandidates = static_cast<int>(ranksWithMyBlocks.size());
    int n = maxICReq;
    if (n < 0 || n > numCandidates)
        n = numCandidates;

    // Randomise the order so that taking the first n is a random subset.
    random_shuffle(ranksWithMyBlocks.begin(), ranksWithMyBlocks.end());

    // Message layout: [0] = message type, [1] = number of ranks asked.
    vector<int> msg(2);
    msg[0] = avtAChowderICAlgorithm::REQUEST;
    msg[1] = n;

    for (int i = 0; i < n; i++)
    {
        SendMsg(ranksWithMyBlocks[i], msg);
        numICRequests++;
        numICReqPosted++;
    }
}

// Distributes the integral curves in l. Each curve is assigned a target
// rank: dstRank when one is given, otherwise (dstRank == -1) whatever
// DomainToRank2() returns for the first entry of the curve's blockList.
// Curves targeted at this rank are appended to activeICs; the rest are
// grouped by target and shipped with one SendICs() call per destination.
void
avtAChowderICAlgorithm::CommICs(list<avtIntegralCurve *> &l, int dstRank)
{
    typedef map<int, vector<avtIntegralCurve *> > OutboxMap;
    OutboxMap outbox;

    for (list<avtIntegralCurve*>::iterator cur = l.begin(); cur != l.end(); ++cur)
    {
        avtIntegralCurve *ic = *cur;

        int target = dstRank;
        if (target == -1)
            target = DomainToRank2(ic->blockList.front());

        if (target != rank)
        {
            // operator[] creates the per-destination batch on first use.
            outbox[target].push_back(ic);
        }
        else
            activeICs.push_back(ic);
    }

    // Flush the batches, in ascending destination-rank order.
    for (OutboxMap::iterator box = outbox.begin(); box != outbox.end(); ++box)
        SendICs(box->first, box->second);
}

void
avtAChowderICAlgorithm::ProcessMessages(std::vector<MsgCommData> &msgs)
{
    for (int i = 0; i < msgs.size(); i++)
    {
        int fromRank = msgs[i].rank;
        vector<int> &msg = msgs[i].message;
        int msgType = msg[0];
        
        if (msgType == avtAChowderICAlgorithm::TERMINATE)
            totalNumICs -= msg[1];
        else if (msgType == avtAChowderICAlgorithm::REQUEST)
        {
            list<avtIntegralCurve*> ics;
            set<int> reqDoms = rankToBlockMap.find(fromRank)->second;
            int msgNumReqs = msg[1];
            int maxSend = maxStealIC;
            int tot = activeICs.size() + inactiveICs.size();
            
            if (maxSend == -2)
                maxSend = activeICs.size();
            else if (maxSend == -1)
                maxSend = activeICs.size() / (msgNumReqs+1);
            
            FindICsInDoms(ics, activeICs, reqDoms, maxSend);
            FindICsInDoms(ics, inactiveICs, reqDoms, maxSend);
            if (!ics.empty())
            {
                vector<avtIntegralCurve *> icsV;
                icsV.insert(icsV.end(), ics.begin(), ics.end());
                SendICs(fromRank, icsV);
                //cout<<rank<<": "<<fromRank<<" REQUESTS. Sending "<<ics.size()<<" from "<<reqDoms<<endl;
            }
        }
    }
}

bool
avtAChowderICAlgorithm::CheckMessages()
{
    vector<MsgCommData> msgs;
    RecvMsg(msgs);
    ProcessMessages(msgs);
    return false;
}

// Moves integral curves from lIn to lOut until lOut holds maxSend entries.
//
//   lOut     receives the selected curves; entries already in it count
//            towards maxSend.
//   lIn      list to take curves from. It is consumed from the front;
//            curves that were examined but not selected are re-appended
//            behind the ones that were never examined.
//   reqDoms  a curve is selected when the domain of the first entry of its
//            blockList is in this set.
//   maxSend  upper bound on lOut.size() after the call.
//
// The limit is tested before a curve is taken from lIn. The previous
// version tested it afterwards, so a call made with lOut already at the
// limit (or with maxSend == 0) could still move one more curve.
void
avtAChowderICAlgorithm::FindICsInDoms(list<avtIntegralCurve *> &lOut,
                                      list<avtIntegralCurve *> &lIn,
                                      set<int> &reqDoms,
                                      int maxSend)
{
    int cnt = static_cast<int>(lOut.size());

    list<avtIntegralCurve*> skipped;
    while (cnt < maxSend && !lIn.empty())
    {
        avtIntegralCurve *ic = lIn.front();
        lIn.pop_front();
        int d = ic->blockList.front().domain;
        
        if (reqDoms.find(d) != reqDoms.end())
        {
            lOut.push_back(ic);
            cnt++;
        }
        else
            skipped.push_back(ic);
    }
    
    // Relink the rejected curves onto the tail of lIn; same resulting order
    // as copying them, without allocating new nodes.
    lIn.splice(lIn.end(), skipped);
}

#endif


