/* Distributed under the Apache License, Version 2.0.
   See accompanying NOTICE file for details.*/

#include <stdio.h>
#include <string>
#include "LBM.h"

// LBM direction encoding. Each 3-letter name is formed from O=zero, P=plus, M=minus,
// with the letters given in z,y,x order (e.g. OOP = +x, OPO = +y, POO = +z).
// The code packs the three letters into 6 bits zzyyxx; within each 2-bit field the less
// significant bit marks the P (plus) direction and the more significant bit the M (minus) direction.
// The right-hand column gives the PDF index of that direction in the D3Q19 model.
// LBM model indices                        D3Q19
#define OOO  0  // 00 00 00                 0
#define OOP  1  // 00 00 01                 1
#define OOM  2  // 00 00 10                 2
                // 00 00 11  not used
#define OPO  4  // 00 01 00                 3
#define OPP  5  // 00 01 01                 4
#define OPM  6  // 00 01 10                 5
                // 00 01 11  not used
#define OMO  8  // 00 10 00                 6
#define OMP  9  // 00 10 01                 7
#define OMM 10  // 00 10 10                 8
                // 00 10 11  not used
                // 00 11 00  not used
                // 00 11 01  not used
                // 00 11 10  not used
                // 00 11 11  not used
#define POO 16  // 01 00 00                 9
#define POP 17  // 01 00 01                10
#define POM 18  // 01 00 10                11
                // 01 00 11  not used
#define PPO 20  // 01 01 00                12
#define PPP 21  // 01 01 01                not used
#define PPM 22  // 01 01 10                not used
                // 01 01 11  not used
#define PMO 24  // 01 10 00                13
#define PMP 25  // 01 10 01                not used
#define PMM 26  // 01 10 10                not used
                // 01 10 11  not used
                // 01 11 00  not used
                // 01 11 01  not used
                // 01 11 10  not used
                // 01 11 11  not used
#define MOO 32  // 10 00 00                14
#define MOP 33  // 10 00 01                15
#define MOM 34  // 10 00 10                16
                // 10 00 11  not used
#define MPO 36  // 10 01 00                17
#define MPP 37  // 10 01 01                not used
#define MPM 38  // 10 01 10                not used
                // 10 01 11  not used
#define MMO 40  // 10 10 00                18
#define MMP 41  // 10 10 01                not used
#define MMM 42  // 10 10 10                not used
                // 10 10 11  not used
                // 10 11 00  not used
                // 10 11 01  not used
                // 10 11 10  not used
                // 10 11 11  not used
                // 11 xx xx  not used

// D3Q19 lattice weights, stored as integer numerators over the common denominator WD3Q19.
// LBMinit hands out wD3Q19[n] / WD3Q19, i.e. 12/36 (rest), 2/36 (axis-aligned) or 1/36 (diagonal).
static float    WD3Q19 = 36.;
static float    wD3Q19[19] = { 12., 2., 2., 2., 1., 1., 2., 1., 1., 2., 1., 1., 1., 1., 2., 1., 1., 1., 1. };
// Direction code (zzyyxx encoding) of each PDF, in D3Q19 PDF index order.
static int    velD3Q19[19] = { OOO,OOP,OOM,OPO,OPP,OPM,OMO,OMP,OMM,POO,POP,POM,PPO,PMO,MOO,MOP,MOM,MPO,MMO };

// Direction codes in order of increasing distance from current node:
// the rest direction, the 6 axis-aligned directions, then the 12 diagonal directions.
// dirD3Q19 holds the integer {z,y,x} offsets of the same directions, entry by entry,
// and indxD3Q19 is the identity map 0..18 over that ordering.
static int   nrmlD3Q19[19] = { OOO,    OOP,    OOM,     OPO,    OMO,     POO,    MOO,     OPP,    OMM,      OPM,     OMP,     PPO,    MMO,      PMO,     MPO,     POP,    MOM,      POM,     MOP };
static int dirD3Q19[19][3] = { {0,0,0},{0,0,1},{0,0,-1},{0,1,0},{0,-1,0},{1,0,0},{-1,0,0},{0,1,1},{0,-1,-1},{0,1,-1},{0,-1,1},{1,1,0},{-1,-1,0},{1,-1,0},{-1,1,0},{1,0,1},{-1,0,-1},{1,0,-1},{-1,0,1} };
static int   indxD3Q19[19] = { 0,      1,      2,       3,      4,       5,      6,       7,      8,        9,       10,      11,     12,       13,      14,      15,     16,       17,      18 };
// Number of single-axis streaming steps each PDF needs, in velD3Q19 order:
// 0 for the rest PDF, 1 for axis-aligned PDFs, 2 for diagonal PDFs (one step per non-zero axis).
static int StreamD3Q19[19] = { 0,  1,  1,  1,  2,  2,  1,  2,  2,  1,  2,  2,  2,  2,  1,  2,  2,  2,  2 };
// Disabled lookup tables (no-slip and per-face reflection maps), kept for reference:
//static int noslpD3Q19[19] = { OOO,OOM,OOP,OMO,OMM,OMP,OPO,OPM,OPP,MOO,MOM,MOP,MMO,MPO,POO,POM,POP,PMO,PPO };
//static int xMrflD3Q19[19] = { OOO,OOP,OOP,OPO,OPP,OPP,OMO,OMP,OMP,POO,POP,POP,PPO,PMO,MOO,MOP,MOP,MPO,MMO };
//static int xPrflD3Q19[19] = { OOO,OOM,OOM,OPO,OPM,OPM,OMO,OMM,OMM,POO,POM,POM,PPO,PMO,MOO,MOM,MOM,MPO,MMO };
//static int yMrflD3Q19[19] = { OOO,OOP,OOM,OPO,OPP,OPM,OPO,OPP,OPM,POO,POP,POM,PPO,PPO,MOO,MOP,MOM,MPO,MPO };
//static int yPrflD3Q19[19] = { OOO,OOP,OOM,OMO,OMP,OMM,OMO,OMP,OMM,POO,POP,POM,PMO,PPO,MOO,MOP,MOM,MMO,MMO };
//static int zMrflD3Q19[19] = { OOO,OOP,OOM,OPO,OPP,OPM,OMO,OMP,OMM,POO,POP,POM,PPO,PMO,POO,POP,POM,PPO,PMO };
//static int zPrflD3Q19[19] = { OOO,OOP,OOM,OPO,OPP,OPM,OMO,OMP,OMM,MOO,MOP,MOM,MPO,MMO,MOO,MOP,MOM,MPO,MMO };

// LBM model codes
#define D3Q19     1    //    1: Navier-Stokes D3Q19
#define D3Q19p0   2    //    2: Initial pressure approximation at u=0 for Navier-Stokes D3Q19
#define D3Q19T    3    //    3: Steady-state thermal field for given u D3Q19

// Set number of allowed wall node normal directions
#define nDirD3Q19 19   //  7: Identify normals along orthogonal directions
                       // 19: Identify normals along all LBM directions

// Per-model sizes; presumably indexed by the model code above (entry 0 unused) - TODO confirm against callers.
// nPDFmodels:  number of PDF components per lattice node.
// nContmodels: number of continuum values per lattice node. The D3Q19 value of 10 matches the
//              10 entries written by ComputeEqQ_D3Q19 (pressure, 3 velocities, 6 stress components).
static int nPDFmodels[4] = { 0,19,19,19 };
static int nContmodels[4] = { 0, 10, 4, 4 };


// LBM model sound speed (in nondimensional lattice units), 1/sqrt(3) for every entry
//                     N/A,        D3Q19:1/sqrt(3)
static float csLBM[4] = { 0.577350269,   0.577350269, 0.577350269, 0.577350269 };

// Boundary condition codes stored per lattice node (read by SetBC):
//   BCinactive            node outside the active domain
//   BCinterior            ordinary interior node
//   mnWALLN..mxWALLN      wall node; the value is the index of the wall normal direction
//   >= PHYSICSBC          flow condition; (code % 100) is the normal index and the
//                         remaining multiple of 100 is one of the condition types below
#define BCinterior  0
#define BCinactive -1
#define WALL        0
#define mnWALLN     1
#define mxWALLN     99
#define PHYSICSBC   100
// Flow condition types (multiples of 100)
#define IMPOSED_INFLOW 100
#define IMPOSED_OUTFLOW 200
#define COMPUTED_INFLOW 300
#define COMPUTED_OUTFLOW 400
#define uINFLOW  500
#define uOUTFLOW 600
#define xINFLOW  1100
#define xOUTFLOW 1200

// Constants
// Buffer size for x-direction streaming (capacity, in floats, of the shared line buffer)
#define mL 512
// Tile size for y,z-directions streaming
#define mB 16
// 1/sqrt(2.): component magnitude of a unit-length diagonal direction vector
#define S2 0.70710678118654746
// Maximum PDF components
#define maxF 24
// Maximum Q components
#define maxQ 12

// Total global memory, in MB (10^6 bytes), of the device chosen by cuda_device_check; 0 until that call
static int DeviceMB = 0;

// Enumerate the CUDA devices, print a summary of each, and make one of them current.
//   use_device: index of the device to use. A negative value, or an index that cannot be
//               used, selects the device with the most global memory instead.
// Returns true when a device was selected (DeviceMB then holds its global memory in MB),
// false when no usable device exists or the selection fails.
extern "C"
LBM_DECL bool cuda_device_check(int use_device)
{
  int nDevices = 0;
  size_t nMostMemory = 0;
  int nBiggestDevice = -1;
  std::string sBiggestDevice;
  std::string sUsingDevice;
  size_t nUsingMemory = 0;
  bool bRequestedFound = false;

  cudaError_t err = cudaGetDeviceCount(&nDevices);
  // A count of zero must be rejected as well: otherwise the fallback below would
  // call cudaSetDevice(-1) and still report success.
  if (err != cudaSuccess || nDevices <= 0)
  {
    if (err != cudaSuccess)
      printf("%s\n", cudaGetErrorString(err));
    printf("No CUDA compatible device found.\n");
    return false;
  }

  for (int i = 0; i < nDevices; i++)
  {
    cudaDeviceProp prop;
    err = cudaGetDeviceProperties(&prop, i);
    printf("Device Number: %d\n", i);
    if (err != cudaSuccess)
    {
      // prop is not valid in this case, so skip the device entirely
      printf("  Unable to query device properties: %s\n\n", cudaGetErrorString(err));
      continue;
    }
    printf("  Device name: %s\n", prop.name);
    printf("  Total Global Memory (Gb): %f\n", (prop.totalGlobalMem / 1e+9));
    // size_t has no portable length modifier across the supported compilers; widen explicitly
    printf("  Shared Memory Per Block (bytes): %llu\n", (unsigned long long)prop.sharedMemPerBlock);
    printf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
    printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
    printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
      2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);

    if (i == use_device)
    {
      bRequestedFound = true;
      sUsingDevice = prop.name;
      nUsingMemory = prop.totalGlobalMem;
    }
    if (prop.totalGlobalMem > nMostMemory)
    {
      nBiggestDevice = i;
      sBiggestDevice = prop.name;
      nMostMemory = prop.totalGlobalMem;
    }
  }

  if (use_device >= nDevices || (use_device >= 0 && !bRequestedFound))
  {
    printf("Requested device not found.\n");
    use_device = -1;
  }
  if (use_device < 0)
  {
    if (nBiggestDevice < 0)
    {
      // Every device query failed (or reported no memory): nothing to fall back to
      printf("No CUDA compatible device found.\n");
      return false;
    }
    use_device = nBiggestDevice;
    sUsingDevice = sBiggestDevice;
    nUsingMemory = nMostMemory;
    printf("Automatically using the largest device found\n");
  }

  err = cudaSetDevice(use_device);
  if (err != cudaSuccess)
  {
    printf("%s\n", cudaGetErrorString(err));
    printf("Unable to select CUDA device %d.\n", use_device);
    return false;
  }
  DeviceMB = static_cast<int>(nUsingMemory / 1e+6);
  printf("Running CUDA on the %s\n\n", sUsingDevice.c_str());

  return true;
}

// Unit-length direction vectors for the 19 lattice directions, stored as {z,y,x} components
// (SetF_FlowBC reads [0] as z, [1] as y, [2] as x). Entries follow the same ordering as the
// integer direction offsets (rest, 6 axis-aligned, 12 diagonal); diagonals use S2 per component.
__constant__ __device__ float vecD3Q19[19][3] = { {0.,0.,0.},{0.,0.,1.},{0.,0.,-1.},{0.,1.,0.},{0.,-1.,0.},{1.,0.,0.},{-1.,0.,0.},{0.,S2,S2},{0.,-S2,-S2},{0.,S2,-S2},{0.,-S2,S2},{S2,S2,0.},{-S2,-S2,0.},{S2,-S2,0.},{-S2,S2,0.},{S2,0.,S2},{-S2,0.,-S2},{S2,0.,-S2},{-S2,0.,S2} };

//
// velD3Q19[19]={OOO,OOP,OOM,OPO,OPP,OPM,OMO,OMP,OMM,POO,POP,POM,PPO,PMO,MOO,MOP,MOM,MPO,MMO};
//                 0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18

// Continuum values for Navier-Stokes D3Q19 from the PDFs of one lattice node.
//   f   : 19 PDFs in velD3Q19 index order
//   fn  : 19 non-equilibrium PDF parts in the same order
//   qeq : output, 10 values: [0] pressure, [1..3] velocity in x,y,z,
//         [4] Sigma_xy, [5] Sigma_xz, [6] Sigma_yz, [7] Sigma_xx, [8] Sigma_yy, [9] Sigma_zz
//   tmp : common scale factor applied to the six stress sums
// Each off-diagonal stress sums fn over the four PDFs that move along both axes, signed by the
// product of the two direction components (+ when the signs agree, - when they differ).
// Marked __host__ as well so the moment sums can be checked on the CPU.
__host__ __device__ void ComputeEqQ_D3Q19(float* f, float* fn, float* qeq, float tmp) {
  qeq[0] = 0.; for (int n = 0; n < 19; n++) qeq[0] += f[n];  // Pressure
  qeq[1] = f[1] - f[2] + f[4] - f[5] + f[7] - f[8] + f[10] - f[11] + f[15] - f[16]; // Velocity in x-dir
  qeq[2] = f[3] + f[4] + f[5] - f[6] - f[7] - f[8] + f[12] - f[13] + f[17] - f[18]; // Velocity in y-dir
  qeq[3] = f[9] + f[10] + f[11] + f[12] + f[13] - f[14] - f[15] - f[16] - f[17] - f[18]; // Velocity in z-dir
  qeq[4] = tmp * (fn[4] - fn[5] - fn[7] + fn[8]);  // Sigma_xy: OPP(+) OPM(-) OMP(-) OMM(+)
  // PDF 16 is MOM (x = -1, z = -1): the component product is +1, so it enters with a plus sign
  qeq[5] = tmp * (fn[10] - fn[11] - fn[15] + fn[16]);  // Sigma_xz: POP(+) POM(-) MOP(-) MOM(+)
  qeq[6] = tmp * (fn[12] - fn[13] - fn[17] + fn[18]);  // Sigma_yz: PPO(+) PMO(-) MPO(-) MMO(+)
  qeq[7] = tmp * (fn[1] + fn[2] + fn[4] + fn[5] + fn[7] + fn[8] + fn[10] + fn[11] + fn[15] + fn[16]); // Sigma_xx
  qeq[8] = tmp * (fn[3] + fn[4] + fn[5] + fn[6] + fn[7] + fn[8] + fn[12] + fn[13] + fn[17] + fn[18]); // Sigma_yy
  qeq[9] = tmp * (fn[9] + fn[10] + fn[11] + fn[12] + fn[13] + fn[14] + fn[15] + fn[16] + fn[17] + fn[18]); // Sigma_zz
}

// Pressure and velocity moments of the 19 PDFs of one node (no stress terms).
//   f   : 19 PDFs in velD3Q19 index order
//   qeq : output, [0] sum of all PDFs, [1..3] signed sums along x, y and z
__device__ void ComputeEqQns_D3Q19(float* f, float* qeq) {
    // Zeroth moment: plain sum, accumulated in index order
    float total = 0.;
    for (int k = 0; k < 19; ++k)
        total += f[k];

    // First moments: PDFs moving in the + direction minus those moving in the - direction
    const float momX = f[1] - f[2] + f[4] - f[5] + f[7] - f[8] + f[10] - f[11] + f[15] - f[16];
    const float momY = f[3] + f[4] + f[5] - f[6] - f[7] - f[8] + f[12] - f[13] + f[17] - f[18];
    const float momZ = f[9] + f[10] + f[11] + f[12] + f[13] - f[14] - f[15] - f[16] - f[17] - f[18];

    qeq[0] = total;
    qeq[1] = momX;
    qeq[2] = momY;
    qeq[3] = momZ;
}

// Scalar moment only: qeq[0] receives the sum of the 19 PDFs in f; no other entry of qeq is written.
__device__ void pComputeEqQ_D3Q19(float* f, float* qeq) {
  float total = 0.;
  for (int k = 0; k < 19; ++k)
    total += f[k];
  qeq[0] = total;
}

// Equilibrium PDFs for D3Q19 from the continuum values of one node (device version).
//   qeq : input, [0] = p, [1] = u, [2] = v, [3] = w; u, v, w are the velocity components
//         along x, y and z (they drive the x-, y- and z-direction PDFs below)
//   feq : output, 19 PDFs in velD3Q19 index order
// Every moving PDF gets weight * (p - 1.5*|U|^2 + 3*eu + 4.5*eu^2), where eu is the sum of the
// velocity components along that PDF's direction; the rest PDF gets (1/3) * (p - 1.5*|U|^2).
// Weights are 1/3 (rest), 1/18 (axis-aligned) and 1/36 (diagonal). Negative results are clamped to 0.
// The unsuffixed literals are doubles, so each expression is evaluated in double and then
// narrowed to float on assignment.
__device__ void ComputeEqF_D3Q19(float* qeq, float* feq) {
  float p, u, v, w, U32, eu;
  p = qeq[0]; u = qeq[1]; v = qeq[2]; w = qeq[3];
  U32 = 1.5 * (u * u + v * v + w * w);  // 1.5 * |U|^2, shared by all PDFs
  eu = 0.;  feq[0] = 0.333333333 * (p - U32);  // rest PDF (eu is not used here)
  eu = u;   feq[1] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -u;   feq[2] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = v;   feq[3] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = v + u; feq[4] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = v - u; feq[5] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -v;   feq[6] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -v + u; feq[7] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -v - u; feq[8] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w;   feq[9] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w + u; feq[10] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w - u; feq[11] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w + v; feq[12] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w - v; feq[13] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w;   feq[14] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w + u; feq[15] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w - u; feq[16] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w + v; feq[17] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w - v; feq[18] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  // Clamp: no PDF is allowed to go negative
  for (int n = 0; n < 19; n++) if (feq[n] < 0.) feq[n] = 0.;
}

// Zero-velocity equilibrium: every PDF is its lattice weight times the scalar qeq[0].
//   qeq : input, only qeq[0] is read; values that are not > 0 are treated as 0
//   feq : output, 19 PDFs in velD3Q19 index order
__device__ void pComputeEqF_D3Q19(float* qeq, float* feq) {
  // Lattice weights per PDF index: 1/3 for the rest PDF, 1/18 for axis-aligned, 1/36 for diagonal.
  // Kept as doubles so each product is formed in double and then narrowed to float.
  const double weight[19] = {
    0.333333333,
    0.055555556, 0.055555556, 0.055555556,
    0.027777778, 0.027777778,
    0.055555556,
    0.027777778, 0.027777778,
    0.055555556,
    0.027777778, 0.027777778, 0.027777778, 0.027777778,
    0.055555556,
    0.027777778, 0.027777778, 0.027777778, 0.027777778
  };
  float scalar = qeq[0];
  if (!(scalar > 0.))
    scalar = 0.;
  for (int n = 0; n < 19; ++n)
    feq[n] = weight[n] * scalar;
}

// Host-side copy of ComputeEqF_D3Q19: equilibrium PDFs for D3Q19 from continuum values.
//   qeq : input, [0] = p, [1] = u, [2] = v, [3] = w (velocity components along x, y, z)
//   feq : output, 19 PDFs in velD3Q19 index order
// Every moving PDF gets weight * (p - 1.5*|U|^2 + 3*eu + 4.5*eu^2) with eu the sum of the velocity
// components along that PDF's direction; weights are 1/3 (rest), 1/18 (axis), 1/36 (diagonal).
// Negative results are clamped to 0. Keep the statements in step with the device version.
void HostComputeEqF_D3Q19(float* qeq, float* feq) {
  float p, u, v, w, U32, eu;
  p = qeq[0]; u = qeq[1]; v = qeq[2]; w = qeq[3];
  U32 = 1.5 * (u * u + v * v + w * w);  // 1.5 * |U|^2, shared by all PDFs
  eu = 0.;  feq[0] = 0.333333333 * (p - U32);  // rest PDF (eu is not used here)
  eu = u;   feq[1] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -u;   feq[2] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = v;   feq[3] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = v + u; feq[4] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = v - u; feq[5] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -v;   feq[6] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -v + u; feq[7] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -v - u; feq[8] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w;   feq[9] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w + u; feq[10] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w - u; feq[11] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w + v; feq[12] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = w - v; feq[13] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w;   feq[14] = 0.055555556 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w + u; feq[15] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w - u; feq[16] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w + v; feq[17] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  eu = -w - v; feq[18] = 0.027777778 * (p - U32 + (3. + 4.5 * eu) * eu);
  // Clamp: no PDF is allowed to go negative
  for (int n = 0; n < 19; n++) if (feq[n] < 0.) feq[n] = 0.;
}

// Testing routines
// Fill F with the equilibrium PDFs that correspond to the continuum values in Q.
// One thread per lattice node: x index from threadIdx.x, y from blockIdx.x, z from blockIdx.y.
// Component m of a node lives at offset node + m*Qsize in Q and node + m*Fsize in F.
__global__ void EqF_D3Q19(int nPDF, int mQ, int Mx, int My,
  int Fsize, int Qsize, float* Q, float* F) {
  const int ix = threadIdx.x;
  const int iy = blockIdx.x;
  const int iz = blockIdx.y;
  const long int node = ix + Mx * (iy + My * iz);
  float pdf[maxF], cont[maxQ];
  long int at;
  int m;

  // Gather this node's continuum values
  for (m = 0, at = node; m < mQ; ++m, at += Qsize)
    cont[m] = Q[at];

  ComputeEqF_D3Q19(cont, pdf);

  // Scatter the resulting PDFs
  for (m = 0, at = node; m < nPDF; ++m, at += Fsize)
    F[at] = pdf[m];
}

// Fill Q with the pressure/velocity moments of the PDFs in F (inverse direction of EqF_D3Q19).
// One thread per lattice node: x index from threadIdx.x, y from blockIdx.x, z from blockIdx.y.
// Component m of a node lives at offset node + m*Fsize in F and node + m*Qsize in Q.
__global__ void EqQ_D3Q19(int nPDF, int mQ, int Mx, int My,
  int Fsize, int Qsize, float* F, float* Q) {
  const int ix = threadIdx.x;
  const int iy = blockIdx.x;
  const int iz = blockIdx.y;
  const long int node = ix + Mx * (iy + My * iz);
  float pdf[maxF], cont[maxQ];
  long int at;
  int m;

  // Gather this node's PDFs
  for (m = 0, at = node; m < nPDF; ++m, at += Fsize)
    pdf[m] = F[at];

  ComputeEqQns_D3Q19(pdf, cont);

  // Scatter the first mQ continuum values
  for (m = 0, at = node; m < mQ; ++m, at += Qsize)
    Q[at] = cont[m];
}

// Shift one x-line of values by one lattice site towards +x: Fout[i] = Fin[i-1].
// The first site (i = 0) has no left neighbour and keeps its own value.
// Thread x is the site index along the line; block x and block y pick the line (j, k).
// The line is staged in a shared buffer holding mL floats.
__global__ void StreamRight(float* Fin, float* Fout, int Lx, int Mx, int My) {
  __shared__ float row[mL];
  const int ix = threadIdx.x;
  const int iy = blockIdx.x;
  const int iz = blockIdx.y;
  const long int node = ix + Mx * (iy + My * iz);

  if (ix < Lx)
    row[ix] = Fin[node];
  __syncthreads();  // the whole line must be staged before neighbours are read

  Fout[node] = row[ix > 0 ? ix - 1 : 0];
}

// Shift one x-line of values by one lattice site towards -x: Fout[i] = Fin[i+1].
// Sites from Lx-1 onwards have no right neighbour and receive the value of site Lx-1.
// Thread x is the site index along the line; block x and block y pick the line (j, k).
// The line is staged in a shared buffer holding mL floats.
__global__ void StreamLeft(float* Fin, float* Fout, int Lx, int Mx, int My) {
  __shared__ float row[mL];
  const int ix = threadIdx.x;
  const int iy = blockIdx.x;
  const int iz = blockIdx.y;
  const long int node = ix + Mx * (iy + My * iz);

  if (ix < Lx)
    row[ix] = Fin[node];
  __syncthreads();  // the whole line must be staged before neighbours are read

  Fout[node] = row[ix < Lx - 1 ? ix + 1 : Lx - 1];
}

// Shift values by one lattice site towards +y: Fout(i,j,k) = Fin(i,j-1,k).
// The j = 0 row has no lower neighbour and keeps its own value.
// Threads form an (x,y) tile of the lattice; each thread walks all Lz planes.
__global__ void StreamUp(float* Fin, float* Fout, int Lx, int Ly, int Lz, int Mx, int My) {
  __shared__ float tile[mB][mB + 1];
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int i = threadIdx.x + blockIdx.x * blockDim.x; // Lattice i-index
  const int j = threadIdx.y + blockIdx.y * blockDim.y; // Lattice j-index
  const int plane = Mx * My;
  const bool inside = (i < Lx) && (j < Ly);

  for (int k = 0; k < Lz; ++k) {
    const long int edge = i + plane * k;        // site (i, 0, k)
    const long int src = edge + Mx * (j - 1);   // site (i, j-1, k)
    const long int dst = src + Mx;              // site (i, j, k)
    if (inside)
      tile[tx][ty] = (j > 0) ? Fin[src] : Fin[edge];
    __syncthreads();
    if (inside)
      Fout[dst] = tile[tx][ty];
  }
}
// Shift values by one lattice site towards -y: Fout(i,j,k) = Fin(i,j+1,k).
// The j = Ly-1 row has no upper neighbour and keeps its own value.
// Threads form an (x,y) tile of the lattice; each thread walks all Lz planes.
__global__ void StreamDown(float* Fin, float* Fout, int Lx, int Ly, int Lz, int Mx, int My) {
  __shared__ float tile[mB][mB + 1];
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int i = threadIdx.x + blockIdx.x * blockDim.x; // Lattice i-index
  const int j = threadIdx.y + blockIdx.y * blockDim.y; // Lattice j-index
  const int plane = Mx * My;
  const bool inside = (i < Lx) && (j < Ly);

  for (int k = 0; k < Lz; ++k) {
    const long int here = i + Mx * j + plane * k;  // site (i, j, k)
    const long int src = here + Mx;                // site (i, j+1, k)
    if (inside)
      tile[tx][ty] = (j < Ly - 1) ? Fin[src] : Fin[here];
    __syncthreads();
    if (inside)
      Fout[here] = tile[tx][ty];
  }
}
// Shift values by one lattice site towards +z: Fout(i,j,k) = Fin(i,j,k-1).
// The k = 0 plane has no neighbour behind it and keeps its own value.
// Threads form an (x,z) tile of the lattice; each thread walks all Ly rows.
__global__ void StreamFront(float* Fin, float* Fout, int Lx, int Ly, int Lz, int Mx, int My) {
  __shared__ float tile[mB][mB + 1];
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int i = threadIdx.x + blockIdx.x * blockDim.x; // Lattice i-index
  const int k = threadIdx.y + blockIdx.y * blockDim.y; // Lattice k-index
  const int plane = Mx * My;
  const bool inside = (i < Lx) && (k < Lz);

  for (int j = 0; j < Ly; ++j) {
    const long int edge = i + Mx * j;              // site (i, j, 0)
    const long int src = edge + plane * (k - 1);   // site (i, j, k-1)
    const long int dst = src + plane;              // site (i, j, k)
    if (inside)
      tile[tx][ty] = (k > 0) ? Fin[src] : Fin[edge];
    __syncthreads();
    if (inside)
      Fout[dst] = tile[tx][ty];
  }
}
// Shift values by one lattice site towards -z: Fout(i,j,k) = Fin(i,j,k+1).
// The k = Lz-1 plane has no neighbour in front of it and keeps its own value.
// Threads form an (x,z) tile of the lattice; each thread walks all Ly rows.
__global__ void StreamBack(float* Fin, float* Fout, int Lx, int Ly, int Lz, int Mx, int My) {
  __shared__ float tile[mB][mB + 1];
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int i = threadIdx.x + blockIdx.x * blockDim.x; // Lattice i-index
  const int k = threadIdx.y + blockIdx.y * blockDim.y; // Lattice k-index
  const int plane = Mx * My;
  const bool inside = (i < Lx) && (k < Lz);

  for (int j = 0; j < Ly; ++j) {
    const long int here = i + Mx * j + plane * k;  // site (i, j, k)
    const long int src = here + plane;             // site (i, j, k+1)
    if (inside)
      tile[tx][ty] = (k < Lz - 1) ? Fin[src] : Fin[here];
    __syncthreads();
    if (inside)
      Fout[here] = tile[tx][ty];
  }
}

// Compute the PDFs f1 of a flow-boundary node.
//   nLBMmodel : model code (D3Q19, D3Q19p0 or D3Q19T); any other value leaves f1 untouched
//   f0, f1    : old and new PDFs on the boundary node (f1 is the output)
//   f0n1..f1n2, feq : neighbour PDFs and scratch space supplied by the caller; not used here
//   bcnrml    : boundary code; (bcnrml % 100) is the normal index, the remainder is the condition type
//   qbc       : continuum values on the node; [0] is the imposed scalar (pressure), [1] the
//               imposed normal velocity
__device__ void SetF_FlowBC(int nLBMmodel, float* f0, float* f1,
  float* f0n1, float* f1n1, float* f0n2, float* f1n2,
  float* feq, int bcnrml, float* qbc) {
  float cont[maxQ];
  const int nrml = bcnrml % 100;
  const int bc = bcnrml - nrml;

  if (nLBMmodel == D3Q19) {
    // Fluid flow: impose continuum values through the equilibrium distribution
    if (bc == IMPOSED_INFLOW || bc == IMPOSED_OUTFLOW) {
      // Normal velocity, sign-flipped for outflow, projected on the Cartesian axes.
      // vecD3Q19 stores components as {z, y, x}.
      const float un = (bc == IMPOSED_INFLOW) ? qbc[1] : -qbc[1];
      cont[0] = qbc[0];
      cont[1] = un * vecD3Q19[nrml][2];
      cont[2] = un * vecD3Q19[nrml][1];
      cont[3] = un * vecD3Q19[nrml][0];
      ComputeEqF_D3Q19(cont, f1);
    }
    else {
      // Unknown condition type: report it and carry the old PDFs over unchanged
      printf("Warning: undefined flow BC: bcnrml=%d bc=%d nrml=%d\n", bcnrml, bc, nrml);
      for (int m = 0; m < 19; ++m)
        f1[m] = f0[m];
    }
  }
  else if (nLBMmodel == D3Q19p0 || nLBMmodel == D3Q19T) {
    // Dirichlet condition on the scalar field (initial pressure or temperature), zero velocity
    cont[0] = qbc[0];
    cont[1] = 0.;
    cont[2] = 0.;
    cont[3] = 0.;
    pComputeEqF_D3Q19(cont, f1);
  }
}

// Bounce-back wall condition: each new PDF f1[n] takes the old PDF f0 of the opposite direction.
// Applies to all three D3Q19 model codes; any other model code leaves f1 untouched.
// wBC is accepted for interface compatibility and is not used.
__device__ void SetF_WallBC(int nLBMmodel, float* f0, float* f1, int wBC) {
  // velD3Q19[19]={OOO,OOP,OOM,OPO,OPP,OPM,OMO,OMP,OMM,POO,POP,POM,PPO,PMO,MOO,MOP,MOM,MPO,MMO};
  //                 0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  // opposite[n] is the index of the PDF travelling in the reverse direction of PDF n
  const int opposite[19] = { 0, 2, 1, 6, 8, 7, 3, 5, 4, 14, 16, 15, 18, 17, 9, 11, 10, 13, 12 };

  const bool known = (nLBMmodel == D3Q19) || (nLBMmodel == D3Q19p0) || (nLBMmodel == D3Q19T);
  if (!known)
    return;

  for (int n = 0; n < 19; ++n)
    f1[n] = f0[opposite[n]];
}

// 1. Impose boundary conditions on most recent PDFs stored in F1
// 2. Transfer PDFs modified by boundary conditions into F0 in preparation for next iteration
// One thread per lattice node: x index from threadIdx.x, y from blockIdx.x, z from blockIdx.y.
//   nLBMmodel   : model code forwarded to the wall/flow BC helpers
//   nPDF, Fsize : number of PDFs per node and the offset between consecutive PDFs of a node in F0/F1
//   nQ, Qsize   : number of continuum values per node and the offset between them in Q1
//   Mx, My      : factors used to linearise (i,j,k) as i + Mx*(j + My*k)
//   Finact      : nPDF values assigned to every inactive node
//   G           : per-node boundary condition code
//   Q1          : per-node continuum values, read only for flow-condition nodes
__global__ void SetBC(int nLBMmodel, int nPDF, int Fsize, int nQ, int Qsize,
  int Mx, int My,
  float* F0, float* F1, float* Finact, int* G, float* Q1) {
  int i = threadIdx.x;  // Lattice i-index
  int j = blockIdx.x;  // Lattice j-index
  int k = blockIdx.y;  // Lattice k-index
  // Integer {z,y,x} offsets of the 19 lattice directions, indexed by normal code
  static int DirD3Q19[19][3] = { {0,0,0},{0,0,1},{0,0,-1},{0,1,0},{0,-1,0},{1,0,0},{-1,0,0},{0,1,1},{0,-1,-1},{0,1,-1},{0,-1,1},{1,1,0},{-1,-1,0},{1,-1,0},{-1,1,0},{1,0,1},{-1,0,-1},{1,0,-1},{-1,0,1} };
  long int loc, loc0, loc1, loc2;
  int n, d, bc, j0[3], j1[3], j2[3];
  // f0 , f1    old,new f values on a BC node
  // f0n1, f1n1 old,new f values on node interior to BC node along lattice normal n by 1 lattice step
  // f0n2, f1n2 old,new f values on node interior to BC node along lattice normal n by 2 lattice steps
  float f0[maxF], f1[maxF], f0n1[maxF], f1n1[maxF], f0n2[maxF], f1n2[maxF], feq[maxF], qbc[maxQ];
  // Offset within GPU memory of this lattice node
  loc0 = i + Mx * (j + My * k); j0[0] = i; j0[1] = j; j0[2] = k;
  // This node's boundary condition code
  bc = G[loc0];
  // Process node PDFs according to boundary condition type.
  // NOTE(review): a code below BCinactive matches none of the cases, so f1 would be stored
  // without ever being assigned - confirm such codes cannot occur in G.
  // (-1) Inactive node: reinitialize node values in f1 to undo stream & collide
  if (bc == BCinactive) for (n = 0; n < nPDF; n++) f1[n] = Finact[n];
  // ( 0) Interior node: place values from current time step into f1
  if (bc == BCinterior) {
    loc = loc0;
    for (n = 0; n < nPDF; n++) { f1[n] = F1[loc]; loc += Fsize; }
  }
  // (mnWALLN..mxWALLN) Wall node: - place current time step values of the interior neighbour into f0;
  //                               - call WallBC to obtain f1 from f0
  // NOTE(review): the normal code indexes DirD3Q19, which has 19 rows, while mxWALLN is 99 -
  // confirm that wall codes never exceed 18.
  if ((mnWALLN <= bc) && (bc <= mxWALLN)) {
    n = bc % 100;  // Code for normal orientation, find indices
    // DirD3Q19 rows are {z,y,x}; 2 - d maps them onto the (i,j,k) order of j0/j1
    for (d = 0; d < 3; d++) j1[d] = j0[d] + DirD3Q19[n][2 - d];
    //   Place interior F1 values into f0 (1 lattice step in)
    loc1 = j1[0] + Mx * (j1[1] + My * j1[2]);
    loc = loc1;
    for (n = 0; n < nPDF; n++) { f0[n] = F1[loc]; loc += Fsize; }
    SetF_WallBC(nLBMmodel, f0, f1, bc);
  }
  // (>=100) Physics condition (inflow/outflow/interface)
  if (bc >= PHYSICSBC) {
    // Provide data that might be needed by various physical boundary conditions:
    //   1. Load node continuum values into registers
    loc = loc0;
    for (n = 0; n < nQ; n++)
    {
      qbc[n] = Q1[loc]; loc += Qsize;
    }
    //   2. Load node PDFs into registers
    loc = loc0;
    for (n = 0; n < nPDF; n++) {
      f0[n] = F0[loc]; f1[n] = F1[loc]; loc += Fsize;
    }
    //   3. Load PDFs along interior-pointing normal direction into registers
    n = bc % 100;  // Code for normal orientation, find indices
    for (d = 0; d < 3; d++) {
      j1[d] = j0[d] + DirD3Q19[n][2 - d];
      j2[d] = j0[d] + 2 * DirD3Q19[n][2 - d];
    }
    //   4. Place interior F0,F1 values into f0n1,f1n1 (1 lattice step in)
    loc1 = j1[0] + Mx * (j1[1] + My * j1[2]);
    loc = loc1;
    for (n = 0; n < nPDF; n++) {
      f0n1[n] = F0[loc]; f1n1[n] = F1[loc]; loc += Fsize;
    }
    //   5. Place interior F0,F1 values into f0n2,f1n2 (2 lattice steps in)
    loc2 = j2[0] + Mx * (j2[1] + My * j2[2]);
    loc = loc2;
    for (n = 0; n < nPDF; n++) {
      f0n2[n] = F0[loc]; f1n2[n] = F1[loc]; loc += Fsize;
    }
    // Process BC
    SetF_FlowBC(nLBMmodel, f0, f1, f0n1, f1n1, f0n2, f1n2, feq, bc, qbc);
  }
  // Store BC-modified values from registers f1 into F0 in preparation for next time step
  // Also place values into F1 that will be used as work space in 2-step streaming operations
  loc = loc0;
  for (n = 0; n < nPDF; n++) {
    F1[loc] = F0[loc] = f1[n]; loc += Fsize;
  }
}

// Collision step for Navier-Stokes D3Q19: relax every PDF towards its equilibrium value.
// One thread per lattice node: x index from threadIdx.x, y from blockIdx.x, z from blockIdx.y.
//   F1   : PDFs, updated in place with f + (feq - f) / tau
//   Fneq : receives the post-collision non-equilibrium part (new f minus feq)
__global__ void CollideD3Q19(int nPDF, int Fsize, float tau,
  int Mx, int My,
  float* F1, float* Fneq) {
  const int ix = threadIdx.x;
  const int iy = blockIdx.x;
  const int iz = blockIdx.y;
  const long int node = ix + Mx * (iy + My * iz);
  float pre[19], eq[19], mom[4];
  long int at;
  int n;

  // Load this node's PDFs
  for (n = 0, at = node; n < nPDF; ++n, at += Fsize)
    pre[n] = F1[at];

  // Equilibrium from the node's own pressure and velocity
  ComputeEqQns_D3Q19(pre, mom);
  ComputeEqF_D3Q19(mom, eq);

  // Relax and store
  for (n = 0, at = node; n < nPDF; ++n, at += Fsize) {
    const float post = pre[n] + (eq[n] - pre[n]) / tau;
    F1[at] = post;
    Fneq[at] = post - eq[n];
  }
}

// Collision step of the approximate pressure Poisson solver (velocity kept at zero).
// One thread per lattice node: x index from threadIdx.x, y from blockIdx.x, z from blockIdx.y.
// F1 is updated in place with f + (feq - f) / tau, where feq depends only on the sum of the PDFs.
__global__ void p0CollideD3Q19(int nPDF, int Fsize, float tau,
  int Mx, int My,
  float* F1) {
  const int ix = threadIdx.x;
  const int iy = blockIdx.x;
  const int iz = blockIdx.y;
  const long int node = ix + Mx * (iy + My * iz);
  float pre[19], eq[19], mom[4];
  long int at;
  int n;

  // Load this node's PDFs
  for (n = 0, at = node; n < nPDF; ++n, at += Fsize)
    pre[n] = F1[at];

  // Zero-velocity equilibrium from the scalar moment
  pComputeEqQ_D3Q19(pre, mom);
  pComputeEqF_D3Q19(mom, eq);

  // Relax and store
  for (n = 0, at = node; n < nPDF; ++n, at += Fsize)
    F1[at] = pre[n] + (eq[n] - pre[n]) / tau;
}

// Collision step of the advection-diffusion solver (given velocity field).
// One thread per lattice node: x index from threadIdx.x, y from blockIdx.x, z from blockIdx.y.
// F1 is updated in place with f + (feq - f) / tau, where feq depends only on the sum of the PDFs.
__global__ void TCollideD3Q19(int nPDF, int Fsize, float tau,
  int Mx, int My,
  float* F1) {
  const int ix = threadIdx.x;
  const int iy = blockIdx.x;
  const int iz = blockIdx.y;
  const long int node = ix + Mx * (iy + My * iz);
  float pre[19], eq[19], mom[4];
  long int at;
  int n;

  // Load this node's PDFs
  for (n = 0, at = node; n < nPDF; ++n, at += Fsize)
    pre[n] = F1[at];

  // Equilibrium from the scalar moment only
  pComputeEqQ_D3Q19(pre, mom);
  pComputeEqF_D3Q19(mom, eq);

  // Relax and store
  for (n = 0, at = node; n < nPDF; ++n, at += Fsize)
    F1[at] = pre[n] + (eq[n] - pre[n]) / tau;
}

// Continuum field values for the D3Q19p0 model: the first mQ of (pressure, x-, y-, z-velocity)
// computed from each node's PDFs. No stress components are produced by this kernel.
// One thread per lattice node: x index from threadIdx.x, y from blockIdx.x, z from blockIdx.y.
__global__ void ContVD3Q19p0(int nPDF, int mQ, int Fsize, int Qsize,
                           int Mx, int My, float* F1, float* Q1) {
    const int ix = threadIdx.x;
    const int iy = blockIdx.x;
    const int iz = blockIdx.y;
    const long int node = ix + Mx * (iy + My * iz);
    float pdf[19], mom[4];
    long int at;
    int n;

    // Load this node's PDFs
    for (n = 0, at = node; n < nPDF; ++n, at += Fsize)
        pdf[n] = F1[at];

    ComputeEqQns_D3Q19(pdf, mom);

    // Store the requested continuum values
    for (n = 0, at = node; n < mQ; ++n, at += Qsize)
        Q1[at] = mom[n];
}
// Continuum field values for Navier-Stokes D3Q19: pressure, velocity and the six stress
// components (built from the non-equilibrium PDFs in Fneq, scaled by tmp). The first mQ are stored.
// One thread per lattice node: x index from threadIdx.x, y from blockIdx.x, z from blockIdx.y.
__global__ void ContVD3Q19(int nPDF, int mQ, int Fsize, int Qsize,
  float tmp, int Mx, int My, float* F1, float* Q1, float* Fneq) {
  const int ix = threadIdx.x;
  const int iy = blockIdx.x;
  const int iz = blockIdx.y;
  const long int node = ix + Mx * (iy + My * iz);
  float pdf[19], neq[19], mom[10];
  long int at;
  int n;

  // Load this node's PDFs and their non-equilibrium parts
  for (n = 0, at = node; n < nPDF; ++n, at += Fsize) {
    pdf[n] = F1[at];
    neq[n] = Fneq[at];
  }

  ComputeEqQ_D3Q19(pdf, neq, mom, tmp);

  // Store the requested continuum values
  for (n = 0, at = node; n < mQ; ++n, at += Qsize)
    Q1[at] = mom[n];
}

// Continuum field values for advection-diffusion D3Q19: the first mQ of the four moments
// (sum of PDFs and the three signed directional sums) computed from each node's PDFs.
// One thread per lattice node: x index from threadIdx.x, y from blockIdx.x, z from blockIdx.y.
__global__ void ContTD3Q19(int nPDF, int mQ, int Fsize, int Qsize,
  int Mx, int My, float* F1, float* Q1) {
  const int ix = threadIdx.x;
  const int iy = blockIdx.x;
  const int iz = blockIdx.y;
  const long int node = ix + Mx * (iy + My * iz);
  float pdf[19], mom[4];
  long int at;
  int n;

  // Load this node's PDFs
  for (n = 0, at = node; n < nPDF; ++n, at += Fsize)
    pdf[n] = F1[at];

  ComputeEqQns_D3Q19(pdf, mom);

  // Store the requested continuum values
  for (n = 0, at = node; n < mQ; ++n, at += Qsize)
    Q1[at] = mom[n];
}

// Host routines

// Fill the host-side model tables for the selected LBM model.
//   nLBMmodel : model code; values other than the three D3Q19 codes leave every output untouched
//   nPDF, mQ  : accepted for interface compatibility; the D3Q19 tables always have 19 entries
//   vel       : output, direction code of each of the 19 PDFs
//   s         : output, number of streaming steps of each PDF
//   w         : output, normalised lattice weight of each PDF
//   finact    : output, 19 equilibrium PDFs for unit pressure and zero velocity
//               (the values assigned to inactive nodes)
void LBMinit(int nLBMmodel, int nPDF, int mQ, int* vel, int* s, float* w, float* finact) {
  // Zero the whole array up front: HostComputeEqF_D3Q19 always reads entries 1..3, so they
  // must be defined even when mQ < 4, and no loop bounded by mQ can run past maxQ.
  float qinact[maxQ] = { 0.f };
  switch (nLBMmodel) {
  case D3Q19:
  case D3Q19p0:
  case D3Q19T: {
    for (int n = 0; n < 19; n++) {
      vel[n] = velD3Q19[n]; s[n] = StreamD3Q19[n]; w[n] = wD3Q19[n] / WD3Q19;
    }
    qinact[0] = 1.;  // unit pressure, zero velocity
    HostComputeEqF_D3Q19(qinact, finact);
    break;
  }
  }
}

// Initialise the PDFs F from the continuum values Q by launching EqF_D3Q19 over the lattice.
// Launch layout: one block per x-line with L[0] threads, on a grid of L[1] x L[2] blocks.
// Model codes other than the three D3Q19 codes launch nothing. G and QBC are not used here.
void LBMinitF(int nLBMmodel, int nPDF, int mQ, int* L, int* M, int Fsize, int Qsize,
  int* G, float* Q, float* F, float* QBC) {
  const dim3 block(L[0], 1, 1);   // one thread per lattice site along x
  const dim3 grid(L[1], L[2]);    // one block per (y, z) line

  const bool isD3Q19 = (nLBMmodel == D3Q19) || (nLBMmodel == D3Q19p0) || (nLBMmodel == D3Q19T);
  if (isD3Q19)
    EqF_D3Q19<<<grid, block>>>(nPDF, mQ, M[0], M[1], Fsize, Qsize, Q, F);
}

// Apply the boundary conditions to the whole lattice by launching SetBC.
// Launch layout: one block per x-line with L[0] threads, on a grid of L[1] x L[2] blocks.
// mF is not used here; QBC is handed to the kernel as its continuum-value array.
void LBMsetBC(int nLBMmodel, int nPDF, int Fsize, int nQ, int Qsize, int* L, int* M,
  long int* mF, float* F0, float* F1, float* Finact, int* G, float* QBC) {
  const dim3 block(L[0], 1, 1);   // one thread per lattice site along x
  const dim3 grid(L[1], L[2]);    // one block per (y, z) line
  SetBC<<<grid, block>>>(nLBMmodel, nPDF, Fsize, nQ, Qsize, M[0], M[1], F0, F1, Finact, G, QBC);
}

// Launch one single-step streaming kernel that copies F0 into F1 shifted by one lattice site.
//   dir : 0/1 = +x/-x, 2/3 = +y/-y, 4/5 = +z/-z; any other value launches nothing
//   L   : lattice extents along x, y, z
//   M   : M[0] and M[1] are passed to the kernels as the Mx, My linearisation factors
//   F0  : source values, F1 : destination values
// x-direction kernels use one block per x-line (L[0] threads) and stage the line in a shared
// buffer of mL floats; y/z-direction kernels use mB x mB thread tiles.
void stream(int dir, int* L, int* M, float* F0, float* F1) {
  if (dir < 0 || dir > 5)
    return;

  dim3 dG, dB;
  if (dir < 2) {
    // The shared line buffer in StreamRight/StreamLeft holds mL entries; a longer line
    // would write past it, so refuse the launch instead of corrupting memory.
    if (L[0] > mL) {
      printf("stream: x extent %d exceeds the streaming buffer size %d\n", L[0], mL);
      return;
    }
    dB.x = L[0]; dB.y = 1; dB.z = 1;
    dG.x = L[1]; dG.y = L[2];
  }
  else {
    // Tiles cover x and the streamed axis (y for dir 2/3, z for dir 4/5); round the tile count up
    const int Lt = (dir < 4) ? L[1] : L[2];
    int nBx = L[0] / mB; if (L[0] % mB > 0) nBx++;
    int nBt = Lt / mB;   if (Lt % mB > 0) nBt++;
    dB.x = mB; dB.y = mB; dB.z = 1;
    dG.x = nBx; dG.y = nBt;
  }

  switch (dir) {
  case 0: StreamRight<<<dG, dB>>>(F0, F1, L[0], M[0], M[1]); break;                   // positive x-direction
  case 1: StreamLeft<<<dG, dB>>>(F0, F1, L[0], M[0], M[1]); break;                    // negative x-direction
  case 2: StreamUp<<<dG, dB>>>(F0, F1, L[0], L[1], L[2], M[0], M[1]); break;          // positive y-direction
  case 3: StreamDown<<<dG, dB>>>(F0, F1, L[0], L[1], L[2], M[0], M[1]); break;        // negative y-direction
  case 4: StreamFront<<<dG, dB>>>(F0, F1, L[0], L[1], L[2], M[0], M[1]); break;       // positive z-direction
  case 5: StreamBack<<<dG, dB>>>(F0, F1, L[0], L[1], L[2], M[0], M[1]); break;        // negative z-direction
  }
}

/* LBMstream: apply the streaming step to every PDF except PDF 0.
     vel[n]     bit mask of the directions PDF n moves along; bit d selects the
                direction code d passed to stream()
     nStream[n] number of single-direction hops PDF n performs in total
     mF[n]      offset of PDF n inside F0 / F1
   Each hop ping-pongs between the two buffers: while the number of hops still
   to do is odd the call is stream(d, .., F0, F1), otherwise stream(d, .., F1, F0).
   NOTE(review): presumably stream()'s first pointer is the source and the second
   the destination, so the last hop always writes F1 - confirm against the kernels.
   nLBMmodel is not used here.
*/
void LBMstream(int nLBMmodel, int nPDF, int* vel, int* nStream,
  int* L, int* M, long int* mF, float* F0, float* F1) {
  int hopsLeft[maxF];
  for (int pdf = 1; pdf < nPDF; ++pdf)
    hopsLeft[pdf] = nStream[pdf];

  for (int dir = 0; dir < 6; ++dir) {
    const int dirBit = 1 << dir;
    for (int pdf = 1; pdf < nPDF; ++pdf) {
      // Skip PDFs that have no velocity component along this direction
      if ((vel[pdf] & dirBit) == 0) continue;

      float* inF0 = F0 + mF[pdf];
      float* inF1 = F1 + mF[pdf];
      const bool oddHopsLeft = (hopsLeft[pdf] % 2 == 1);
      stream(dir, L, M, oddHopsLeft ? inF0 : inF1, oddHopsLeft ? inF1 : inF0);
      --hopsLeft[pdf];
    }
  }
}

/* LBMcollide: launch the collision kernel matching nLBMmodel on the PDFs in F1.
   Launch layout: one CUDA block per x-line (L[0] threads), grid of L[1] x L[2] blocks.
   Only the D3Q19 kernel receives Fneq; unknown models launch nothing.
   vel and w are accepted but not used here.
*/
void LBMcollide(int nLBMmodel, int nPDF, int Fsize, int* vel, float* w, float tau,
  int* L, int* M, float* F1, float* Fneq) {
  const dim3 lineBlock(L[0], 1, 1);
  const dim3 yzGrid(L[1], L[2]);

  if (nLBMmodel == D3Q19) {
    CollideD3Q19<<<yzGrid, lineBlock>>>(nPDF, Fsize, tau, M[0], M[1], F1, Fneq);
  }
  else if (nLBMmodel == D3Q19p0) {
    p0CollideD3Q19<<<yzGrid, lineBlock>>>(nPDF, Fsize, tau, M[0], M[1], F1);
  }
  else if (nLBMmodel == D3Q19T) {
    TCollideD3Q19<<<yzGrid, lineBlock>>>(nPDF, Fsize, tau, M[0], M[1], F1);
  }
}

/* LBMcontV: launch the kernel that derives the continuum variables Q1 from the
   PDFs in F1 for the given LBM model.
   Launch layout: one CUDA block per x-line (L[0] threads), grid of L[1] x L[2] blocks.
   Only the D3Q19 kernel receives tmp and Fneq; unknown models launch nothing.
*/
void LBMcontV(int nLBMmodel, int nPDF, int nCont, int Fsize, int Qsize,
  float tmp, int* L, int* M, float* F1, float* Q1, float* Fneq) {
  const dim3 lineBlock(L[0], 1, 1);
  const dim3 yzGrid(L[1], L[2]);

  if (nLBMmodel == D3Q19) {
    ContVD3Q19<<<yzGrid, lineBlock>>>(nPDF, nCont, Fsize, Qsize, tmp, M[0], M[1], F1, Q1, Fneq);
  }
  else if (nLBMmodel == D3Q19p0) {
    ContVD3Q19p0<<<yzGrid, lineBlock>>>(nPDF, nCont, Fsize, Qsize, M[0], M[1], F1, Q1);
  }
  else if (nLBMmodel == D3Q19T) {
    ContTD3Q19<<<yzGrid, lineBlock>>>(nPDF, nCont, Fsize, Qsize, M[0], M[1], F1, Q1);
  }
}

// Utility routines:

/* FindNormals: Given the array g with boundary codes:
     <0           = inactive node (this routine itself writes -1)
      0           = active (interior) node
      100*c, c>=1 = flow condition node of category c whose normal is not yet defined
   modify g in place such that
     - any inactive node linked by a lattice direction n to an interior node becomes a wall
       node: g = indxD3Q19[n] for the first such direction found;
     - any flow condition node (exact multiple of 100) linked to an interior node gets
       g = 100*c + indxD3Q19[n] for the first such direction found;
     - flow condition nodes for which no interior neighbor is found are set inactive (-1).
   All other nonzero codes are left untouched.
   NOTE(review): indxD3Q19[n] is presumably the 6-bit zzyyxx direction code defined by the
   OOO..MPP macros at the top of this file - confirm against the table's definition.

   Parameters:
     p          run parameter block: p[0] = LBM model, p[1:3] = lattice dimensions L,
                p[4:6] = lattice memory space M, p[30] = debug trace flag,
                p[31] = flag to print every normal that is defined
     g          boundary codes, addressed as g[i + M[0]*(j + M[1]*k)]
     Mx,My,Mz   not used by this routine
   Returns 0. Terminates the process with exit(-2) when a flow condition node gets a normal
   along which the second neighbor falls outside the lattice.
   Only nLBMmodel == D3Q19 is handled; for any other model g is left unchanged.
*/
extern "C" int FindNormals(float* p, int* g, int Mx, int My, int Mz) {
  int TRUE = 1, INTERIOR = 0, INACTIVE = -1; // FALSE=0
  int i, j[3], idx1[3], idx2[3], n, d, nLBMmodel;
  int NeighborInLattice1, NeighborInLattice2, nloc1, loc;
  int L[3], M[3], fWallSitePrint, fWallSiteDebug, nWallSites, nFlowSites, BCcategory;
  int FoundInteriorNeighbor, InactiveSite, BCFlowSiteNoNormal;
  int DefineWallSiteNormal, DefineFlowSiteNormal;
  nLBMmodel = (int)p[0];                    // LBM model
  for (i = 0; i < 3; i++) L[i] = (int)p[i + 1];   // p[1:3] = lattice dimensions
  for (i = 0; i < 3; i++) M[i] = (int)p[i + 4];   // p[4:6] = lattice memory space
  fWallSitePrint = (int)p[31]; fWallSiteDebug = (int)p[30];
  nWallSites = 0; nFlowSites = 0;
  if (fWallSiteDebug)
    printf("FindNormals: nLBMmodel=%d, L[0]=%4d, L[1]=%4d, L[2]=%4d, M[0]=%4d, M[1]=%4d, M[2]=%4d\n",
      nLBMmodel, L[0], L[1], L[2], M[0], M[1], M[2]);
  // Visit every lattice site; only sites that are not interior can need a normal
  for (j[0] = 0; j[0] < L[0]; j[0]++) {
    for (j[1] = 0; j[1] < L[1]; j[1]++) {
      for (j[2] = 0; j[2] < L[2]; j[2]++) {
        loc = j[0] + M[0] * (j[1] + M[1] * j[2]);
        if (g[loc] != INTERIOR) {
          // See if the inward-pointing normal direction must be defined at this site
          // Is this site inactive? (but perhaps a wall site for which a normal needs to be defined)
          InactiveSite = g[loc] < 0;
          // Is this site on a flow condition boundary for which the normal has not yet been defined?
          BCFlowSiteNoNormal = g[loc] % 100 == 0;
          // Flow condition category (1 is reported as inflow below, anything else as outflow)
          BCcategory = g[loc] / 100;
          if (fWallSiteDebug)  printf("Checking site i,j,k=%d %d %d bc=%4d\n", j[0], j[1], j[2], g[loc]);
          switch (nLBMmodel) {
          case (D3Q19):
            // Direction 0 is skipped; the first direction that qualifies wins
            for (n = 1; n < nDirD3Q19; n++) {
              // Find indices of neighboring lattice sites along each direction
              // (first and second neighbor; axis d reads component 2-d of dirD3Q19[n])
              for (d = 0; d < 3; d++) {
                idx1[d] = j[d] + dirD3Q19[n][2 - d];
                idx2[d] = j[d] + 2 * dirD3Q19[n][2 - d];
              }
              NeighborInLattice1 = TRUE; NeighborInLattice2 = TRUE;
              for (d = 0; d < 3; d++) {
                NeighborInLattice1 = NeighborInLattice1 && (idx1[d] >= 0) && (idx1[d] < L[d]);
                NeighborInLattice2 = NeighborInLattice2 && (idx2[d] >= 0) && (idx2[d] < L[d]);
              }
              if (fWallSiteDebug)
                printf("Along direction nr. %d, idx1 = %4d %4d %d, NeighborInLattice1 = %d\n",
                  n, idx1[0], idx1[1], idx1[2], NeighborInLattice1);
              if (NeighborInLattice1) {
                // idx1[:] lies inside the lattice (it is not necessarily an interior site)
                nloc1 = idx1[0] + M[0] * (idx1[1] + M[1] * idx1[2]);
                // Is this neighbor lattice site in the interior?
                FoundInteriorNeighbor = g[nloc1] == INTERIOR;
                // Is this a wall site, i.e., an inactive site with an interior neighbor?
                // If so, an interior pointing normal direction must be defined
                DefineWallSiteNormal = InactiveSite && FoundInteriorNeighbor;
                // Does this flow BC site have an interior neighbor?
                // If so, an interior pointing normal direction must be defined
                DefineFlowSiteNormal = BCFlowSiteNoNormal && FoundInteriorNeighbor;
                if (fWallSiteDebug)
                  printf("FoundInteriorNeighbor=%d, DefineWallSiteNormal=%d, DefineFlowSiteNormal=%d\n",
                    FoundInteriorNeighbor, DefineWallSiteNormal, DefineFlowSiteNormal);
                if (DefineWallSiteNormal) {
                  // Overwrite the inactive code with the direction code of the normal
                  g[loc] = indxD3Q19[n]; nWallSites++;
                  if (fWallSitePrint) {
                    printf("Found wall node at i=%4d, j=%4d, k=%4d, n=%2d normal=%2d G=%4d\n", j[0], j[1], j[2], n, nrmlD3Q19[n], g[loc]);
                    printf("Neighbor at i=%4d j=%4d k=%4d with G=%4d\n", idx1[0], idx1[1], idx1[2], g[nloc1]);
                  }
                  // Leaves the direction loop: only one normal is stored per site
                  break;
                }
                if (DefineFlowSiteNormal) {
                  // Store lattice direction corresponding to interior normal
                  g[loc] = BCcategory * 100 + indxD3Q19[n]; nFlowSites++;
                  // The second neighbor along the normal must also lie inside the lattice
                  if (!NeighborInLattice2) {
                    printf("Error: grid does not contain two nodes interior to flow boundary condition\n");
                    exit(-2);
                  }
                  if (fWallSitePrint) {
                    if (BCcategory == 1)
                      printf("Have defined inflow normal at i=%4d, j=%4d, k=%4d, n=%2d normal=%2d G=%4d\n", j[0], j[1], j[2], n, nrmlD3Q19[n], g[loc]);
                    else
                      printf("Have defined outflow normal at i=%4d, j=%4d, k=%4d, n=%2d normal=%2d G=%4d\n", j[0], j[1], j[2], n, nrmlD3Q19[n], g[loc]);
                  }
                  // Leaves the direction loop: only one normal is stored per site
                  break;
                }
              }
            }
            // Flow BC sites not identified as connected to an interior site are set inactive
            if ((g[loc] >= 100) && (g[loc] % 100 == 0)) g[loc] = INACTIVE;
            break;
          }
        }
      }
    }
  }
  printf("   FindNormals has defined normals at %6d wall sites, %6d flow sites\n", nWallSites, nFlowSites);
  return 0;
}

/* LBMbc: convert the reference quantities in the run parameter block p and the
          boundary condition sets in qBC from SI units to LBM computational units.
   Reads : p[0] (LBM model), p[20] (dx), p[22] (reference pressure),
           p[23] (speed of sound), p[24] (viscosity), p[10] (number of boundary
           condition sets), p[11] (number of Q components per set)
   Writes: p[21] = reference time step, p[9] = BGK relaxation time,
           qBC[nQ*i]     divided by the reference pressure,
           qBC[nQ*i + n] divided by the reference velocity for n = 1..nQ-1
   Always returns 0.
*/
extern "C" int LBMbc(float* p, float* qBC) {
  // Quantities carrying the 'Ref' suffix are expressed in SI units
  const int model = (int)p[0];
  const float dxRef = p[20];
  const float pRef = p[22];
  const float cSoundRef = p[23];
  const float viscRef = p[24];

  // Derived reference scales
  const float uRef = cSoundRef / csLBM[model];   // reference velocity
  const float dtRef = dxRef / uRef;              // reference time
  p[21] = dtRef;

  // Non-dimensional BGK relaxation time
  const float tau = 0.5 * (1. + 6. * viscRef * dtRef / dxRef / dxRef);
  p[9] = tau;

  // Rescale each boundary condition set: first entry by pressure, the rest by velocity
  const int nBound = (int)p[10];
  const int nQ = (int)p[11];
  for (int set = 0; set < nBound; ++set) {
    float* entry = qBC + nQ * set;
    entry[0] = entry[0] / pRef;
    for (int comp = 1; comp < nQ; ++comp)
      entry[comp] = entry[comp] / uRef;
  }
  return 0;
}

// Main entry point to LBM time advancement
/* Known LBM models:
Code  Name       nPDF  nCont
0     (reserved)
1     D3Q19      19    4
*/

/* microLBMcheck: terminate with a diagnostic when a CUDA runtime call, or a
   kernel launch reported through cudaGetLastError(), did not succeed.
   what names the operation for the error message.
*/
static void microLBMcheck(cudaError_t status, const char* what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "microLBM: CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(1);
  }
}

/* microLBM: advance the LBM solution on the GPU and return the continuum field.
   Parameters:
     p        run parameter block:
                p[0]    LBM model            p[1:3]  lattice dimensions L
                p[4:6]  lattice memory M     p[7]    number of iterations
                p[8]    flag: initialize PDFs from q (otherwise from f)
                p[9]    relaxation time tau  p[10]   nr. of boundary condition sets
                p[11]   nr. of Q components  p[12]   flag: copy PDFs back to f, fn
                p[13]   flag: compute relative error of the first q component
                p[14]   iteration stride of the relative error report
     q        host continuum field, nCont values per lattice site (in/out)
     g        host geometry flags, one int per lattice site
     qBC      host boundary condition sets, nQ * nBound floats
     f, fn    host PDFs / non-equilibrium PDFs, nPDF floats per lattice site
     Mx,My,Mz not used by this routine
   Returns 0. Terminates the process with exit(1) if the estimated device memory
   exceeds DeviceMB, if host memory cannot be allocated, or if a CUDA call fails.

   Convergence: the relative error is only evaluated when p[13] is set, every
   p[14] iterations (every iteration if p[14] <= 0), and the loop stops early
   once it drops below 1e-8. Without p[13] all p[7] iterations are performed.
*/
extern "C" int microLBM(float* p, float* q, int* g, float* qBC, float* f, float* fn, int Mx, int My, int Mz)
{
  int nt, nLatticeSites;
  static int Fsize, Qsize;
  long long nBytesGeom, nBytesPDF, nBytesCont, nBytes, nMB, nBytesQBC, nBytesFinact;
  int nPDF, nCont;
  int i, nLBMmodel, nQ, nBound, nProgress, stride;
  int L[3], M[3];
  static long int offF[maxF];
  static int LBMvel[maxF], LBMstr[maxF];
  static float LBMw[maxF], finact[maxF];
  int InitializePDFs, CopyPDFsToHost, ComputeRelErr;
  float* q0 = NULL;
  float tau, tmp, err, nrm, relerr, val, diff, tol = 1.0e-08;
  static float* F0, * F1, * Fneq, * Q1, * QBC, * Finact;
  static int* G;
  int MegaByte = 1024 * 1024;
  //----------------------------------------------------------------------------
  /* Load run parameters */
  nLBMmodel = (int)p[0];                    // LBM model
  for (i = 0; i < 3; i++)
    L[i] = (int)p[i + 1];                   // p[1:3] = lattice dimensions
  for (i = 0; i < 3; i++)
    M[i] = (int)p[i + 4];                   // p[4:6] = lattice memory space
  nt = (int)p[7];                           // number of iterations
  InitializePDFs = (int)p[8];               // flag to initialize PDFs from q flow field
  tau = p[9];                               // Collide operator relaxation time
  nBound = (int)p[10];                      // Nr. of boundary condition sets
  nQ = (int)p[11];                          // Nr. of Q components (e.g., 4 for Navier-Stokes)
  CopyPDFsToHost = (int)p[12];              // flag to return PDFs
  ComputeRelErr = (int)p[13];               // flag to compute rel err in 1st q component
  nProgress = (int)p[14];                   // Iteration stride to show update in first q component
  stride = (nProgress > 0) ? nProgress : 1; // guards the modulo below against a zero stride
  tmp = 0.333333333 * (-1 + 0.5f/tau);      // Coefficient of viscous stress tensor
  //----------------------------------------------------------------------------
  nPDF = nPDFmodels[nLBMmodel];
  nCont = nContmodels[nLBMmodel];
  /* Allocate device memory */
  nLatticeSites = 1;
  for (i = 0; i < 3; i++)
    nLatticeSites *= M[i];
  // Byte counts are formed in 64-bit arithmetic so that sites * nPDF cannot wrap
  nBytesGeom = (long long)nLatticeSites * (long long)sizeof(int);
  nBytesPDF = (long long)nLatticeSites * nPDF * (long long)sizeof(float);
  nBytesCont = (long long)nLatticeSites * nCont * (long long)sizeof(float);
  nBytesQBC = (long long)nQ * nBound * (long long)sizeof(float);
  nBytesFinact = (long long)maxF * (long long)sizeof(float);
  // Total space (three PDF arrays: F0, F1, Fneq)
  nBytes = nBytesGeom + 3 * nBytesPDF + nBytesCont + nBytesQBC + nBytesFinact;
  nMB = nBytes / 1024 / 1024;
  printf("Allocating %lld MB in CUDA\n", nMB);
  if (nMB > DeviceMB)
  {
    printf("Not enough memory on GPU device\n");
    printf("M=(%d,%d,%d) \n", M[0], M[1], M[2]);
    printf("nLatticeSites=%d \n", nLatticeSites);
    printf("nBytesGeom=%d MB\n", (int)(nBytesGeom / MegaByte));
    printf("nBytesPDFs=%d MB\n", (int)(3 * nBytesPDF / MegaByte));
    printf("nBytesCont=%d MB\n", (int)(nBytesCont / MegaByte));
    exit(1);
  }
  // Allocate space in Host memory: previous state of the first q component.
  // It is filled inside the loop right before each error evaluation.
  if (ComputeRelErr)
  {
    q0 = (float*)malloc((size_t)nLatticeSites * sizeof(float));
    if (q0 == NULL)
    {
      fprintf(stderr, "microLBM: unable to allocate host memory for the relative error check\n");
      exit(1);
    }
  }
  // Allocate space on GPU device memory:
  microLBMcheck(cudaMalloc((void**)&G, (size_t)nBytesGeom), "allocation of geometry flags");
  microLBMcheck(cudaMalloc((void**)&F0, (size_t)nBytesPDF), "allocation of old PDFs");
  microLBMcheck(cudaMalloc((void**)&F1, (size_t)nBytesPDF), "allocation of new PDFs");
  microLBMcheck(cudaMalloc((void**)&Fneq, (size_t)nBytesPDF), "allocation of non-equilibrium PDFs");
  microLBMcheck(cudaMalloc((void**)&Q1, (size_t)nBytesCont), "allocation of continuum variables");
  microLBMcheck(cudaMalloc((void**)&QBC, (size_t)nBytesQBC), "allocation of boundary conditions");
  microLBMcheck(cudaMalloc((void**)&Finact, (size_t)nBytesFinact), "allocation of inactive node values");
  // Size of F,Q arrays
  Fsize = nLatticeSites;
  Qsize = nLatticeSites;
  // Offsets to each PDF
  for (i = 0; i < nPDF; i++)
    offF[i] = (long int)i * nLatticeSites;
  // Load geometry data to device memory
  microLBMcheck(cudaMemcpy(G, g, (size_t)nBytesGeom, cudaMemcpyHostToDevice), "upload of geometry flags");
  // Load boundary conditions to device memory
  microLBMcheck(cudaMemcpy(QBC, qBC, (size_t)nBytesQBC, cudaMemcpyHostToDevice), "upload of boundary conditions");
  // Load continuum field values to device memory (always needed in order to transmit boundary conditions)
  microLBMcheck(cudaMemcpy(Q1, q, (size_t)nBytesCont, cudaMemcpyHostToDevice), "upload of continuum field");
  // Load lattice model
  LBMinit(nLBMmodel, nPDF, nQ, LBMvel, LBMstr, LBMw, finact);
  microLBMcheck(cudaMemcpy(Finact, finact, (size_t)nBytesFinact, cudaMemcpyHostToDevice), "upload of inactive node values");
  // Set LBM PDFs
  if (InitializePDFs)
  {
    // Set initial PDFs from continuum field variables q (on Host), Q1 (on GPU)
    LBMinitF(nLBMmodel, nPDF, nCont, L, M, Fsize, Qsize, G, Q1, F1, QBC);
    LBMinitF(nLBMmodel, nPDF, nCont, L, M, Fsize, Qsize, G, Q1, F0, QBC);
    microLBMcheck(cudaGetLastError(), "PDF initialization kernel launch");
  }
  else
  {
    // Transfer PDF values from Host to Device memory (continue from previous computation state)
    microLBMcheck(cudaMemcpy(F1, f, (size_t)nBytesPDF, cudaMemcpyHostToDevice), "upload of PDFs to F1");
    microLBMcheck(cudaMemcpy(F0, f, (size_t)nBytesPDF, cudaMemcpyHostToDevice), "upload of PDFs to F0");
  }
  microLBMcheck(cudaMemcpy(Fneq, fn, (size_t)nBytesPDF, cudaMemcpyHostToDevice), "upload of non-equilibrium PDFs");
  //============================ Main iterative loop ======================================
  relerr = 0;
  for (int n = 0; n < nt; n++)
  {
    LBMsetBC(nLBMmodel, nPDF, Fsize, nQ, Qsize, L, M, offF, F0, F1, Finact, G, Q1); // BC(F1)    -> F0,F1
    LBMstream(nLBMmodel, nPDF, LBMvel, LBMstr, L, M, offF, F0, F1);              // stream(F0)-> F1
    LBMsetBC(nLBMmodel, nPDF, Fsize, nQ, Qsize, L, M, offF, F0, F1, Finact, G, Q1); // BC(F1)    -> F0,F1
    LBMcollide(nLBMmodel, nPDF, Fsize, LBMvel, LBMw, tau, L, M, F1, Fneq);         // relax(F1) -> F1
    // Launch failures (e.g. an invalid block size) are not reported by the launch itself
    microLBMcheck(cudaGetLastError(), "time step kernel launch");
    if (ComputeRelErr && !(n % stride))
    {// Check for progress indicator display
      for (i = 0; i < nLatticeSites; i++)
        q0[i] = q[(size_t)nCont * i];       // Save previous state, first q component
      LBMcontV(nLBMmodel, nPDF, nCont, Fsize, Qsize, tmp, L, M, F1, Q1, Fneq);    // Find current continuum field values
      microLBMcheck(cudaMemcpy(q, Q1, (size_t)nBytesCont, cudaMemcpyDeviceToHost),
        "download of continuum field");     // Copy continuum field values to host
      nrm = 0.; err = 0.;
      for (i = 0; i < nLatticeSites; i++)
      {// Compute relative error in first q component
        // Magnitudes are formed explicitly so that no integer abs() overload can be picked
        val = q[(size_t)nCont * i];
        diff = val - q0[i];
        nrm = nrm + (val < 0 ? -val : val);
        err = err + (diff < 0 ? -diff : diff);
      }
      relerr = err / nrm;
      printf("     Iteration %4.4d relative error %9.2e\n", n, relerr);
      // Convergence is only tested on a freshly computed error; testing the initial
      // relerr = 0 would end the run after one iteration when ComputeRelErr is off
      if (relerr < tol) break;
    }
  }
  //======================================================================================
  // Save latest computation set
  LBMsetBC(nLBMmodel, nPDF, Fsize, nQ, Qsize, L, M, offF, F0, F1, Finact, G, Q1); // BC(F1)    -> F0,F1
  LBMcontV(nLBMmodel, nPDF, nCont, Fsize, Qsize, tmp, L, M, F1, Q1, Fneq);
  microLBMcheck(cudaGetLastError(), "final kernel launch");
  microLBMcheck(cudaMemcpy(q, Q1, (size_t)nBytesCont, cudaMemcpyDeviceToHost),
    "download of continuum field");         // Copy continuum field values
  if (CopyPDFsToHost)
  {
    microLBMcheck(cudaMemcpy(f, F1, (size_t)nBytesPDF, cudaMemcpyDeviceToHost),
      "download of PDFs");                  // Copy PDF values
    microLBMcheck(cudaMemcpy(fn, Fneq, (size_t)nBytesPDF, cudaMemcpyDeviceToHost),
      "download of non-equilibrium PDFs");  // Copy non-equilibrium PDF values
  }
  // Release every device buffer allocated above, including Finact
  cudaFree(G);
  cudaFree(F0);
  cudaFree(F1);
  cudaFree(Fneq);
  cudaFree(Q1);
  cudaFree(QBC);
  cudaFree(Finact);
  free(q0);   // q0 is NULL unless ComputeRelErr was set
  return 0;
}
