Skip to content

File Halo.cu

File List > src > Halo.cu

Go to the documentation of this file

#include "Halo.h"


template <class T> void fillHaloD(Param XParam, int ib, BlockP<T> XBlock, T* z)
{


    fillLeft(XParam, ib, XBlock, z);
    fillRight(XParam, ib, XBlock, z);
    fillTop(XParam, ib, XBlock, z);
    fillBot(XParam, ib, XBlock, z);
    //fill bot
    //fill top


}
template void fillHaloD<double>(Param XParam, int ib, BlockP<double> XBlock, double* z);
template void fillHaloD<float>(Param XParam, int ib, BlockP<float> XBlock, float* z);

template <class T> void fillHaloC(Param XParam, BlockP<T> XBlock, T* z)
{
    int ib;
    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        ib = XBlock.active[ibl];
        fillHaloD(XParam, ib, XBlock, z);
    }
}
template void fillHaloC<float>(Param XParam, BlockP<float> XBlock, float* z);
template void fillHaloC<double>(Param XParam, BlockP<double> XBlock, double* z);

template <class T> void RecalculateZs(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev, T* zb)
{
    int ib, n;
    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        ib = XBlock.active[ibl];
        /*
        //We only need to recalculate zs on the halo side 
        for (int n = -1; n <= (XParam.blkwidth); n++)
        {
            left = memloc(XParam.halowidth, XParam.blkmemwidth, -1, n, ib);
            right = memloc(XParam.halowidth, XParam.blkmemwidth, XParam.blkwidth, n, ib);
            top = memloc(XParam.halowidth, XParam.blkmemwidth, n, XParam.blkwidth, ib);
            bot = memloc(XParam.halowidth, XParam.blkmemwidth, n, -1, ib);

            Xev.zs[left] = zb[left] + Xev.h[left];
            Xev.zs[right] = zb[right] + Xev.h[right];
            Xev.zs[top] = zb[top] + Xev.h[top];
            Xev.zs[bot] = zb[bot] + Xev.h[bot];

            //printf("n=%d; zsold=%f; zsnew=%f (zb=%f + h=%f)\n",n, Xev.zs[n], zb[n] + Xev.h[n], zb[n] , Xev.h[n]);
        }
        */

        // Recalculate zs everywhere maybe we only need to do that on the halo ?
        for (int j = -1; j < (XParam.blkwidth+1); j++)
        {
            for (int i = -1; i < (XParam.blkwidth+1); i++)
            {
                n = memloc(XParam.halowidth,XParam.blkmemwidth, i, j, ib);
                Xev.zs[n] = zb[n] + Xev.h[n];
            }
        }

    }
}
template void RecalculateZs<float>(Param XParam, BlockP<float> XBlock, EvolvingP<float> Xev, float* zb);
template void RecalculateZs<double>(Param XParam, BlockP<double> XBlock, EvolvingP<double> Xev, double* zb);

template <class T> void Recalculatehh(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev, T* zb)
{
    int ib, n;
    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        ib = XBlock.active[ibl];
        /*
        //We only need to recalculate zs on the halo side
        for (int n = -1; n <= (XParam.blkwidth); n++)
        {
            left = memloc(XParam.halowidth, XParam.blkmemwidth, -1, n, ib);
            right = memloc(XParam.halowidth, XParam.blkmemwidth, XParam.blkwidth, n, ib);
            top = memloc(XParam.halowidth, XParam.blkmemwidth, n, XParam.blkwidth, ib);
            bot = memloc(XParam.halowidth, XParam.blkmemwidth, n, -1, ib);

            Xev.zs[left] = zb[left] + Xev.h[left];
            Xev.zs[right] = zb[right] + Xev.h[right];
            Xev.zs[top] = zb[top] + Xev.h[top];
            Xev.zs[bot] = zb[bot] + Xev.h[bot];

            //printf("n=%d; zsold=%f; zsnew=%f (zb=%f + h=%f)\n",n, Xev.zs[n], zb[n] + Xev.h[n], zb[n] , Xev.h[n]);
        }
        */

        // Recalculate zs everywhere maybe we only need to do that on the halo ?
        for (int j = -1; j < (XParam.blkwidth + 1); j++)
        {
            for (int i = -1; i < (XParam.blkwidth + 1); i++)
            {
                n = memloc(XParam.halowidth, XParam.blkmemwidth, i, j, ib);

                Xev.h[n] = max(Xev.zs[n]- zb[n],(T)0.0) ;
            }
        }

    }
}
template void Recalculatehh<float>(Param XParam, BlockP<float> XBlock, EvolvingP<float> Xev, float* zb);
template void Recalculatehh<double>(Param XParam, BlockP<double> XBlock, EvolvingP<double> Xev, double* zb);


template <class T> __global__ void RecalculateZsGPU(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev, T* zb)
{
    unsigned int blkmemwidth = XParam.blkmemwidth;

    int ix = threadIdx.x -1;
    int iy = threadIdx.y -1;
    unsigned int ibl = blockIdx.x;
    unsigned int ib = XBlock.active[ibl];

    int  n;

    //ib = XBlock.active[ibl];

    n = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
    Xev.zs[n] = zb[n] + Xev.h[n];
    /*
    if(zb[n] < XParam.eps)
    {
        printf("ix=%d, iy=%d, ib=%d, n=%d; zsold=%f; zsnew=%f (zb=%f + h=%f)\n",ix,iy,ib, n, Xev.zs[n], zb[n] + Xev.h[n], zb[n], Xev.h[n]);
    }
    */

}
template __global__ void RecalculateZsGPU<float>(Param XParam, BlockP<float> XBlock, EvolvingP<float> Xev, float* zb);
template __global__ void RecalculateZsGPU<double>(Param XParam, BlockP<double> XBlock, EvolvingP<double> Xev, double* zb);

template <class T> void fillHaloF(Param XParam, bool doProlongation, BlockP<T> XBlock, T* z)
{
    int ib;
    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        ib = XBlock.active[ibl];
        fillLeftFlux(XParam, doProlongation, ib, XBlock, z);
        fillBotFlux(XParam, doProlongation, ib, XBlock, z);
        fillRightFlux(XParam, doProlongation, ib, XBlock, z);
        fillTopFlux(XParam, doProlongation, ib, XBlock, z);

    }
}
template void fillHaloF<float>(Param XParam, bool doProlongation, BlockP<float> XBlock, float* z);
template void fillHaloF<double>(Param XParam, bool doProlongation, BlockP<double> XBlock, double* z);

template <class T> void fillHaloGPU(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
{

    dim3 blockDimHaloLR(1, XParam.blkwidth, 1);
    dim3 blockDimHaloBT(XParam.blkwidth, 1, 1);
    dim3 gridDim(XParam.nblk, 1, 1);

    fillLeft <<<gridDim, blockDimHaloLR, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z);
    //fillLeft <<<gridDim, blockDimHaloLR, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    fillRight <<<gridDim, blockDimHaloLR, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
    //fillRight <<<gridDim, blockDimHaloLR, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    fillBot <<<gridDim, blockDimHaloBT, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
    //fillBot <<<gridDim, blockDimHaloBT, 0>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    fillTop <<<gridDim, blockDimHaloBT, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
    //fillTop <<<gridDim, blockDimHaloBT, 0>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    //CUDA_CHECK(cudaStreamSynchronize(stream));

}
template void fillHaloGPU<double>(Param XParam, BlockP<double> XBlock, cudaStream_t stream, double* z);
template void fillHaloGPU<float>(Param XParam, BlockP<float> XBlock, cudaStream_t stream, float* z);

template <class T> void fillHaloGPU(Param XParam, BlockP<T> XBlock,  T* z)
{

    dim3 blockDimHaloLR(1, XParam.blkwidth, 1);
    dim3 blockDimHaloBT(XParam.blkwidth, 1, 1);
    dim3 gridDim(XParam.nblk, 1, 1);

    fillLeft << <gridDim, blockDimHaloLR, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z);
    //fillLeft << <gridDim, blockDimHaloLR, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    fillRight << <gridDim, blockDimHaloLR, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
    //fillRight << <gridDim, blockDimHaloLR, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    fillBot << <gridDim, blockDimHaloBT, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
    //fillBot << <gridDim, blockDimHaloBT, 0>> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    fillTop << <gridDim, blockDimHaloBT, 0 >> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
    //fillTop << <gridDim, blockDimHaloBT, 0>> > (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    //CUDA_CHECK(cudaStreamSynchronize(stream));

}
template void fillHaloGPU<double>(Param XParam, BlockP<double> XBlock,double* z);
template void fillHaloGPU<float>(Param XParam, BlockP<float> XBlock, float* z);


template <class T> void fillHaloGPUnew(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
{
    dim3 blockDimHaloLR(1, XParam.blkwidth, 1);
    dim3 blockDimHaloBT(XParam.blkwidth, 1, 1);
    dim3 gridDim(XParam.nblk, 1, 1);

    dim3 blockDimHaloLRx2(2, XParam.blkwidth, 1);
    dim3 blockDimHaloBTx2(XParam.blkwidth, 2, 1);
    dim3 gridDimx2(ceil(XParam.nblk/2), 1, 1);

    //fillLeftnew <<<gridDimx2, blockDimHaloLRx2, 0>>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z);
    fillLeft <<<gridDim, blockDimHaloLR, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    //fillRightnew <<<gridDimx2, blockDimHaloLRx2, 0 >>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
    fillRight <<<gridDim, blockDimHaloLR, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    //fillBotnew <<<gridDimx2, blockDimHaloBTx2, 0>>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
    fillBot <<<gridDim, blockDimHaloBT, 0>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    //fillTopnew <<<gridDimx2, blockDimHaloBTx2, 0 >>> (XParam.halowidth, XParam.nblk, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
    fillTop <<<gridDim, blockDimHaloBT, 0>>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
    CUDA_CHECK(cudaDeviceSynchronize());
    //CUDA_CHECK(cudaStreamSynchronize(stream));

}
template void fillHaloGPUnew<double>(Param XParam, BlockP<double> XBlock, cudaStream_t stream, double* z);
template void fillHaloGPUnew<float>(Param XParam, BlockP<float> XBlock, cudaStream_t stream, float* z);


template <class T> void fillHaloTopRightC(Param XParam, BlockP<T> XBlock, T* z)
{
    // for flux term and actually most terms, only top and right neighbours are needed!

    //fillLeft(XParam, ib, XBlock, z);
    int ib;
    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        ib = XBlock.active[ibl];
        HaloFluxCPULR(XParam, ib, XBlock, z);
        HaloFluxCPUBT(XParam, ib, XBlock, z);

        //fillRightFlux(XParam,true, ib, XBlock, z);
        //fillTopFlux(XParam,true, ib, XBlock, z);

    }



}
template void fillHaloTopRightC<double>(Param XParam, BlockP<double> XBlock, double* z);
template void fillHaloTopRightC<float>(Param XParam, BlockP<float> XBlock, float* z);

template <class T> void fillHaloLRFluxC(Param XParam, BlockP<T> XBlock, T* z)
{
    // for flux term and actually most terms, only top and right neighbours are needed!

    //fillLeft(XParam, ib, XBlock, z);
    int ib;
    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        ib = XBlock.active[ibl];
        HaloFluxCPULR(XParam, ib, XBlock, z);
        //HaloFluxCPUBT(XParam, ib, XBlock, z);

        //fillRightFlux(XParam,true, ib, XBlock, z);
        //fillTopFlux(XParam,true, ib, XBlock, z);

    }



}
template void fillHaloLRFluxC<double>(Param XParam, BlockP<double> XBlock, double* z);
template void fillHaloLRFluxC<float>(Param XParam, BlockP<float> XBlock, float* z);

template <class T> void fillHaloBTFluxC(Param XParam, BlockP<T> XBlock, T* z)
{
    // for flux term and actually most terms, only top and right neighbours are needed!

    //fillLeft(XParam, ib, XBlock, z);
    int ib;
    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        ib = XBlock.active[ibl];
        //HaloFluxCPULR(XParam, ib, XBlock, z);
        HaloFluxCPUBT(XParam, ib, XBlock, z);

        //fillRightFlux(XParam,true, ib, XBlock, z);
        //fillTopFlux(XParam,true, ib, XBlock, z);

    }



}
template void fillHaloBTFluxC<double>(Param XParam, BlockP<double> XBlock, double* z);
template void fillHaloBTFluxC<float>(Param XParam, BlockP<float> XBlock, float* z);


template <class T> void fillHaloTopRightGPU(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
{

    dim3 blockDimHaloLR(1, 16, 1);
    dim3 blockDimHaloBT(16, 1, 1);
    dim3 gridDim(XParam.nblk, 1, 1);


    //fillLeft <<<gridDim, blockDimHaloLR, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a);
    //fillRightFlux <<<gridDim, blockDimHaloLR, 0, stream >>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
    HaloFluxGPULR <<<gridDim, blockDimHaloLR, 0, stream >>> (XParam, XBlock, z);

    //fillBot <<<gridDim, blockDimHaloBT, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a);
    //fillTopFlux <<<gridDim, blockDimHaloBT, 0, stream >>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
    HaloFluxGPUBT <<<gridDim, blockDimHaloBT, 0, stream >>> (XParam, XBlock, z);
    CUDA_CHECK(cudaStreamSynchronize(stream));

}
template void fillHaloTopRightGPU<double>(Param XParam, BlockP<double> XBlock, cudaStream_t stream, double* z);
template void fillHaloTopRightGPU<float>(Param XParam, BlockP<float> XBlock, cudaStream_t stream, float* z);


template <class T> void fillHaloLeftRightGPU(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
{

    dim3 blockDimHaloLR(1, 16, 1);
    //dim3 blockDimHaloBT(16, 1, 1);
    dim3 gridDim(XParam.nblk, 1, 1);


    //fillLeft <<<gridDim, blockDimHaloLR, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a);
    //fillRightFlux <<<gridDim, blockDimHaloLR, 0, stream >>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
    HaloFluxGPULR <<<gridDim, blockDimHaloLR, 0, stream >>> (XParam, XBlock, z);

    //fillBot <<<gridDim, blockDimHaloBT, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a);
    //fillTopFlux <<<gridDim, blockDimHaloBT, 0, stream >>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
    //HaloFluxGPUBT <<<gridDim, blockDimHaloBT, 0, stream >>> (XParam, XBlock, z);
    //CUDA_CHECK(cudaStreamSynchronize(stream));

}
template void fillHaloLeftRightGPU<double>(Param XParam, BlockP<double> XBlock, cudaStream_t stream, double* z);
template void fillHaloLeftRightGPU<float>(Param XParam, BlockP<float> XBlock, cudaStream_t stream, float* z);


template <class T> void fillHaloLeftRightGPUnew(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
{

    dim3 blockDimHaloLR(2, 16, 1);
    //dim3 blockDimHaloBT(16, 1, 1);
    dim3 gridDim(ceil(XParam.nblk/2), 1, 1);

    HaloFluxGPULRnew <<<gridDim, blockDimHaloLR, 0, stream >>> (XParam, XBlock, z);



}
template void fillHaloLeftRightGPUnew<double>(Param XParam, BlockP<double> XBlock, cudaStream_t stream, double* z);
template void fillHaloLeftRightGPUnew<float>(Param XParam, BlockP<float> XBlock, cudaStream_t stream, float* z);

template <class T> void fillHaloBotTopGPU(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
{

    //dim3 blockDimHaloLR(1, 16, 1);
    dim3 blockDimHaloBT(16, 1, 1);
    dim3 gridDim(XParam.nblk, 1, 1);


    //fillLeft <<<gridDim, blockDimHaloLR, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.LeftBot, XBlock.LeftTop, XBlock.RightBot, XBlock.BotRight, XBlock.TopRight, a);
    //fillRightFlux <<<gridDim, blockDimHaloLR, 0, stream >>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.RightBot, XBlock.RightTop, XBlock.LeftBot, XBlock.BotLeft, XBlock.TopLeft, z);
    //HaloFluxGPULR <<<gridDim, blockDimHaloLR, 0, stream >>> (XParam, XBlock, z);

    //fillBot <<<gridDim, blockDimHaloBT, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, XBlock.BotLeft, XBlock.BotRight, XBlock.TopLeft, XBlock.LeftTop, XBlock.RightTop, a);
    //fillTopFlux <<<gridDim, blockDimHaloBT, 0, stream >>> (XParam.halowidth,false, XBlock.active, XBlock.level, XBlock.TopLeft, XBlock.TopRight, XBlock.BotLeft, XBlock.LeftBot, XBlock.RightBot, z);
    HaloFluxGPUBT <<<gridDim, blockDimHaloBT, 0, stream >>> (XParam, XBlock, z);
    //CUDA_CHECK(cudaStreamSynchronize(stream));

}
template void fillHaloBotTopGPU<double>(Param XParam, BlockP<double> XBlock, cudaStream_t stream, double* z);
template void fillHaloBotTopGPU<float>(Param XParam, BlockP<float> XBlock, cudaStream_t stream, float* z);

template <class T> void fillHaloBotTopGPUnew(Param XParam, BlockP<T> XBlock, cudaStream_t stream, T* z)
{

    //dim3 blockDimHaloLR(1, 16, 1);
    dim3 blockDimHaloBT(16, 2, 1);
    dim3 gridDim(ceil(XParam.nblk/2), 1, 1);


    HaloFluxGPUBTnew <<<gridDim, blockDimHaloBT, 0, stream >>> (XParam, XBlock, z);


}
template void fillHaloBotTopGPUnew<double>(Param XParam, BlockP<double> XBlock, cudaStream_t stream, double* z);
template void fillHaloBotTopGPUnew<float>(Param XParam, BlockP<float> XBlock, cudaStream_t stream, float* z);


template <class T> void fillHalo(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev, T*zb)
{


        std::thread t0(fillHaloC<T>,XParam, XBlock, Xev.h);
        std::thread t1(fillHaloC<T>,XParam, XBlock, Xev.zs);
        //std::thread t2(fillHaloF<T>,XParam,true, XBlock, Xev.u);
        //std::thread t3(fillHaloF<T>,XParam,true, XBlock, Xev.v);

        std::thread t2(fillHaloC<T>, XParam, XBlock, Xev.u);
        std::thread t3(fillHaloC<T>, XParam, XBlock, Xev.v);

        t0.join();
        t1.join();
        t2.join();
        t3.join();

        if (XParam.conserveElevation)
        {
            conserveElevation(XParam, XBlock, Xev, zb);
        }
        else if (XParam.wetdryfix)
        {
            WetDryRestriction(XParam, XBlock, Xev, zb);
        }

        RecalculateZs(XParam, XBlock, Xev, zb);

        maskbnd(XParam, XBlock, Xev, zb);

}
template void fillHalo<float>(Param XParam, BlockP<float> XBlock, EvolvingP<float> Xev, float *zb);
template void fillHalo<double>(Param XParam, BlockP<double> XBlock, EvolvingP<double> Xev,double * zb);

template <class T> void fillHalo(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev)
{

    std::thread t0(fillHaloC<T>, XParam, XBlock, Xev.h);
    std::thread t1(fillHaloC<T>, XParam, XBlock, Xev.zs);
    std::thread t2(fillHaloF<T>, XParam, true, XBlock, Xev.u);
    std::thread t3(fillHaloF<T>, XParam, true, XBlock, Xev.v);

    t0.join();
    t1.join();
    t2.join();
    t3.join();


    //maskbnd(XParam, XBlock, Xev, zb);

}
template void fillHalo<float>(Param XParam, BlockP<float> XBlock, EvolvingP<float> Xev);
template void fillHalo<double>(Param XParam, BlockP<double> XBlock, EvolvingP<double> Xev);

template <class T> void fillHaloGPU(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev)
{
    const int num_streams = 4;

    cudaStream_t streams[num_streams];

    for (int i = 0; i < num_streams; i++)
    {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }


    fillHaloGPU(XParam, XBlock, streams[0], Xev.h);
    fillHaloGPU(XParam, XBlock, streams[1], Xev.zs);
    fillHaloGPU(XParam, XBlock, streams[2], Xev.u);
    fillHaloGPU(XParam, XBlock, streams[3], Xev.v);

    for (int i = 0; i < num_streams; i++)
    {
        cudaStreamDestroy(streams[i]);
    }
}
template void fillHaloGPU<float>(Param XParam, BlockP<float> XBlock, EvolvingP<float> Xev);
template void fillHaloGPU<double>(Param XParam, BlockP<double> XBlock, EvolvingP<double> Xev);

template <class T> void fillHaloGPU(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev,T * zb)
{
    const int num_streams = 4;
    dim3 blockDimHalo(XParam.blkwidth,1, 1);

    dim3 gridDim(XBlock.mask.nblk, 1, 1);

    dim3 blockDimfull(XParam.blkmemwidth, XParam.blkmemwidth, 1);
    dim3 gridDimfull(XParam.nblk, 1, 1);

    cudaStream_t streams[num_streams];

    for (int i = 0; i < num_streams; i++)
    {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }


    fillHaloGPU(XParam, XBlock, streams[0], Xev.h);
    fillHaloGPU(XParam, XBlock, streams[1], Xev.zs);
    fillHaloGPU(XParam, XBlock, streams[2], Xev.u);
    fillHaloGPU(XParam, XBlock, streams[3], Xev.v);
    CUDA_CHECK(cudaDeviceSynchronize());

    if (XParam.conserveElevation)
    {
        conserveElevationGPU(XParam, XBlock, Xev, zb);
    }
    else if (XParam.wetdryfix)
    {
        WetDryRestrictionGPU(XParam, XBlock, Xev, zb);
    }

    RecalculateZsGPU <<< gridDimfull, blockDimfull, 0 >>> (XParam, XBlock, Xev, zb);
    CUDA_CHECK(cudaDeviceSynchronize());

    //if (XBlock.mask.nblk > 0)
    //{
    //  maskbndGPUleft <<<gridDim, blockDimHalo, 0, streams[0] >>> (XParam, XBlock, Xev, zb);
    //  maskbndGPUtop <<<gridDim, blockDimHalo, 0, streams[1] >>> (XParam, XBlock, Xev, zb);
    //  maskbndGPUright <<<gridDim, blockDimHalo, 0, streams[2] >>> (XParam, XBlock, Xev, zb);
    //  maskbndGPUtop <<<gridDim, blockDimHalo, 0, streams[3] >>> (XParam, XBlock, Xev, zb);

    //  //CUDA_CHECK(cudaDeviceSynchronize());
    //}
    for (int i = 0; i < num_streams; i++)
    {
        cudaStreamDestroy(streams[i]);
    }


}
template void fillHaloGPU<float>(Param XParam, BlockP<float> XBlock, EvolvingP<float> Xev,float *zb);
template void fillHaloGPU<double>(Param XParam, BlockP<double> XBlock, EvolvingP<double> Xev,double* zb);

template <class T> void fillHalo(Param XParam, BlockP<T> XBlock, GradientsP<T> Grad)
{

    /*
    std::thread t0(fillHaloF<T>,XParam, true, XBlock, Grad.dhdx);
    std::thread t1(fillHaloF<T>,XParam, true, XBlock, Grad.dudx);
    std::thread t2(fillHaloF<T>,XParam, true, XBlock, Grad.dvdx);
    std::thread t3(fillHaloF<T>,XParam, true, XBlock, Grad.dzsdx);

    std::thread t4(fillHaloF<T>,XParam, true, XBlock, Grad.dhdy);
    std::thread t5(fillHaloF<T>,XParam, true, XBlock, Grad.dudy);
    std::thread t6(fillHaloF<T>,XParam, true, XBlock, Grad.dvdy);
    std::thread t7(fillHaloF<T>,XParam, true, XBlock, Grad.dzsdy);

    */
    std::thread t0(fillHaloC<T>, XParam, XBlock, Grad.dhdx);
    std::thread t1(fillHaloC<T>, XParam, XBlock, Grad.dudx);
    std::thread t2(fillHaloC<T>, XParam, XBlock, Grad.dvdx);
    std::thread t3(fillHaloC<T>, XParam, XBlock, Grad.dzsdx);

    std::thread t4(fillHaloC<T>, XParam, XBlock, Grad.dhdy);
    std::thread t5(fillHaloC<T>, XParam, XBlock, Grad.dudy);
    std::thread t6(fillHaloC<T>, XParam, XBlock, Grad.dvdy);
    std::thread t7(fillHaloC<T>, XParam, XBlock, Grad.dzsdy);

    t0.join();
    t1.join();
    t2.join();
    t3.join();
    t4.join();
    t5.join();
    t6.join();
    t7.join();


}
template void fillHalo<float>(Param XParam, BlockP<float> XBlock, GradientsP<float> Grad);
template void fillHalo<double>(Param XParam, BlockP<double> XBlock, GradientsP<double> Grad);

template <class T> void fillHaloGPU(Param XParam, BlockP<T> XBlock, GradientsP<T> Grad)
{
    const int num_streams = 8;

    cudaStream_t streams[num_streams];

    for (int i = 0; i < num_streams; i++)
    {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    fillHaloGPU(XParam, XBlock, streams[0], Grad.dhdx);
    fillHaloGPU(XParam, XBlock, streams[2], Grad.dudx);
    fillHaloGPU(XParam, XBlock, streams[3], Grad.dvdx);
    fillHaloGPU(XParam, XBlock, streams[4], Grad.dzsdx);

    fillHaloGPU(XParam, XBlock, streams[5], Grad.dhdy);
    fillHaloGPU(XParam, XBlock, streams[6], Grad.dudy);
    fillHaloGPU(XParam, XBlock, streams[7], Grad.dvdy);
    fillHaloGPU(XParam, XBlock, streams[1], Grad.dzsdy);

    for (int i = 0; i < num_streams; i++)
    {
        cudaStreamDestroy(streams[i]);
    }

}
template void fillHaloGPU<float>(Param XParam, BlockP<float> XBlock, GradientsP<float> Grad);
template void fillHaloGPU<double>(Param XParam, BlockP<double> XBlock, GradientsP<double> Grad);


template <class T> void fillHalo(Param XParam, BlockP<T> XBlock, FluxP<T> Flux)
{


    //std::thread t0(fillHaloTopRightC<T>,XParam, XBlock, Flux.Fhu);
    //std::thread t1(fillHaloTopRightC<T>,XParam, XBlock, Flux.Fhv);
    //std::thread t2(fillHaloTopRightC<T>,XParam, XBlock, Flux.Fqux);
    //std::thread t3(fillHaloTopRightC<T>, XParam, XBlock, Flux.Fquy);

    //std::thread t4(fillHaloTopRightC<T>, XParam, XBlock, Flux.Fqvx);
    //std::thread t5(fillHaloTopRightC<T>, XParam, XBlock, Flux.Fqvy);
    //std::thread t6(fillHaloTopRightC<T>, XParam, XBlock, Flux.Su);
    //std::thread t7(fillHaloTopRightC<T>, XParam, XBlock, Flux.Sv);

    std::thread t0(fillHaloLRFluxC<T>, XParam, XBlock, Flux.Fhu);
    std::thread t1(fillHaloLRFluxC<T>, XParam, XBlock, Flux.Fqux);
    std::thread t2(fillHaloLRFluxC<T>, XParam, XBlock, Flux.Su);

    std::thread t6(fillHaloLRFluxC<T>, XParam, XBlock, Flux.Fqvx);

    std::thread t3(fillHaloBTFluxC<T>, XParam, XBlock, Flux.Fhv);
    std::thread t4(fillHaloBTFluxC<T>, XParam, XBlock, Flux.Fqvy);
    std::thread t5(fillHaloBTFluxC<T>, XParam, XBlock, Flux.Sv);

    std::thread t7(fillHaloBTFluxC<T>, XParam, XBlock, Flux.Fquy);

    t0.join();
    t1.join();
    t2.join();
    t3.join();
    t4.join();
    t5.join();
    t6.join();
    t7.join();

}
template void fillHalo<float>(Param XParam, BlockP<float> XBlock, FluxP<float> Flux);
template void fillHalo<double>(Param XParam, BlockP<double> XBlock, FluxP<double> Flux);

template <class T> void fillHaloGPU(Param XParam, BlockP<T> XBlock, FluxP<T> Flux)
{
    const int num_streams = 8;

    cudaStream_t streams[num_streams];

    for (int i = 0; i < num_streams; i++)
    {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    dim3 blockDimHalo(XParam.blkwidth, 1, 1);

    dim3 gridDim(XBlock.mask.nblk, 1, 1);


    fillHaloLeftRightGPUnew(XParam, XBlock, streams[0], Flux.Fhu);
    fillHaloLeftRightGPUnew(XParam, XBlock, streams[1], Flux.Su);
    fillHaloLeftRightGPUnew(XParam, XBlock, streams[2], Flux.Fqux);
    fillHaloLeftRightGPUnew(XParam, XBlock, streams[3], Flux.Fqvx);



    fillHaloBotTopGPUnew(XParam, XBlock, streams[4], Flux.Fquy);
    fillHaloBotTopGPUnew(XParam, XBlock, streams[5], Flux.Fqvy);
    fillHaloBotTopGPUnew(XParam, XBlock, streams[6], Flux.Fhv);
    fillHaloBotTopGPUnew(XParam, XBlock, streams[7], Flux.Sv);


    for (int i = 0; i < num_streams; i++)
    {
        cudaStreamSynchronize(streams[i]);
    }
    // Below has now moved to its own function
    //if (XBlock.mask.nblk > 0)
    //{
    //  maskbndGPUFluxleft <<<gridDim, blockDimHalo, 0, streams[0] >>> (XParam, XBlock, Flux);
    //  maskbndGPUFluxtop <<<gridDim, blockDimHalo, 0, streams[1] >>> (XParam, XBlock, Flux);
    //  maskbndGPUFluxright <<<gridDim, blockDimHalo, 0, streams[2] >>> (XParam, XBlock, Flux);
    //  maskbndGPUFluxbot <<<gridDim, blockDimHalo, 0, streams[3] >>> (XParam, XBlock, Flux);

    //  //CUDA_CHECK(cudaDeviceSynchronize());
    //}

    for (int i = 0; i < num_streams; i++)
    {
        cudaStreamDestroy(streams[i]);
    }


}
template void fillHaloGPU<float>(Param XParam, BlockP<float> XBlock, FluxP<float> Flux);
template void fillHaloGPU<double>(Param XParam, BlockP<double> XBlock, FluxP<double> Flux);

template <class T> void bndmaskGPU(Param XParam, BlockP<T> XBlock, EvolvingP<T> Xev, FluxP<T> Flux)
{
    const int num_streams = 8;

    cudaStream_t streams[num_streams];

    for (int i = 0; i < num_streams; i++)
    {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    dim3 blockDimHalo(XParam.blkwidth, 1, 1);

    dim3 gridDim(XBlock.mask.nblk, 1, 1);
    if (XBlock.mask.nblk > 0)
    {
        maskbndGPUFluxleft <<<gridDim, blockDimHalo, 0, streams[0] >>> (XParam, XBlock, Xev, Flux);
        maskbndGPUFluxtop <<<gridDim, blockDimHalo, 0, streams[1] >>> (XParam, XBlock,  Flux);
        maskbndGPUFluxright <<<gridDim, blockDimHalo, 0, streams[2] >>> (XParam, XBlock,  Flux);
        maskbndGPUFluxbot <<<gridDim, blockDimHalo, 0, streams[3] >>> (XParam, XBlock, Flux);

        //CUDA_CHECK(cudaDeviceSynchronize());
    }

    for (int i = 0; i < num_streams; i++)
    {
        cudaStreamDestroy(streams[i]);
    }

}
template void bndmaskGPU<float>(Param XParam, BlockP<float> XBlock, EvolvingP<float> Xev, FluxP<float> Flux);
template void bndmaskGPU<double>(Param XParam, BlockP<double> XBlock, EvolvingP<double> Xev, FluxP<double> Flux);

//template <class T> void refine_linearCPU(Param XParam, int ib, bool isLR, bool isoposit, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
//{
//  int Neighblock, Mirrorblock;
//
//  int ir = isoposit ? 0 : XParam.blkwidth - 1;
//  int iw = isoposit ? XParam.blkwidth : -1;
//  if (isLR)
//  {
//      Neighblock = isoposit ? XBlock.RightBot[ib] : XBlock.LeftBot[ib];
//      Mirrorblock = isoposit ? XBlock.LeftBot[Neighblock] : XBlock.RightBot[Neighblock]
//  }
//  else
//  {
//      Neighblock = isoposit ? XBlock.TopLeft[ib] : XBlock.BotLeft[ib];
//      Mirrorblock = isoposit ? XBlock.BotLeft[Neighblock] : XBlock.TopLeft[Neighblock]
//  }
//
//  if (XBlock.level[Neighblock] < XBlock.level[ib])
//  {
//      double ilevdx = calcres(XParam.dx, XBlock.level[ib]) * T(0.25);
//      for (int ix = 0; ix < XParam.blkwidth; ix++)
//      {
//          int jj = Mirrorblock == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + XParam.blkwidth / 2;
//          int il = isLR ? memloc(XParam, ir, jj, Neighblock) : memloc(XParam, jj, ir, Neighblock);
//          int write = isLR ? memloc(XParam, iw, j, ib) : memloc(XParam, j, iw, ib);
//          T faclr = T(-1.0);
//          T facbt = floor(j * (T)0.5) * T(2.0) > j ? 1.0 : -1.0;
//
//          T newz = z[il] + (faclr * dzdx[il] + facbt * dzdy[il]) * ilevdx;
//
//          z[write] = newz;
//
//
//      }
//  }
//}
//template void refine_linearCPU<float>(Param XParam, int ib, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
//template void refine_linearCPU<double>(Param XParam, int ib, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);

template <class T> void refine_linear_Left(Param XParam, int ib, BlockP<T> XBlock, T* z, T * dzdx, T * dzdy)
{
    if (XBlock.level[XBlock.LeftBot[ib]] < XBlock.level[ib])
    {

        double ilevdx = calcres(XParam.delta, XBlock.level[ib])*T(0.5);

        for (int j = 0; j < XParam.blkwidth; j++)
        {
            int jj = XBlock.RightBot[XBlock.LeftBot[ib]] == ib ? ftoi(floor(j * (T)0.5)) : ftoi(floor(j * (T)0.5) + XParam.blkwidth / 2);
            int il = memloc(XParam, XParam.blkwidth - 1, jj , XBlock.LeftBot[ib]);
            int write = memloc(XParam, -1, j, ib);
            T faclr = T(1.0);
            T facbt = floor(j * (T)0.5) * T(2.0) < (j-T(0.01)) ? 1.0 : -1.0;

            T newz = z[il] + (faclr*dzdx[il]+facbt*dzdy[il]) * ilevdx;

            z[write] = newz;


        }
    }
}
template void refine_linear_Left<float>(Param XParam, int ib, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template void refine_linear_Left<double>(Param XParam, int ib, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);


template <class T> __global__ void refine_linear_LeftGPU(Param XParam, BlockP<T> XBlock, T* z, T* dzdx,T*dzdy)
{
    int blkmemwidth = blockDim.y + XParam.halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    //unsigned int ix = 0;
    int iy = threadIdx.y;
    int ibl = blockIdx.x;
    int ib = XBlock.active[ibl];




    if (XBlock.level[XBlock.LeftBot[ib]] < XBlock.level[ib])
    {
        int j = iy;

        double ilevdx = calcres(XParam.delta, XBlock.level[ib]) * T(0.5);


        int jj = XBlock.RightBot[XBlock.LeftBot[ib]] == ib ? floor(j * (T)0.5) : floor(j * (T)0.5) + XParam.blkwidth / 2;
        int il = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, jj, XBlock.LeftBot[ib]);
        int write = memloc(XParam.halowidth, blkmemwidth, -1, j, ib);
        T faclr = T(1.0);
        T facbt = floor(j * (T)0.5) * T(2.0) < (j - T(0.01)) ? 1.0 : -1.0;

        T newz = z[il] + (faclr * dzdx[il] + facbt * dzdy[il]) * ilevdx;

        z[write] = newz;



    }
}
template __global__ void refine_linear_LeftGPU<float>(Param XParam, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template __global__ void refine_linear_LeftGPU<double>(Param XParam, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);


template <class T> void refine_linear_Right(Param XParam, int ib, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
{
    if (XBlock.level[XBlock.RightBot[ib]] < XBlock.level[ib])
    {

        T ilevdx = calcres(T(XParam.delta), XBlock.level[ib] ) * T(0.5);

        for (int j = 0; j < XParam.blkwidth; j++)
        {
            int jj = XBlock.LeftBot[XBlock.RightBot[ib]] == ib ? ftoi(floor(j * (T)0.5)) : ftoi(floor(j * (T)0.5) + XParam.blkwidth / 2);
            int il = memloc(XParam, 0, jj , XBlock.RightBot[ib]);
            int write = memloc(XParam, XParam.blkwidth, j, ib);
            T faclr = T(-1.0);
            T facbt = floor(j * (T)0.5) * T(2.0) < (j - T(0.01)) ? 1.0 : -1.0;

            T newz = z[il] + (faclr * dzdx[il] + facbt * dzdy[il]) * ilevdx;

            z[write] = newz;


        }
    }
}
template void refine_linear_Right<float>(Param XParam, int ib, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template void refine_linear_Right<double>(Param XParam, int ib, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);

template <class T> __global__ void refine_linear_RightGPU(Param XParam, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
{
    unsigned int blkmemwidth = blockDim.y + XParam.halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;

    int iy = threadIdx.y;
    int ibl = blockIdx.x;
    int ib = XBlock.active[ibl];



    if (XBlock.level[XBlock.RightBot[ib]] < XBlock.level[ib])
    {

        double ilevdx = calcres(XParam.delta, XBlock.level[ib]) * T(0.5);

        int j = iy;
        int jj = XBlock.LeftBot[XBlock.RightBot[ib]] == ib ? floor(j * (T)0.5) : floor(j * (T)0.5) + XParam.blkwidth / 2;
        int il = memloc(XParam.halowidth, blkmemwidth, 0, jj, XBlock.RightBot[ib]);
        int write = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, j, ib);

        T faclr = T(-1.0);
        T facbt = floor(j * (T)0.5) * T(2.0) < (j - T(0.01)) ? 1.0 : -1.0;

        T newz = z[il] + (faclr * dzdx[il] + facbt * dzdy[il]) * ilevdx;

        z[write] = newz;



    }
}
template __global__ void refine_linear_RightGPU<float>(Param XParam, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template __global__ void refine_linear_RightGPU<double>(Param XParam, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);

template <class T> void refine_linear_Bot(Param XParam, int ib, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
{
    if (XBlock.level[XBlock.BotLeft[ib]] < XBlock.level[ib])
    {

        T ilevdx = calcres(T(XParam.delta), XBlock.level[ib]) * T(0.5);

        for (int i = 0; i < XParam.blkwidth; i++)
        {
            int ii = XBlock.TopLeft[XBlock.BotLeft[ib]] == ib ? ftoi(floor(i * (T)0.5)) : ftoi(floor(i * (T)0.5) + XParam.blkwidth / 2);
            int jl = memloc(XParam,  ii, XParam.blkwidth - 1, XBlock.BotLeft[ib]);
            int write = memloc(XParam, i, -1, ib);

            T facbt = T(1.0);
            T faclr = floor(i * (T)0.5) * T(2.0) < (i - T(0.01)) ? 1.0 : -1.0;

            T newz = z[jl] + (faclr * dzdx[jl] + facbt * dzdy[jl]) * ilevdx;

            z[write] = newz;


        }
    }
}
template void refine_linear_Bot<float>(Param XParam, int ib, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template void refine_linear_Bot<double>(Param XParam, int ib, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);

template <class T> __global__ void refine_linear_BotGPU(Param XParam, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
{
    int blkmemwidth = blockDim.x + XParam.halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;

    int ix = threadIdx.x;
    int ibl = blockIdx.x;
    int ib = XBlock.active[ibl];

    if (XBlock.level[XBlock.BotLeft[ib]] < XBlock.level[ib])
    {

        double ilevdx = calcres(XParam.delta, XBlock.level[ib]) * T(0.5);

        int i = ix;
        int ii = XBlock.TopLeft[XBlock.BotLeft[ib]] == ib ? floor(i * (T)0.5) : floor(i * (T)0.5) + XParam.blkwidth / 2;
        int jl = memloc(XParam.halowidth, blkmemwidth, ii, XParam.blkwidth - 1, XBlock.BotLeft[ib]);
        int write = memloc(XParam.halowidth, blkmemwidth, i, -1, ib);

        T facbt = T(1.0);
        T faclr = floor(i * (T)0.5) * T(2.0) < (i - T(0.01)) ? 1.0 : -1.0;

        T newz = z[jl] + (faclr * dzdx[jl] + facbt * dzdy[jl]) * ilevdx;

        z[write] = newz;



    }
}
template __global__ void refine_linear_BotGPU<float>(Param XParam, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template __global__ void refine_linear_BotGPU<double>(Param XParam, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);

template <class T> void refine_linear_Top(Param XParam, int ib, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
{
    if (XBlock.level[XBlock.TopLeft[ib]] < XBlock.level[ib])
    {

        double ilevdx = calcres(XParam.delta, XBlock.level[ib]) * T(0.5);

        for (int i = 0; i < XParam.blkwidth; i++)
        {
            int ii = XBlock.BotLeft[XBlock.TopLeft[ib]] == ib ? ftoi(floor(i * (T)0.5)) : ftoi(floor(i * (T)0.5) + XParam.blkwidth / 2);
            int jl = memloc(XParam, ii , 0, XBlock.TopLeft[ib]);
            int write = memloc(XParam, i, XParam.blkwidth, ib);

            T facbt = T(-1.0);
            T faclr = floor(i * (T)0.5) * T(2.0) < (i - T(0.01)) ? 1.0 : -1.0;

            T newz = z[jl] + (faclr * dzdx[jl] + facbt * dzdy[jl]) * ilevdx;

            z[write] = newz;


        }
    }
}
template void refine_linear_Top<float>(Param XParam, int ib, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template void refine_linear_Top<double>(Param XParam, int ib, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);

template <class T> __global__ void refine_linear_TopGPU(Param XParam, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
{
    int blkmemwidth = blockDim.x + XParam.halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;

    int ix = threadIdx.x;
    int ibl = blockIdx.x;
    int ib = XBlock.active[ibl];
    if (XBlock.level[XBlock.TopLeft[ib]] < XBlock.level[ib])
    {

        double ilevdx = calcres(XParam.delta, XBlock.level[ib]) * T(0.5);

    int i = ix;
        int ii = XBlock.BotLeft[XBlock.TopLeft[ib]] == ib ? floor(i * (T)0.5) : floor(i * (T)0.5) + XParam.blkwidth / 2;
        int jl = memloc(XParam.halowidth, blkmemwidth, ii , 0, XBlock.TopLeft[ib]);
        int write = memloc(XParam.halowidth, blkmemwidth, i, XParam.blkwidth, ib);

        T facbt = T(-1.0);
        T faclr = floor(i * (T)0.5) * T(2.0) < (i - T(0.01)) ? 1.0 : -1.0;

        T newz = z[jl] + (faclr * dzdx[jl] + facbt * dzdy[jl]) * ilevdx;

        z[write] = newz;



    }
}
template __global__ void refine_linear_TopGPU<float>(Param XParam, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template __global__ void refine_linear_TopGPU<double>(Param XParam, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);

template <class T> void refine_linear(Param XParam, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
{
    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        int ib = XBlock.active[ibl];
        refine_linear_Left(XParam, ib, XBlock, z, dzdx, dzdy);
        refine_linear_Right(XParam, ib, XBlock, z, dzdx, dzdy);
        refine_linear_Top(XParam, ib, XBlock, z, dzdx, dzdy);
        refine_linear_Bot(XParam, ib, XBlock, z, dzdx, dzdy);
    }
}
template void refine_linear<float>(Param XParam, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template void refine_linear<double>(Param XParam, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);

template <class T> void refine_linearGPU(Param XParam, BlockP<T> XBlock, T* z, T* dzdx, T* dzdy)
{
    dim3 blockDimHaloLR(1, XParam.blkwidth, 1);
    dim3 blockDimHaloBT(XParam.blkwidth, 1, 1);
    dim3 gridDim(XParam.nblk, 1, 1);

    refine_linear_LeftGPU<<<gridDim, blockDimHaloLR, 0>>>(XParam, XBlock, z, dzdx, dzdy);
    refine_linear_RightGPU <<<gridDim, blockDimHaloLR, 0 >>> (XParam, XBlock, z, dzdx, dzdy);
    refine_linear_TopGPU <<<gridDim, blockDimHaloBT, 0 >>> (XParam, XBlock, z, dzdx, dzdy);
    refine_linear_BotGPU <<<gridDim, blockDimHaloBT, 0 >>> (XParam, XBlock, z, dzdx, dzdy);
    CUDA_CHECK(cudaDeviceSynchronize());

}
template void refine_linearGPU<float>(Param XParam, BlockP<float> XBlock, float* z, float* dzdx, float* dzdy);
template void refine_linearGPU<double>(Param XParam, BlockP<double> XBlock, double* z, double* dzdx, double* dzdy);

template <class T> void HaloFluxCPULR(Param XParam, int ib, BlockP<T> XBlock, T *z)
{
    int jj, i,il,itl;
    if (XBlock.level[XBlock.LeftBot[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {
            //
            i = memloc(XParam, 0, j, ib);


            jj = j*2;
            il = memloc(XParam, XParam.blkwidth, jj, XBlock.LeftBot[ib]);
            itl = memloc(XParam, XParam.blkwidth, jj+1, XBlock.LeftBot[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.LeftTop[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
        {
            //
            i = memloc(XParam, 0, j, ib);


            jj = (j - XParam.blkwidth / 2) * 2;
            il = memloc(XParam, XParam.blkwidth, jj, XBlock.LeftTop[ib]);
            itl = memloc(XParam, XParam.blkwidth, jj + 1, XBlock.LeftTop[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {
            //
            i = memloc(XParam, XParam.blkwidth, j, ib);


            jj = j * 2;
            il = memloc(XParam, 0, jj, XBlock.RightBot[ib]);
            itl = memloc(XParam, 0, jj + 1, XBlock.RightBot[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.RightTop[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
        {

            jj = (j - XParam.blkwidth / 2) * 2;
            //
            i = memloc(XParam, XParam.blkwidth, j, ib);


            il = memloc(XParam, 0, jj, XBlock.RightTop[ib]);
            itl = memloc(XParam, 0, jj + 1, XBlock.RightTop[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
}

template <class T> __global__  void HaloFluxGPULR(Param XParam, BlockP<T> XBlock, T* z)
{
    int jj, i, il, itl;
    int blkmemwidth = blockDim.y + XParam.halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    //unsigned int ix = 0;
    int iy = threadIdx.y;
    int ibl = blockIdx.x;
    int ib = XBlock.active[ibl];


    int j = iy;


    if (XBlock.level[XBlock.LeftBot[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {

        if (j< (XParam.blkwidth / 2))
        {
            //
            i = memloc(XParam.halowidth, blkmemwidth, 0, j, ib);


            jj = j * 2;
            il = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, jj, XBlock.LeftBot[ib]);
            itl = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, jj + 1, XBlock.LeftBot[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.LeftTop[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {

        if (j >= (XParam.blkwidth / 2))
        {
            //
            i = memloc(XParam.halowidth, blkmemwidth, 0, j, ib);


            jj = (j - XParam.blkwidth / 2) * 2;
            il = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, jj, XBlock.LeftTop[ib]);
            itl = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, jj + 1, XBlock.LeftTop[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {

        if (j < (XParam.blkwidth / 2))
        {
            //
            i = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, j, ib);


            jj = j * 2;
            il = memloc(XParam.halowidth, blkmemwidth, 0, jj, XBlock.RightBot[ib]);
            itl = memloc(XParam.halowidth, blkmemwidth, 0, jj + 1, XBlock.RightBot[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.RightTop[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        if (j >= (XParam.blkwidth / 2))
        {

            jj = (j - XParam.blkwidth / 2) * 2;
            //
            i = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, j, ib);


            il = memloc(XParam.halowidth, blkmemwidth, 0, jj, XBlock.RightTop[ib]);
            itl = memloc(XParam.halowidth, blkmemwidth, 0, jj + 1, XBlock.RightTop[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
}

template <class T> __global__  void HaloFluxGPULRnew(Param XParam, BlockP<T> XBlock, T* z)
{
    int jj, i, il, itl;
    int blkmemwidth = blockDim.y + XParam.halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    //unsigned int ix = 0;
    int iy = threadIdx.y;
    int ibl = threadIdx.x + blockIdx.x * blockDim.x;
    if (ibl < XParam.nblk)
    {

        int ib = XBlock.active[ibl];


        int j = iy;


        if (XBlock.level[XBlock.LeftBot[ib]] > XBlock.level[ib])//The lower half is a boundary 
        {

            if (j < (XParam.blkwidth / 2))
            {
                //
                i = memloc(XParam.halowidth, blkmemwidth, 0, j, ib);


                jj = j * 2;
                il = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, jj, XBlock.LeftBot[ib]);
                itl = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, jj + 1, XBlock.LeftBot[ib]);

                z[i] = T(0.5) * (z[il] + z[itl]);





            }
            //
        }
        if (XBlock.level[XBlock.LeftTop[ib]] > XBlock.level[ib])//The lower half is a boundary 
        {

            if (j >= (XParam.blkwidth / 2))
            {
                //
                i = memloc(XParam.halowidth, blkmemwidth, 0, j, ib);


                jj = (j - XParam.blkwidth / 2) * 2;
                il = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, jj, XBlock.LeftTop[ib]);
                itl = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, jj + 1, XBlock.LeftTop[ib]);

                z[i] = T(0.5) * (z[il] + z[itl]);





            }
            //
        }
        if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])//The lower half is a boundary 
        {

            if (j < (XParam.blkwidth / 2))
            {
                //
                i = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, j, ib);


                jj = j * 2;
                il = memloc(XParam.halowidth, blkmemwidth, 0, jj, XBlock.RightBot[ib]);
                itl = memloc(XParam.halowidth, blkmemwidth, 0, jj + 1, XBlock.RightBot[ib]);

                z[i] = T(0.5) * (z[il] + z[itl]);





            }
            //
        }
        if (XBlock.level[XBlock.RightTop[ib]] > XBlock.level[ib])//The lower half is a boundary 
        {
            if (j >= (XParam.blkwidth / 2))
            {

                jj = (j - XParam.blkwidth / 2) * 2;
                //
                i = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth, j, ib);


                il = memloc(XParam.halowidth, blkmemwidth, 0, jj, XBlock.RightTop[ib]);
                itl = memloc(XParam.halowidth, blkmemwidth, 0, jj + 1, XBlock.RightTop[ib]);

                z[i] = T(0.5) * (z[il] + z[itl]);





            }
            //
        }
    }
}

template <class T> void HaloFluxCPUBT(Param XParam, int ib, BlockP<T> XBlock, T* z)
{
    int jj, i, il, itl;
    if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {
            //
            i = memloc(XParam, j, 0, ib);


            jj = j * 2;
            il = memloc(XParam, jj, XParam.blkwidth,  XBlock.BotLeft[ib]);
            itl = memloc(XParam, jj+1, XParam.blkwidth,  XBlock.BotLeft[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.BotRight[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
        {
            //
            i = memloc(XParam, j, 0, ib);


            jj = (j - XParam.blkwidth / 2) * 2;
            il = memloc(XParam,  jj, XParam.blkwidth, XBlock.BotRight[ib]);
            itl = memloc(XParam,  jj + 1, XParam.blkwidth, XBlock.BotRight[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {
            //
            i = memloc(XParam, j, XParam.blkwidth, ib);


            jj = j * 2;
            il = memloc(XParam, jj, 0,  XBlock.TopLeft[ib]);
            itl = memloc(XParam, jj + 1, 0, XBlock.TopLeft[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.TopRight[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
        {

            jj = (j - XParam.blkwidth / 2) * 2;
            //
            i = memloc(XParam, j, XParam.blkwidth, ib);


            il = memloc(XParam, jj, 0, XBlock.TopRight[ib]);
            itl = memloc(XParam, jj + 1, 0, XBlock.TopRight[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
}

template <class T> __global__ void HaloFluxGPUBT(Param XParam, BlockP<T> XBlock, T* z)
{
    int jj, i, il, itl;
    int blkmemwidth = blockDim.x + XParam.halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    int ix = threadIdx.x;
    //unsigned int iy = threadIdx.x;
    int ibl = blockIdx.x;
    int ib = XBlock.active[ibl];

    int j = ix;

    if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        if (j < (XParam.blkwidth / 2))
        {
            //
            i = memloc(XParam.halowidth, blkmemwidth, j, 0, ib);


            jj = j * 2;
            il = memloc(XParam.halowidth, blkmemwidth, jj, XParam.blkwidth, XBlock.BotLeft[ib]);
            itl = memloc(XParam.halowidth, blkmemwidth, jj + 1, XParam.blkwidth, XBlock.BotLeft[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.BotRight[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        if (j >= (XParam.blkwidth / 2))
        {
            //
            i = memloc(XParam.halowidth, blkmemwidth, j, 0, ib);


            jj = (j - XParam.blkwidth / 2) * 2;
            il = memloc(XParam.halowidth, blkmemwidth, jj, XParam.blkwidth, XBlock.BotRight[ib]);
            itl = memloc(XParam.halowidth, blkmemwidth, jj + 1, XParam.blkwidth, XBlock.BotRight[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        if (j < (XParam.blkwidth / 2))
        {
            //
            i = memloc(XParam.halowidth, blkmemwidth, j, XParam.blkwidth, ib);


            jj = j * 2;
            il = memloc(XParam.halowidth, blkmemwidth, jj, 0, XBlock.TopLeft[ib]);
            itl = memloc(XParam.halowidth, blkmemwidth, jj + 1, 0, XBlock.TopLeft[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
    if (XBlock.level[XBlock.TopRight[ib]] > XBlock.level[ib])//The lower half is a boundary 
    {
        if (j >= (XParam.blkwidth / 2))
        {

            jj = (j - XParam.blkwidth / 2) * 2;
            //
            i = memloc(XParam.halowidth, blkmemwidth, j, XParam.blkwidth, ib);


            il = memloc(XParam.halowidth, blkmemwidth, jj, 0, XBlock.TopRight[ib]);
            itl = memloc(XParam.halowidth, blkmemwidth, jj + 1, 0, XBlock.TopRight[ib]);

            z[i] = T(0.5) * (z[il] + z[itl]);





        }
        //
    }
}

template <class T> __global__ void HaloFluxGPUBTnew(Param XParam, BlockP<T> XBlock, T* z)
{
    int jj, i, il, itl;
    int blkmemwidth = blockDim.x + XParam.halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    int ix = threadIdx.x;
    //unsigned int iy = threadIdx.x;
    int ibl = threadIdx.y + blockIdx.x * blockDim.y;
    if (ibl < XParam.nblk)
    {



        int ib = XBlock.active[ibl];

        int j = ix;

        if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])//The lower half is a boundary 
        {
            if (j < (XParam.blkwidth / 2))
            {
                //
                i = memloc(XParam.halowidth, blkmemwidth, j, 0, ib);


                jj = j * 2;
                il = memloc(XParam.halowidth, blkmemwidth, jj, XParam.blkwidth, XBlock.BotLeft[ib]);
                itl = memloc(XParam.halowidth, blkmemwidth, jj + 1, XParam.blkwidth, XBlock.BotLeft[ib]);

                z[i] = T(0.5) * (z[il] + z[itl]);





            }
            //
        }
        if (XBlock.level[XBlock.BotRight[ib]] > XBlock.level[ib])//The lower half is a boundary 
        {
            if (j >= (XParam.blkwidth / 2))
            {
                //
                i = memloc(XParam.halowidth, blkmemwidth, j, 0, ib);


                jj = (j - XParam.blkwidth / 2) * 2;
                il = memloc(XParam.halowidth, blkmemwidth, jj, XParam.blkwidth, XBlock.BotRight[ib]);
                itl = memloc(XParam.halowidth, blkmemwidth, jj + 1, XParam.blkwidth, XBlock.BotRight[ib]);

                z[i] = T(0.5) * (z[il] + z[itl]);





            }
            //
        }
        if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])//The lower half is a boundary 
        {
            if (j < (XParam.blkwidth / 2))
            {
                //
                i = memloc(XParam.halowidth, blkmemwidth, j, XParam.blkwidth, ib);


                jj = j * 2;
                il = memloc(XParam.halowidth, blkmemwidth, jj, 0, XBlock.TopLeft[ib]);
                itl = memloc(XParam.halowidth, blkmemwidth, jj + 1, 0, XBlock.TopLeft[ib]);

                z[i] = T(0.5) * (z[il] + z[itl]);





            }
            //
        }
        if (XBlock.level[XBlock.TopRight[ib]] > XBlock.level[ib])//The lower half is a boundary 
        {
            if (j >= (XParam.blkwidth / 2))
            {

                jj = (j - XParam.blkwidth / 2) * 2;
                //
                i = memloc(XParam.halowidth, blkmemwidth, j, XParam.blkwidth, ib);


                il = memloc(XParam.halowidth, blkmemwidth, jj, 0, XBlock.TopRight[ib]);
                itl = memloc(XParam.halowidth, blkmemwidth, jj + 1, 0, XBlock.TopRight[ib]);

                z[i] = T(0.5) * (z[il] + z[itl]);





            }
            //
        }
    }
}


template <class T> void fillLeft(Param XParam, int ib, BlockP<T> XBlock, T* &z)
{
    int jj,bb;
    int read, write;
    int ii, ir, it, itr;


    if (XBlock.LeftBot[ib] == ib)//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {

            read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            write = memloc(XParam, -1, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            z[write] = z[read];
        }

        if (XBlock.LeftTop[ib] == ib) // boundary on the top half too
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, -1, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
        {

            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                write = memloc(XParam, -1, j, ib);
                jj = (j - XParam.blkwidth / 2) * 2;
                ii = memloc(XParam, (XParam.blkwidth - 1), jj, XBlock.LeftTop[ib]);
                ir = memloc(XParam, (XParam.blkwidth - 2), jj, XBlock.LeftTop[ib]);
                it = memloc(XParam, (XParam.blkwidth - 1), jj + 1, XBlock.LeftTop[ib]);
                itr = memloc(XParam, (XParam.blkwidth - 2), jj + 1, XBlock.LeftTop[ib]);

                z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);

            }
        }
    }
    else if (XBlock.level[ib] == XBlock.level[ XBlock.LeftBot[ib] ]) // LeftTop block does not exist
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            //

            write = memloc(XParam, -1, j, ib);
            read = memloc(XParam, (XParam.blkwidth - 1), j, XBlock.LeftBot[ib]);
            z[write] = z[read];
        }
    }
    else if (XBlock.level[XBlock.LeftBot[ib] ]> XBlock.level[ib])
    {

        for (int j = 0; j < XParam.blkwidth / 2; j++)
        {

            write = memloc(XParam, -1, j, ib);

            jj = j * 2;
            bb = XBlock.LeftBot[ib];

            ii = memloc(XParam, (XParam.blkwidth - 1), jj, bb);
            ir = memloc(XParam, (XParam.blkwidth - 2), jj, bb);
            it = memloc(XParam, (XParam.blkwidth - 1), jj + 1, bb);
            itr = memloc(XParam, (XParam.blkwidth - 2), jj + 1, bb);

            z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
        }
        //now find out aboy lefttop block
        if (XBlock.LeftTop[ib] == ib)
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, -1, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //
                jj = (j - (XParam.blkwidth / 2)) * 2;
                bb = XBlock.LeftTop[ib];

                //read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, -1, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                //z[write] = z[read];
                ii = memloc(XParam, (XParam.blkwidth - 1), jj, bb);
                ir = memloc(XParam, (XParam.blkwidth - 2), jj, bb);
                it = memloc(XParam, (XParam.blkwidth - 1), jj + 1, bb);
                itr = memloc(XParam, (XParam.blkwidth - 2), jj + 1, bb);

                z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
            }
        }

    }
    else if (XBlock.level[XBlock.LeftBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo 
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            write = memloc(XParam, -1, j, ib);

            T w1, w2, w3;


            int jj = XBlock.RightBot[XBlock.LeftBot[ib]] == ib? ftoi(ceil(j * (T)0.5)): ftoi(ceil(j * (T)0.5)+ XParam.blkwidth/2);
            w1 = T(1.0 / 3.0);
            w2 = ceil(j * (T)0.5) * 2 > j ? T(1.0 / 6.0) : T(0.5);
            w3 = ceil(j * (T)0.5) * 2 > j ? T(0.5) : T(1.0 / 6.0);

            ii= memloc(XParam, 0, j, ib);
            ir= memloc(XParam, XParam.blkwidth-1, jj, XBlock.LeftBot[ib]);
            it = memloc(XParam, XParam.blkwidth-1, jj - 1, XBlock.LeftBot[ib]);
            //2 scenarios here ib is the rightbot neighbour of the leftbot block or ib is the righttop neighbour
            if (XBlock.RightBot[XBlock.LeftBot[ib]] == ib)
            {
                if (j == 0)
                {
                    if (XBlock.BotRight[XBlock.LeftBot[ib]] == XBlock.LeftBot[ib]) // no botom of leftbot block
                    {
                        w3 = T(0.5) * (T(1.0) - w1);
                        w2 = w3;
                        it = ir;

                    }
                    else if (XBlock.level[XBlock.BotRight[XBlock.LeftBot[ib]]] < XBlock.level[XBlock.LeftBot[ib]]) // exists but is coarser
                    {
                        w1 = T(4.0 / 10.0);
                        w2 = T(5.0 / 10.0);
                        w3 = T(1.0 / 10.0);
                        it = memloc(XParam, XParam.blkwidth-1, XParam.blkwidth - 1, XBlock.BotRight[XBlock.LeftBot[ib]]);
                    }
                    else if (XBlock.level[XBlock.BotRight[XBlock.LeftBot[ib]]] == XBlock.level[XBlock.LeftBot[ib]]) // exists with same level
                    {
                        it = memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth - 1, XBlock.BotRight[XBlock.LeftBot[ib]]);
                    }
                    else if (XBlock.level[XBlock.BotRight[XBlock.LeftBot[ib]]] > XBlock.level[XBlock.LeftBot[ib]]) // exists with higher level
                    {
                        w1 = T(1.0 / 4.0);
                        w2 = T(1.0 / 2.0);
                        w3 = T(1.0 / 4.0);
                        it = memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth - 1, XBlock.BotRight[XBlock.LeftBot[ib]]);
                    }


                }


            }
            else//righttopleftif == ib
            {
                if (j == (XParam.blkwidth - 1))
                {
                    if (XBlock.TopRight[XBlock.LeftTop[ib]] == XBlock.LeftTop[ib]) // no botom of leftbot block
                    {
                        w3 = T(0.5*(1.0-w1));
                        w2 = w3;
                        ir = it;

                    }
                    else if (XBlock.level[XBlock.TopRight[XBlock.LeftTop[ib]]] < XBlock.level[XBlock.LeftTop[ib]]) // exists but is coarser
                    {
                        w1 = T(4.0 / 10.0);
                        w2 = T(1.0 / 10.0);
                        w3 = T(5.0 / 10.0);
                        ir = memloc(XParam, XParam.blkwidth - 1,0, XBlock.TopRight[XBlock.LeftTop[ib]]);
                    }
                    else if (XBlock.level[XBlock.TopRight[XBlock.LeftTop[ib]]] == XBlock.level[XBlock.LeftTop[ib]]) // exists with same level
                    {
                        ir = memloc(XParam, XParam.blkwidth - 1, 0, XBlock.TopRight[XBlock.LeftTop[ib]]);
                    }
                    else if (XBlock.level[XBlock.TopRight[XBlock.LeftTop[ib]]] > XBlock.level[XBlock.LeftTop[ib]]) // exists with higher level
                    {
                        w1 = T(1.0 / 4.0);
                        w2 = T(1.0 / 2.0);
                        w3 = T(1.0 / 4.0);
                        ir = memloc(XParam, XParam.blkwidth - 1, 0, XBlock.TopRight[XBlock.LeftTop[ib]]);
                    }
                }
                //
            }


            z[write] = w1 * z[ii] + w2 * z[ir] + w3 * z[it];



        }
    }



}


template <class T> __global__ void fillLeft(int halowidth, int* active, int * level, int* leftbot, int * lefttop, int * rightbot, int* botright,int * topright, T * a)
{
    unsigned int blkmemwidth = blockDim.y + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    //unsigned int ix = 0;
    unsigned int iy = threadIdx.y;
    unsigned int ibl = blockIdx.x;
    unsigned int ib = active[ibl];

    int lev = level[ib];
    int LB = leftbot[ib];
    int LT = lefttop[ib];

    int RBLB = rightbot[LB];
    int BRLB = botright[LB];
    int TRLT = topright[LT];

    int levBRLB = level[BRLB];
    int levTRLT = level[TRLT];
    int levLB = level[LB];
    int levLT = level[LT];
    int write = memloc(halowidth, blkmemwidth, -1, iy, ib);
    int read;
    int jj, ii, ir, it, itr;
    T a_read;
    T w1, w2, w3;

    if (LB == ib)
    {
        if (iy < (blockDim.y / 2))
        {
            read = memloc(halowidth, blkmemwidth, 0, iy, ib);
            a_read = a[read];
        }
        else
        {
            if (LT == ib)
            {
                read = memloc(halowidth, blkmemwidth, 0, iy, ib);
                a_read = a[read];

            }
            else
            {

                jj = (iy - (blockDim.y / 2)) * 2;
                ii = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj, LT);
                ir = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj, LT);
                it = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj + 1, LT);
                itr = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj + 1, LT);

                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
        }

    }
    else if (levLB == lev )
    {
        read = memloc(halowidth, blkmemwidth, (blockDim.y - 1), iy, LB);
        a_read = a[read];
    }
    else if (levLB > lev)
    {
        if (iy < (blockDim.y / 2))
        {
            jj = iy * 2;
            ii = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj, LB);
            ir = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj, LB);
            it = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj + 1, LB);
            itr = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj + 1, LB);
            a_read= T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
        }
        else
        {
            if (LT == ib)
            {
                read = memloc(halowidth, blkmemwidth, 0, iy, ib);
                a_read = a[read];
            }
            else
            {
                jj = (iy - (blockDim.y / 2)) * 2;

                ii = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj, LT);
                ir = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj, LT);
                it = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj + 1, LT);
                itr = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj + 1, LT);

                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
        }
    }
    else if (levLB < lev)
    {
        jj = RBLB==ib? ceil(iy * (T)0.5): ceil(iy * (T)0.5) + blockDim.y / 2;
        w1 = (T)1.0 / (T)3.0;
        w2 = ceil(iy * (T)0.5) * 2 > iy ? T(1.0 / 6.0) : T(0.5);
        w3 = ceil(iy * (T)0.5) * 2 > iy ? T(0.5) : T(1.0 / 6.0);

        ii = memloc(halowidth, blkmemwidth, 0, iy, ib);
        ir = memloc(halowidth, blkmemwidth, blockDim.y - 1, jj, LB);
        it = memloc(halowidth, blkmemwidth, blockDim.y - 1, jj - 1, LB);
        if (RBLB == ib)
        {
            if (iy == 0)
            {
                if (BRLB == LB)
                {
                    w3 = (T)0.5 * (1.0 - w1);
                    w2 = w3;
                    it = ir;
                }
                else if (levBRLB < levLB)
                {
                    w1 = T(4.0 / 10.0);
                    w2 = T(5.0 / 10.0);
                    w3 = T(1.0 / 10.0);
                    it = memloc(halowidth, blkmemwidth, blockDim.y - 1, blockDim.y - 1, BRLB);

                }
                else if (levBRLB == levLB)
                {
                    it = memloc(halowidth, blkmemwidth, blockDim.y - 1, blockDim.y - 1, BRLB);
                }
                else if (levBRLB > levLB)
                {
                    w1 = T(1.0 / 4.0);
                    w2 = T(1.0 / 2.0);
                    w3 = T(1.0 / 4.0);
                    it = memloc(halowidth, blkmemwidth, blockDim.y - 1, blockDim.y - 1, BRLB);
                }
            }
        }
        else
        {
            if (iy == (blockDim.y - 1))
            {
                if (TRLT == LT)
                {
                    w3 = 0.5 * (1.0 - w1);
                    w2 = w3;
                    ir = it;
                }
                else if (levTRLT < levLT)
                {
                    w1 = 4.0 / 10.0;
                    w2 = 1.0 / 10.0;
                    w3 = 5.0 / 10.0;
                    ir = memloc(halowidth, blkmemwidth, blockDim.y - 1, 0, TRLT);
                }
                else if (levTRLT == levLT)
                {
                    ir = memloc(halowidth, blkmemwidth, blockDim.y - 1, 0, TRLT);
                }
                else if (levTRLT > levLT)
                {
                    w1 = 1.0 / 4.0;
                    w2 = 1.0 / 2.0;
                    w3 = 1.0 / 4.0;
                    ir = memloc(halowidth, blkmemwidth, blockDim.y - 1, 0, TRLT);

                }
            }
        }
        a_read = w1 * a[ii] + w2 * a[ir] + w3 * a[it];
    }

    a[write] = a_read;
}
template __global__ void fillLeft<float>(int halowidth, int* active, int* level, int* leftbot, int* lefttop, int* rightbot, int* botright, int* topright, float* a);
template __global__ void fillLeft<double>(int halowidth, int* active, int* level, int* leftbot, int* lefttop, int* rightbot, int* botright, int* topright, double* a);

template <class T> __global__ void fillLeftnew(int halowidth, int nblk, int* active, int* level, int* leftbot, int* lefttop, int* rightbot, int* botright, int* topright, T* a)
{
    int blkmemwidth = blockDim.y + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    //unsigned int ix = 0;
    int iy = threadIdx.y;
    //need to take min of ibl or total number of blks in case nblk is not dividable by 2
    int ibl = threadIdx.x + blockIdx.x * blockDim.x;;
    if (ibl < nblk)
    {


        int ib = active[ibl];

        int lev = level[ib];
        int LB = leftbot[ib];
        int LT = lefttop[ib];

        int RBLB = rightbot[LB];
        int BRLB = botright[LB];
        int TRLT = topright[LT];

        int levBRLB = level[BRLB];
        int levTRLT = level[TRLT];
        int levLB = level[LB];
        int levLT = level[LT];
        int write = memloc(halowidth, blkmemwidth, -1, iy, ib);
        int read;
        int jj, ii, ir, it, itr;
        T a_read;
        T w1, w2, w3;

        if (LB == ib)
        {
            if (iy < (blockDim.y / 2))
            {
                read = memloc(halowidth, blkmemwidth, 0, iy, ib);
                a_read = a[read];
            }
            else
            {
                if (LT == ib)
                {
                    read = memloc(halowidth, blkmemwidth, 0, iy, ib);
                    a_read = a[read];

                }
                else
                {

                    jj = (iy - (blockDim.y / 2)) * 2;
                    ii = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj, LT);
                    ir = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj, LT);
                    it = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj + 1, LT);
                    itr = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj + 1, LT);

                    a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
                }
            }

        }
        else if (levLB == lev)
        {
            read = memloc(halowidth, blkmemwidth, (blockDim.y - 1), iy, LB);
            a_read = a[read];
        }
        else if (levLB > lev)
        {
            if (iy < (blockDim.y / 2))
            {
                jj = iy * 2;
                ii = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj, LB);
                ir = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj, LB);
                it = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj + 1, LB);
                itr = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj + 1, LB);
                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
            else
            {
                if (LT == ib)
                {
                    read = memloc(halowidth, blkmemwidth, 0, iy, ib);
                    a_read = a[read];
                }
                else
                {
                    jj = (iy - (blockDim.y / 2)) * 2;

                    ii = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj, LT);
                    ir = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj, LT);
                    it = memloc(halowidth, blkmemwidth, (blockDim.y - 1), jj + 1, LT);
                    itr = memloc(halowidth, blkmemwidth, (blockDim.y - 2), jj + 1, LT);

                    a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
                }
            }
        }
        else if (levLB < lev)
        {
            jj = RBLB == ib ? ceil(iy * (T)0.5) : ceil(iy * (T)0.5) + blockDim.y / 2;
            w1 = (T)1.0 / (T)3.0;
            w2 = ceil(iy * (T)0.5) * 2 > iy ? T(1.0 / 6.0) : T(0.5);
            w3 = ceil(iy * (T)0.5) * 2 > iy ? T(0.5) : T(1.0 / 6.0);

            ii = memloc(halowidth, blkmemwidth, 0, iy, ib);
            ir = memloc(halowidth, blkmemwidth, blockDim.y - 1, jj, LB);
            it = memloc(halowidth, blkmemwidth, blockDim.y - 1, jj - 1, LB);
            if (RBLB == ib)
            {
                if (iy == 0)
                {
                    if (BRLB == LB)
                    {
                        w3 = (T)0.5 * (1.0 - w1);
                        w2 = w3;
                        it = ir;
                    }
                    else if (levBRLB < levLB)
                    {
                        w1 = T(4.0 / 10.0);
                        w2 = T(5.0 / 10.0);
                        w3 = T(1.0 / 10.0);
                        it = memloc(halowidth, blkmemwidth, blockDim.y - 1, blockDim.y - 1, BRLB);

                    }
                    else if (levBRLB == levLB)
                    {
                        it = memloc(halowidth, blkmemwidth, blockDim.y - 1, blockDim.y - 1, BRLB);
                    }
                    else if (levBRLB > levLB)
                    {
                        w1 = T(1.0 / 4.0);
                        w2 = T(1.0 / 2.0);
                        w3 = T(1.0 / 4.0);
                        it = memloc(halowidth, blkmemwidth, blockDim.y - 1, blockDim.y - 1, BRLB);
                    }
                }
            }
            else
            {
                if (iy == (blockDim.y - 1))
                {
                    if (TRLT == LT)
                    {
                        w3 = 0.5 * (1.0 - w1);
                        w2 = w3;
                        ir = it;
                    }
                    else if (levTRLT < levLT)
                    {
                        w1 = 4.0 / 10.0;
                        w2 = 1.0 / 10.0;
                        w3 = 5.0 / 10.0;
                        ir = memloc(halowidth, blkmemwidth, blockDim.y - 1, 0, TRLT);
                    }
                    else if (levTRLT == levLT)
                    {
                        ir = memloc(halowidth, blkmemwidth, blockDim.y - 1, 0, TRLT);
                    }
                    else if (levTRLT > levLT)
                    {
                        w1 = 1.0 / 4.0;
                        w2 = 1.0 / 2.0;
                        w3 = 1.0 / 4.0;
                        ir = memloc(halowidth, blkmemwidth, blockDim.y - 1, 0, TRLT);

                    }
                }
            }
            a_read = w1 * a[ii] + w2 * a[ir] + w3 * a[it];
        }

        a[write] = a_read;
    }
}
template __global__ void fillLeftnew<float>(int halowidth, int nblk, int* active, int* level, int* leftbot, int* lefttop, int* rightbot, int* botright, int* topright, float* a);
template __global__ void fillLeftnew<double>(int halowidth, int nblk, int* active, int* level, int* leftbot, int* lefttop, int* rightbot, int* botright, int* topright, double* a);

template <class T> void fillLeftFlux(Param XParam, bool doProlongation, int ib, BlockP<T> XBlock, T*& z)
{
    int jj, bb;
    int read, write;
    int ii,  it;


    if (XBlock.LeftBot[ib] == ib)//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {

            read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            write = memloc(XParam, -1, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            z[write] = z[read];
        }

        if (XBlock.LeftTop[ib] == ib) // boundary on the top half too
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, -1, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
        {

            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                write = memloc(XParam, -1, j, ib);
                jj = (j - XParam.blkwidth / 2) * 2;
                ii = memloc(XParam, (XParam.blkwidth - 1), jj, XBlock.LeftTop[ib]);
                //ir = memloc(XParam, (XParam.blkwidth - 2), jj, XBlock.LeftTop[ib]);
                it = memloc(XParam, (XParam.blkwidth - 1), jj + 1, XBlock.LeftTop[ib]);
                //itr = memloc(XParam, (XParam.blkwidth - 2), jj + 1, XBlock.LeftTop[ib]);

                z[write] = T(0.5) * (z[ii]  + z[it]);

            }
        }
    }
    else if (XBlock.level[ib] == XBlock.level[XBlock.LeftBot[ib]]) // LeftTop block does not exist
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            //

            write = memloc(XParam, -1, j, ib);
            read = memloc(XParam, (XParam.blkwidth - 1), j, XBlock.LeftBot[ib]);
            z[write] = z[read];
        }
    }
    else if (XBlock.level[XBlock.LeftBot[ib]] > XBlock.level[ib])
    {

        for (int j = 0; j < XParam.blkwidth / 2; j++)
        {

            write = memloc(XParam, -1, j, ib);

            jj = j * 2;
            bb = XBlock.LeftBot[ib];

            ii = memloc(XParam, (XParam.blkwidth - 1), jj, bb);
            //ir = memloc(XParam, (XParam.blkwidth - 2), jj, bb);
            it = memloc(XParam, (XParam.blkwidth - 1), jj + 1, bb);
            //itr = memloc(XParam, (XParam.blkwidth - 2), jj + 1, bb);

            z[write] = T(0.5) * (z[ii] + z[it]);
        }
        //now find out aboy lefttop block
        if (XBlock.LeftTop[ib] == ib)
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, -1, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //
                jj = (j - 8) * 2;
                bb = XBlock.LeftTop[ib];

                //read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, -1, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                //z[write] = z[read];
                ii = memloc(XParam, (XParam.blkwidth - 1), jj, bb);
                //ir = memloc(XParam, (XParam.blkwidth - 2), jj, bb);
                it = memloc(XParam, (XParam.blkwidth - 1), jj + 1, bb);
                //itr = memloc(XParam, (XParam.blkwidth - 2), jj + 1, bb);

                z[write] = T(0.5) * (z[ii] + z[it]);
            }
        }

    }
    else if (XBlock.level[XBlock.LeftBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo 
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            write = memloc(XParam, -1, j, ib);

            //T w1, w2, w3;


            int jj = XBlock.RightBot[XBlock.LeftBot[ib]] == ib ? ftoi(ceil(j * (T)0.5)) : ftoi(ceil(j * (T)0.5) + XParam.blkwidth / 2);


            ii = memloc(XParam, XParam.blkwidth - 1, jj, XBlock.LeftBot[ib]);
            if (doProlongation)
                z[write] = z[ii];



        }
    }



}


template <class T> void fillRight(Param XParam, int ib, BlockP<T> XBlock, T*& z)
{
    int jj, bb;
    int read, write;
    int ii, ir, it, itr;


    if (XBlock.RightBot[ib] == ib)//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {

            read = memloc(XParam, XParam.blkwidth-1, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            write = memloc(XParam, XParam.blkwidth, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            z[write] = z[read];
        }

        if (XBlock.RightTop[ib] == ib) // boundary on the top half too
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, XParam.blkwidth - 1, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, XParam.blkwidth, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];


            }
        }
        else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
        {

            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                write = memloc(XParam, XParam.blkwidth, j, ib);
                jj = (j - (XParam.blkwidth / 2)) * 2;
                ii = memloc(XParam, 0, jj, XBlock.RightTop[ib]);
                ir = memloc(XParam, 1, jj, XBlock.RightTop[ib]);
                it = memloc(XParam, 0, jj + 1, XBlock.RightTop[ib]);
                itr = memloc(XParam, 1, jj + 1, XBlock.RightTop[ib]);

                z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);

            }
        }
    }
    else if (XBlock.level[ib] == XBlock.level[XBlock.RightBot[ib]]) // LeftTop block does not exist
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            //

            write = memloc(XParam, XParam.blkwidth, j, ib);
            read = memloc(XParam, 0, j, XBlock.RightBot[ib]);
            z[write] = z[read];
        }
    }
    else if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])
    {

        for (int j = 0; j < XParam.blkwidth / 2; j++)
        {

            write = memloc(XParam, XParam.blkwidth, j, ib);

            jj = j * 2;
            bb = XBlock.RightBot[ib];

            ii = memloc(XParam, 0, jj, bb);
            ir = memloc(XParam, 1, jj, bb);
            it = memloc(XParam, 0, jj + 1, bb);
            itr = memloc(XParam, 1, jj + 1, bb);

            z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
        }
        //now find out aboy lefttop block
        if (XBlock.RightTop[ib] == ib)
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, XParam.blkwidth-1, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, XParam.blkwidth, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //
                jj = (j - (XParam.blkwidth / 2)) * 2;
                bb = XBlock.RightTop[ib];

                //read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, XParam.blkwidth, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                //z[write] = z[read];
                ii = memloc(XParam, 0, jj, bb);
                ir = memloc(XParam, 1, jj, bb);
                it = memloc(XParam, 0, jj + 1, bb);
                itr = memloc(XParam, 1, jj + 1, bb);

                z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
            }
        }

    }
    else if (XBlock.level[XBlock.RightBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo 
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            write = memloc(XParam, XParam.blkwidth, j, ib);

            T w1, w2, w3;


            int jj = XBlock.LeftBot[XBlock.RightBot[ib]] == ib ? ftoi(ceil(j * (T)0.5)) : ftoi(ceil(j * (T)0.5) + XParam.blkwidth / 2);
            w1 = T(1.0 / 3.0);
            w2 = ceil(j * (T)0.5) * 2 > j ? T(1.0 / 6.0) : T(0.5);
            w3 = ceil(j * (T)0.5) * 2 > j ? T(0.5) : T(1.0 / 6.0);

            ii = memloc(XParam, XParam.blkwidth-1, j, ib);
            ir = memloc(XParam, 0, jj, XBlock.RightBot[ib]);
            it = memloc(XParam, 0, jj - 1, XBlock.RightBot[ib]);
            //2 scenarios here ib is the leftbot neighbour of the rightbot block or ib is the lefttop neighbour
            if (XBlock.LeftBot[XBlock.RightBot[ib]] == ib)
            {
                if (j == 0)
                {
                    if (XBlock.BotLeft[XBlock.RightBot[ib]] == XBlock.RightBot[ib]) // no botom of leftbot block
                    {
                        w3 = T(0.5 * (1.0 - w1));
                        w2 = w3;
                        it = ir;

                    }
                    else if (XBlock.level[XBlock.BotLeft[XBlock.RightBot[ib]]] < XBlock.level[XBlock.RightBot[ib]]) // exists but is coarser
                    {
                        w1 = T(4.0 / 10.0);
                        w2 = T(5.0 / 10.0);
                        w3 = T(1.0 / 10.0);
                        it = memloc(XParam, 0, XParam.blkwidth - 1, XBlock.BotLeft[XBlock.RightBot[ib]]);
                    }
                    else if (XBlock.level[XBlock.BotLeft[XBlock.RightBot[ib]]] == XBlock.level[XBlock.RightBot[ib]]) // exists with same level
                    {
                        it = memloc(XParam, 0, XParam.blkwidth - 1, XBlock.BotLeft[XBlock.RightBot[ib]]);
                    }
                    else if (XBlock.level[XBlock.BotLeft[XBlock.RightBot[ib]]] > XBlock.level[XBlock.RightBot[ib]]) // exists with higher level
                    {
                        w1 = T(1.0 / 4.0);
                        w2 = T(1.0 / 2.0);
                        w3 = T(1.0 / 4.0);
                        it = memloc(XParam, 0, XParam.blkwidth - 1, XBlock.BotLeft[XBlock.RightBot[ib]]);
                    }


                }


            }
            else//
            {
                if (j == (XParam.blkwidth - 1))
                {
                    if (XBlock.TopLeft[XBlock.RightTop[ib]] == XBlock.RightTop[ib]) // no botom of leftbot block
                    {
                        w3 = T(0.5 * (1.0 - w1));
                        w2 = w3;
                        ir = it;

                    }
                    else if (XBlock.level[XBlock.TopLeft[XBlock.RightTop[ib]]] < XBlock.level[XBlock.RightTop[ib]]) // exists but is coarser
                    {
                        w1 = T(4.0 / 10.0);
                        w2 = T(1.0 / 10.0);
                        w3 = T(5.0 / 10.0);
                        ir = memloc(XParam, 0, 0, XBlock.TopLeft[XBlock.RightTop[ib]]);
                    }
                    else if (XBlock.level[XBlock.TopLeft[XBlock.RightTop[ib]]] == XBlock.level[XBlock.RightTop[ib]]) // exists with same level
                    {
                        ir = memloc(XParam, 0, 0, XBlock.TopLeft[XBlock.RightTop[ib]]);
                    }
                    else if (XBlock.level[XBlock.TopLeft[XBlock.RightTop[ib]]] > XBlock.level[XBlock.RightTop[ib]]) // exists with higher level
                    {
                        w1 = 1.0 / 4.0;
                        w2 = 1.0 / 2.0;
                        w3 = 1.0 / 4.0;
                        ir = memloc(XParam, 0, 0, XBlock.TopLeft[XBlock.RightTop[ib]]);
                    }
                }
                //
            }


            z[write] = w1 * z[ii] + w2 * z[ir] + w3 * z[it];
        }
    }



}


template <class T> __global__ void fillRight(int halowidth, int* active, int* level, int * rightbot,int* righttop,int * leftbot,int*botleft,int* topleft, T* a)
{
    unsigned int blkmemwidth = blockDim.y + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    //unsigned int ix = blockDim.y - 1;
    unsigned int iy = threadIdx.y;
    unsigned int ibl = blockIdx.x;
    unsigned int ib = active[ibl];

    int RB = rightbot[ib];
    int RT = righttop[ib];
    //int LB = leftbot[ib];
    //int BL = botleft[ib];
    int LBRB = leftbot[RB];
    int TLRT = topleft[RT];
    int BLRB = botleft[RB];


    int lev = level[ib];
    int levRB = level[RB];
    int levRT = level[RT];
    int levBLRB = level[BLRB];
    int levTLRT = level[TLRT];

    int write = memloc(halowidth, blkmemwidth, blockDim.y, iy, ib);
    int read;
    int jj, ii, ir, it, itr;
    T a_read;
    T w1, w2, w3;


    if (RB == ib)
    {
        if (iy < (blockDim.y / 2))
        {
            read = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
            a_read = a[read];
        }
        else
        {
            if (RT == ib)
            {
                read = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
                a_read = a[read];
            }
            else
            {
                jj = (iy - (blockDim.y / 2)) * 2;
                ii = memloc(halowidth, blkmemwidth, 0, jj, RT);
                ir = memloc(halowidth, blkmemwidth, 1, jj, RT);
                it = memloc(halowidth, blkmemwidth, 0, jj + 1, RT);
                itr = memloc(halowidth, blkmemwidth, 1, jj + 1, RT);

                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
        }
    }
    else if (levRB == lev)
    {
        read = memloc(halowidth, blkmemwidth, 0, iy, RB);
        a_read = a[read];
    }
    else if (levRB > lev)
    {
        if (iy < (blockDim.y / 2))
        {
            jj = iy * 2;


            ii = memloc(halowidth, blkmemwidth, 0, jj, RB);
            ir = memloc(halowidth, blkmemwidth, 1, jj, RB);
            it = memloc(halowidth, blkmemwidth, 0, jj + 1, RB);
            itr = memloc(halowidth, blkmemwidth, 1, jj + 1, RB);

            a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
        }
        else
        {
            if (RT == ib)
            {
                read = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
                a_read = a[read];
            }
            else
            {
                jj = (iy - (blockDim.y / 2)) * 2;

                ii = memloc(halowidth, blkmemwidth, 0, jj, RT);
                ir = memloc(halowidth, blkmemwidth, 1, jj, RT);
                it = memloc(halowidth, blkmemwidth, 0, jj + 1, RT);
                itr = memloc(halowidth, blkmemwidth, 1, jj + 1, RT);

                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
        }
    }
    else if (levRB < lev)
    {
        //
        jj = LBRB == ib ? ceil(iy * (T)0.5) : ceil(iy * (T)0.5) + blockDim.y / 2;
        w1 = 1.0 / 3.0;
        w2 = ceil(iy * (T)0.5) * 2 > iy ? T(1.0 / 6.0) : T(0.5);
        w3 = ceil(iy * (T)0.5) * 2 > iy ? T(0.5) : T(1.0 / 6.0);
        ii = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
        ir = memloc(halowidth, blkmemwidth, 0, jj, RB);
        it = memloc(halowidth, blkmemwidth, 0, jj - 1, RB);
        if (LBRB == ib)
        {
            if (iy == 0)
            {
                if (BLRB == RB)
                {
                    w3 = 0.5 * (1.0 - w1);
                    w2 = w3;
                    it = ir;
                }
                else if (levBLRB < levRB)
                {
                    w1 = 4.0 / 10.0;
                    w2 = 5.0 / 10.0;
                    w3 = 1.0 / 10.0;
                    it = memloc(halowidth, blkmemwidth, 0, blockDim.y - 1, BLRB);
                }
                else if (levBLRB == levRB)
                {
                    it = memloc(halowidth, blkmemwidth, 0, blockDim.y - 1, BLRB);
                }
                else if (levBLRB > levRB)
                {
                    w1 = 1.0 / 4.0;
                    w2 = 1.0 / 2.0;
                    w3 = 1.0 / 4.0;
                    it = memloc(halowidth, blkmemwidth, 0, blockDim.y - 1, BLRB);
                }
            }
        }
        else
        {
            if (iy == (blockDim.y - 1))
            {
                if (TLRT == RT)
                {
                    w3 = 0.5 * (1.0 - w1);
                    w2 = w3;
                    ir = it;
                }
                else if (levTLRT < levRT)
                {
                    w1 = 4.0 / 10.0;
                    w2 = 1.0 / 10.0;
                    w3 = 5.0 / 10.0;
                    ir = memloc(halowidth, blkmemwidth, 0, 0, TLRT);
                }
                else if (levTLRT == levRT)
                {
                    ir = memloc(halowidth, blkmemwidth, 0, 0, TLRT);
                }
                else if (levTLRT > levRT)
                {
                    w1 = 1.0 / 4.0;
                    w2 = 1.0 / 2.0;
                    w3 = 1.0 / 4.0;
                    ir = memloc(halowidth, blkmemwidth, 0, 0, TLRT);
                }
            }
        }

        a_read= w1 * a[ii] + w2 * a[ir] + w3 * a[it];
    }
    a[write] = a_read;
}

template __global__ void fillRight<float>(int halowidth, int* active, int* level, int* rightbot, int* righttop, int* leftbot, int* botleft, int* topleft, float* a);
template __global__ void fillRight<double>(int halowidth, int* active, int* level, int* rightbot, int* righttop, int* leftbot, int* botleft, int* topleft, double* a);

template <class T> __global__ void fillRightnew(int halowidth,int nblk, int* active, int* level, int* rightbot, int* righttop, int* leftbot, int* botleft, int* topleft, T* a)
{
    int blkmemwidth = blockDim.y + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    //unsigned int ix = blockDim.y - 1;
    int iy = threadIdx.y;
    int ibl = threadIdx.x + blockIdx.x * blockDim.x;
    if (ibl < nblk)
    {
        int ib = active[ibl];

        int RB = rightbot[ib];
        int RT = righttop[ib];
        //int LB = leftbot[ib];
        //int BL = botleft[ib];
        int LBRB = leftbot[RB];
        int TLRT = topleft[RT];
        int BLRB = botleft[RB];


        int lev = level[ib];
        int levRB = level[RB];
        int levRT = level[RT];
        int levBLRB = level[BLRB];
        int levTLRT = level[TLRT];

        int write = memloc(halowidth, blkmemwidth, blockDim.y, iy, ib);
        int read;
        int jj, ii, ir, it, itr;
        T a_read;
        T w1, w2, w3;


        if (RB == ib)
        {
            if (iy < (blockDim.y / 2))
            {
                read = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
                a_read = a[read];
            }
            else
            {
                if (RT == ib)
                {
                    read = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
                    a_read = a[read];
                }
                else
                {
                    jj = (iy - (blockDim.y / 2)) * 2;
                    ii = memloc(halowidth, blkmemwidth, 0, jj, RT);
                    ir = memloc(halowidth, blkmemwidth, 1, jj, RT);
                    it = memloc(halowidth, blkmemwidth, 0, jj + 1, RT);
                    itr = memloc(halowidth, blkmemwidth, 1, jj + 1, RT);

                    a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
                }
            }
        }
        else if (levRB == lev)
        {
            read = memloc(halowidth, blkmemwidth, 0, iy, RB);
            a_read = a[read];
        }
        else if (levRB > lev)
        {
            if (iy < (blockDim.y / 2))
            {
                jj = iy * 2;


                ii = memloc(halowidth, blkmemwidth, 0, jj, RB);
                ir = memloc(halowidth, blkmemwidth, 1, jj, RB);
                it = memloc(halowidth, blkmemwidth, 0, jj + 1, RB);
                itr = memloc(halowidth, blkmemwidth, 1, jj + 1, RB);

                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
            else
            {
                if (RT == ib)
                {
                    read = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
                    a_read = a[read];
                }
                else
                {
                    jj = (iy - (blockDim.y / 2)) * 2;

                    ii = memloc(halowidth, blkmemwidth, 0, jj, RT);
                    ir = memloc(halowidth, blkmemwidth, 1, jj, RT);
                    it = memloc(halowidth, blkmemwidth, 0, jj + 1, RT);
                    itr = memloc(halowidth, blkmemwidth, 1, jj + 1, RT);

                    a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
                }
            }
        }
        else if (levRB < lev)
        {
            //
            jj = LBRB == ib ? ceil(iy * (T)0.5) : ceil(iy * (T)0.5) + blockDim.y / 2;
            w1 = 1.0 / 3.0;
            w2 = ceil(iy * (T)0.5) * 2 > iy ? T(1.0 / 6.0) : T(0.5);
            w3 = ceil(iy * (T)0.5) * 2 > iy ? T(0.5) : T(1.0 / 6.0);
            ii = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
            ir = memloc(halowidth, blkmemwidth, 0, jj, RB);
            it = memloc(halowidth, blkmemwidth, 0, jj - 1, RB);
            if (LBRB == ib)
            {
                if (iy == 0)
                {
                    if (BLRB == RB)
                    {
                        w3 = 0.5 * (1.0 - w1);
                        w2 = w3;
                        it = ir;
                    }
                    else if (levBLRB < levRB)
                    {
                        w1 = 4.0 / 10.0;
                        w2 = 5.0 / 10.0;
                        w3 = 1.0 / 10.0;
                        it = memloc(halowidth, blkmemwidth, 0, blockDim.y - 1, BLRB);
                    }
                    else if (levBLRB == levRB)
                    {
                        it = memloc(halowidth, blkmemwidth, 0, blockDim.y - 1, BLRB);
                    }
                    else if (levBLRB > levRB)
                    {
                        w1 = 1.0 / 4.0;
                        w2 = 1.0 / 2.0;
                        w3 = 1.0 / 4.0;
                        it = memloc(halowidth, blkmemwidth, 0, blockDim.y - 1, BLRB);
                    }
                }
            }
            else
            {
                if (iy == (blockDim.y - 1))
                {
                    if (TLRT == RT)
                    {
                        w3 = 0.5 * (1.0 - w1);
                        w2 = w3;
                        ir = it;
                    }
                    else if (levTLRT < levRT)
                    {
                        w1 = 4.0 / 10.0;
                        w2 = 1.0 / 10.0;
                        w3 = 5.0 / 10.0;
                        ir = memloc(halowidth, blkmemwidth, 0, 0, TLRT);
                    }
                    else if (levTLRT == levRT)
                    {
                        ir = memloc(halowidth, blkmemwidth, 0, 0, TLRT);
                    }
                    else if (levTLRT > levRT)
                    {
                        w1 = 1.0 / 4.0;
                        w2 = 1.0 / 2.0;
                        w3 = 1.0 / 4.0;
                        ir = memloc(halowidth, blkmemwidth, 0, 0, TLRT);
                    }
                }
            }

            a_read = w1 * a[ii] + w2 * a[ir] + w3 * a[it];
        }
        a[write] = a_read;
    }
}

template __global__ void fillRightnew<float>(int halowidth, int nblk, int* active, int* level, int* rightbot, int* righttop, int* leftbot, int* botleft, int* topleft, float* a);
template __global__ void fillRightnew<double>(int halowidth, int nblk, int* active, int* level, int* rightbot, int* righttop, int* leftbot, int* botleft, int* topleft, double* a);


template <class T> void fillRightFlux(Param XParam, bool doProlongation, int ib, BlockP<T> XBlock, T*& z)
{
    int jj, bb;
    int read, write;
    int ii, it;


    if (XBlock.RightBot[ib] == ib)//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {

            read = memloc(XParam, XParam.blkwidth - 1, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            write = memloc(XParam, XParam.blkwidth, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            z[write] = z[read];
        }

        if (XBlock.RightTop[ib] == ib) // boundary on the top half too
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, XParam.blkwidth - 1, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, XParam.blkwidth, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
        {

            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                write = memloc(XParam, XParam.blkwidth, j, ib);
                jj = (j - 8) * 2;
                ii = memloc(XParam, 0, jj, XBlock.RightTop[ib]);
                //ir = memloc(XParam, 1, jj, XBlock.RightTop[ib]);
                it = memloc(XParam, 0, jj + 1, XBlock.RightTop[ib]);
                //itr = memloc(XParam, 1, jj + 1, XBlock.RightTop[ib]);

                z[write] = T(0.5) * (z[ii] + z[it]);

            }
        }
    }
    else if (XBlock.level[ib] == XBlock.level[XBlock.RightBot[ib]]) // LeftTop block does not exist
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            //

            write = memloc(XParam, XParam.blkwidth, j, ib);
            read = memloc(XParam, 0, j, XBlock.RightBot[ib]);
            z[write] = z[read];
        }
    }
    else if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])
    {

        for (int j = 0; j < XParam.blkwidth / 2; j++)
        {

            write = memloc(XParam, XParam.blkwidth, j, ib);

            jj = j * 2;
            bb = XBlock.RightBot[ib];

            ii = memloc(XParam, 0, jj, bb);
            //ir = memloc(XParam, 1, jj, bb);
            it = memloc(XParam, 0, jj + 1, bb);
            //itr = memloc(XParam, 1, jj + 1, bb);

            //z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
            z[write] = T(0.5) * (z[ii] + z[it]);
        }
        //now find out aboy lefttop block
        if (XBlock.RightTop[ib] == ib)
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, XParam.blkwidth - 1, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, XParam.blkwidth, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //
                jj = (j - 8) * 2;
                bb = XBlock.RightTop[ib];

                //read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, XParam.blkwidth, j, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                //z[write] = z[read];
                ii = memloc(XParam, 0, jj, bb);
                //ir = memloc(XParam, 1, jj, bb);
                it = memloc(XParam, 0, jj + 1, bb);
                //itr = memloc(XParam, 1, jj + 1, bb);
                z[write] = T(0.5) * (z[ii] + z[it]);
                //z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
            }
        }

    }
    else if (XBlock.level[XBlock.RightBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo 
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            write = memloc(XParam, XParam.blkwidth, j, ib);


            int jj = XBlock.LeftBot[XBlock.RightBot[ib]] == ib ? ftoi(floor(j * (T)0.5)) : ftoi(floor(j * (T)0.5) + XParam.blkwidth / 2);

            ii = memloc(XParam, 0, jj, XBlock.RightBot[ib]);
            if (doProlongation)
                z[write] = z[ii];
        }
    }



}

template void fillRightFlux<float>(Param XParam, bool doProlongation, int ib, BlockP<float> XBlock, float*& z);
template void fillRightFlux<double>(Param XParam, bool doProlongation, int ib, BlockP<double> XBlock, double*& z);


template <class T> __global__ void fillRightFlux(int halowidth, bool doProlongation, int* active, int* level, int* rightbot, int* righttop, int* leftbot, int* botleft, int* topleft, T* a)
{
    unsigned int blkmemwidth = blockDim.y + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    //unsigned int ix = blockDim.y - 1;
    unsigned int iy = threadIdx.y;
    unsigned int ibl = blockIdx.x;
    unsigned int ib = active[ibl];

    int RB = rightbot[ib];
    int RT = righttop[ib];
    //int LB = leftbot[ib];
    //int BL = botleft[ib];
    int LBRB = leftbot[RB];
    //int TLRT = topleft[RT];
    //int BLRB = botleft[RB];


    int lev = level[ib];
    int levRB = level[RB];
    //int levRT = level[RT];
    //int levBLRB = level[BLRB];
    //int levTLRT = level[TLRT];

    int write = memloc(halowidth, blkmemwidth, blockDim.y, iy, ib);
    int read;
    int jj, ii, ir, it;
    T a_read;
    //T w1, w2;


    if (RB == ib)
    {
        if (iy < (blockDim.y / 2))
        {
            read = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
            a_read = a[read];
        }
        else
        {
            if (RT == ib)
            {
                read = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
                a_read = a[read];
            }
            else
            {
                jj = (iy - (blockDim.y / 2)) * 2;
                ii = memloc(halowidth, blkmemwidth, 0, jj, RT);
                //ir = memloc(halowidth, blkmemwidth, 1, jj, RT);
                it = memloc(halowidth, blkmemwidth, 0, jj + 1, RT);
                //itr = memloc(halowidth, blkmemwidth, 1, jj + 1, RT);

                a_read = T(0.5) * (a[ii] + a[it]);
            }
        }
    }
    else if (levRB == lev)
    {
        read = memloc(halowidth, blkmemwidth, 0, iy, RB);
        a_read = a[read];
    }
    else if (levRB > lev)
    {
        if (iy < (blockDim.y / 2))
        {
            jj = iy * 2;


            ii = memloc(halowidth, blkmemwidth, 0, jj, RB);
            //ir = memloc(halowidth, blkmemwidth, 1, jj, RB);
            it = memloc(halowidth, blkmemwidth, 0, jj + 1, RB);
            //itr = memloc(halowidth, blkmemwidth, 1, jj + 1, RB);

            a_read = T(0.5) * (a[ii] + a[it]);
        }
        else
        {
            if (RT == ib)
            {
                read = memloc(halowidth, blkmemwidth, blockDim.y - 1, iy, ib);
                a_read = a[read];
            }
            else
            {
                jj = (iy - (blockDim.y / 2)) * 2;

                ii = memloc(halowidth, blkmemwidth, 0, jj, RT);
                //ir = memloc(halowidth, blkmemwidth, 1, jj, RT);
                it = memloc(halowidth, blkmemwidth, 0, jj + 1, RT);
                //itr = memloc(halowidth, blkmemwidth, 1, jj + 1, RT);

                a_read = T(0.5) * (a[ii] + a[it] );
            }
        }
    }
    else if (levRB < lev)
    {
        //
        jj = LBRB == ib ? floor(iy * (T)0.5) : floor(iy * (T)0.5) + blockDim.y / 2;


        ir = memloc(halowidth, blkmemwidth, 0, jj, RB);

        if (doProlongation)
            a_read = a[ir];

        else
            a_read = a[write];
    }
    a[write] = a_read;
}
template __global__ void fillRightFlux<float>(int halowidth, bool doProlongation, int* active, int* level, int* rightbot, int* righttop, int* leftbot, int* botleft, int* topleft, float* a);
template __global__ void fillRightFlux<double>(int halowidth, bool doProlongation, int* active, int* level, int* rightbot, int* righttop, int* leftbot, int* botleft, int* topleft, double* a);


template <class T> void fillBot(Param XParam, int ib, BlockP<T> XBlock, T*& z)
{
    int jj, bb;
    int read, write;
    int ii, ir, it, itr;


    if (XBlock.BotLeft[ib] == ib)//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {

            read = memloc(XParam, j, 0, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            write = memloc(XParam, j, -1, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            z[write] = z[read];
        }

        if (XBlock.BotRight[ib] == ib) // boundary on the top half too
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, j, 0, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, -1, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
        {

            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                write = memloc(XParam,j, -1, ib);
                jj = (j - 8) * 2;
                ii = memloc(XParam,jj, (XParam.blkwidth - 1), XBlock.BotRight[ib]);
                ir = memloc(XParam,jj, (XParam.blkwidth - 2), XBlock.BotRight[ib]);
                it = memloc(XParam,jj+1, (XParam.blkwidth - 1), XBlock.BotRight[ib]);
                itr = memloc(XParam,jj+1, (XParam.blkwidth - 2), XBlock.BotRight[ib]);

                z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);

            }
        }
    }
    else if (XBlock.level[ib] == XBlock.level[XBlock.BotLeft[ib]]) // LeftTop block does not exist
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            //

            write = memloc(XParam,j, -1, ib);
            read = memloc(XParam, j, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
            z[write] = z[read];
        }
    }
    else if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])
    {

        for (int j = 0; j < XParam.blkwidth / 2; j++)
        {

            write = memloc(XParam, j, -1, ib);

            jj = j * 2;
            bb = XBlock.BotLeft[ib];

            ii = memloc(XParam, jj, (XParam.blkwidth - 1), bb);
            ir = memloc(XParam, jj, (XParam.blkwidth - 2), bb);
            it = memloc(XParam, jj + 1, (XParam.blkwidth - 1), bb);
            itr = memloc(XParam, jj + 1, (XParam.blkwidth - 2), bb);

            z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
        }
        //now find out aboy botright block
        if (XBlock.BotRight[ib] == ib)
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, j, 0, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, -1, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //
                jj = (j - 8) * 2;
                bb = XBlock.BotRight[ib];

                //read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, -1, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                //z[write] = z[read];
                ii = memloc(XParam, jj, (XParam.blkwidth - 1), bb);
                ir = memloc(XParam, jj, (XParam.blkwidth - 2), bb);
                it = memloc(XParam, jj + 1, (XParam.blkwidth - 1), bb);
                itr = memloc(XParam, jj + 1, (XParam.blkwidth - 2), bb);

                z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
            }
        }

    }
    else if (XBlock.level[XBlock.BotLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo 
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            write = memloc(XParam, j, -1, ib);

            T w1, w2, w3;


            int jj = XBlock.TopLeft[XBlock.BotLeft[ib]] == ib ? ftoi(ceil(j * (T)0.5)) : ftoi(ceil(j * (T)0.5) + XParam.blkwidth / 2);
            w1 = T(1.0 / 3.0);
            w2 = ceil(j * (T)0.5) * 2 > j ? T(1.0 / 6.0) : T(0.5);
            w3 = ceil(j * (T)0.5) * 2 > j ? T(0.5) : T(1.0 / 6.0);

            ii = memloc(XParam, j, 0, ib);
            ir = memloc(XParam, jj, XParam.blkwidth - 1, XBlock.BotLeft[ib]);
            it = memloc(XParam, jj -1, XParam.blkwidth - 1, XBlock.BotLeft[ib]);
            //2 scenarios here ib is the rightbot neighbour of the leftbot block or ib is the righttop neighbour
            if (XBlock.TopLeft[XBlock.BotLeft[ib]] == ib)
            {
                if (j == 0)
                {
                    if (XBlock.LeftTop[XBlock.BotLeft[ib]] == XBlock.BotLeft[ib]) // no botom of leftbot block
                    {
                        w3 = T(0.5 * (1.0 - w1));
                        w2 = w3;
                        it = ir;

                    }
                    else if (XBlock.level[XBlock.LeftTop[XBlock.BotLeft[ib]]] < XBlock.level[XBlock.BotLeft[ib]]) // exists but is coarser
                    {
                        w1 = T(4.0 / 10.0);
                        w2 = T(5.0 / 10.0);
                        w3 = T(1.0 / 10.0);
                        it = memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth - 1, XBlock.LeftTop[XBlock.BotLeft[ib]]);
                    }
                    else if (XBlock.level[XBlock.LeftTop[XBlock.BotLeft[ib]]] == XBlock.level[XBlock.BotLeft[ib]]) // exists with same level
                    {
                        it = memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth - 1, XBlock.LeftTop[XBlock.BotLeft[ib]]);
                    }
                    else if (XBlock.level[XBlock.LeftTop[XBlock.BotLeft[ib]]] > XBlock.level[XBlock.BotLeft[ib]]) // exists with higher level
                    {
                        w1 = T(1.0 / 4.0);
                        w2 = T(1.0 / 2.0);
                        w3 = T(1.0 / 4.0);
                        it = memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth - 1, XBlock.LeftTop[XBlock.BotLeft[ib]]);
                    }


                }


            }
            else//righttopleftif == ib
            {
                if (j == (XParam.blkwidth - 1))
                {
                    if (XBlock.RightTop[XBlock.BotRight[ib]] == XBlock.BotRight[ib]) // no botom of leftbot block
                    {
                        w3 = T(0.5 * (1.0 - w1));
                        w2 = w3;
                        ir = it;

                    }
                    else if (XBlock.level[XBlock.RightTop[XBlock.BotRight[ib]]] < XBlock.level[XBlock.BotRight[ib]]) // exists but is coarser
                    {
                        w1 = T(4.0 / 10.0);
                        w2 = T(1.0 / 10.0);
                        w3 = T(5.0 / 10.0);
                        ir = memloc(XParam, 0,XParam.blkwidth - 1, XBlock.RightTop[XBlock.BotRight[ib]]);
                    }
                    else if (XBlock.level[XBlock.RightTop[XBlock.BotRight[ib]]] == XBlock.level[XBlock.BotRight[ib]]) // exists with same level
                    {
                        ir = memloc(XParam,0, XParam.blkwidth - 1, XBlock.RightTop[XBlock.BotRight[ib]]);
                    }
                    else if (XBlock.level[XBlock.RightTop[XBlock.BotRight[ib]]] > XBlock.level[XBlock.BotRight[ib]]) // exists with higher level
                    {
                        w1 = T(1.0 / 4.0);
                        w2 = T(1.0 / 2.0);
                        w3 = T(1.0 / 4.0);
                        ir = memloc(XParam,0,XParam.blkwidth - 1, XBlock.RightTop[XBlock.BotRight[ib]]);
                    }
                }
                //
            }


            z[write] = w1 * z[ii] + w2 * z[ir] + w3 * z[it];
        }
    }



}

template <class T> __global__ void fillBot(int halowidth, int* active, int* level, int* botleft, int* botright, int* topleft, int* lefttop, int* righttop, T* a)
{
    int blkmemwidth = blockDim.x + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    int ix = threadIdx.x;
    //unsigned int iy = 0;
    int ibl = blockIdx.x;

    int ib = active[ibl];

    int BL = botleft[ib];
    int BR = botright[ib];
    int TLBL = topleft[BL];
    int LTBL = lefttop[BL];
    int RTBR = righttop[BR];

    int lev = level[ib];
    int levBL = level[BL];
    int levBR = level[BR];
    int levLTBL = level[LTBL];
    int levRTBR = level[RTBR];

    int write = memloc(halowidth, blkmemwidth, ix, -1, ib);
    int read;
    int jj, ii, ir, it, itr;
    T a_read;
    T w1, w2, w3;
    if (BL == ib)
    {
        if (ix < (blockDim.x / 2))
        {
            read = memloc(halowidth, blkmemwidth, ix, 0, ib);
            a_read = a[read];
        }
        else
        {
            if (BR == ib)
            {
                read = memloc(halowidth, blkmemwidth, ix, 0, ib);
                a_read = a[read];
            }
            else
            {
                jj = (ix - (blockDim.x/2)) * 2;
                ii = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 1), BR);
                ir = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 2), BR);
                it = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 1), BR);
                itr = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 2), BR);
                a_read = T(0.25)* (a[ii] + a[ir] + a[it] + a[itr]);
            }
        }
    }
    else if (levBL == lev)
    {
        read = memloc(halowidth, blkmemwidth, ix, (blockDim.x - 1), BL);
        a_read = a[read];
    }
    else if (levBL > lev)
    {
        if (ix < (blockDim.x / 2))
        {
            jj = ix * 2;
            ii = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 1), BL);
            ir = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 2), BL);
            it = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 1), BL);
            itr = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 2), BL);
            a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
        }
        else
        {
            if (BR == ib)
            {
                read = memloc(halowidth, blkmemwidth, ix, 0, ib);
                a_read = a[read];
            }
            else
            {
                jj = (ix - (blockDim.x/2)) * 2;
                ii = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 1), BR);
                ir = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 2), BR);
                it = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 1), BR);
                itr = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 2), BR);
                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
        }
    }
    else if (levBL < lev)
    {
        jj = TLBL == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + blockDim.x / 2;
        w1 = 1.0 / 3.0;
        w2 = ceil(ix * (T)0.5) * 2 > ix ? T(1.0 / 6.0) : T(0.5);
        w3 = ceil(ix * (T)0.5) * 2 > ix ? T(0.5) : T(1.0 / 6.0);

        ii = memloc(halowidth, blkmemwidth, ix, 0, ib);
        ir = memloc(halowidth, blkmemwidth, jj, blockDim.x - 1, BL);
        it = memloc(halowidth, blkmemwidth, jj - 1, blockDim.x - 1, BL);

        if (TLBL == ib)
        {
            if (ix == 0)
            {
                if (LTBL == BL)
                {
                    w3 = 0.5 * (1.0 - w1);
                    w2 = w3;
                    it = ir;
                }
                else if (levLTBL < levBL)
                {
                    w1 = 4.0 / 10.0;
                    w2 = 5.0 / 10.0;
                    w3 = 1.0 / 10.0;
                    it = memloc(halowidth, blkmemwidth, blockDim.x - 1, blockDim.x - 1, LTBL);
                }
                else if (levLTBL == levBL)
                {
                    it = memloc(halowidth, blkmemwidth, blockDim.x - 1, blockDim.x - 1, LTBL);
                }
                else if (levLTBL > levBL)
                {
                    w1 = 1.0 / 4.0;
                    w2 = 1.0 / 2.0;
                    w3 = 1.0 / 4.0;
                    it = memloc(halowidth, blkmemwidth, blockDim.x - 1, blockDim.x - 1, LTBL);
                }
            }
        }
        else
        {
            if (ix == (blockDim.x - 1))
            {
                if (RTBR == BR)
                {
                    w3 = 0.5 * (1.0 - w1);
                    w2 = w3;
                    ir = it;
                }
                else if (levRTBR < levBR)
                {
                    w1 = 4.0 / 10.0;
                    w2 = 1.0 / 10.0;
                    w3 = 5.0 / 10.0;
                    ir = memloc(halowidth, blkmemwidth,0, blockDim.x - 1, RTBR);
                }
                else if (levRTBR == levBR)
                {
                    ir = memloc(halowidth, blkmemwidth,0, blockDim.x - 1, RTBR);
                }
                else if (levRTBR > levBR)
                {
                    w1 = 1.0 / 4.0;
                    w2 = 1.0 / 2.0;
                    w3 = 1.0 / 4.0;
                    ir = memloc(halowidth, blkmemwidth, 0, blockDim.x - 1, RTBR);
                }
            }
        }
        a_read = w1 * a[ii] + w2 * a[ir] + w3 * a[it];
    }
    a[write] = a_read;

}
template __global__ void fillBot<float>(int halowidth, int* active, int* level, int* botleft, int* botright, int* topleft, int* lefttop, int* righttop, float* a);
template __global__ void fillBot<double>(int halowidth, int* active, int* level, int* botleft, int* botright, int* topleft, int* lefttop, int* righttop, double* a);

template <class T> __global__ void fillBotnew(int halowidth, int nblk, int* active, int* level, int* botleft, int* botright, int* topleft, int* lefttop, int* righttop, T* a)
{
    int blkmemwidth = blockDim.x + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    int ix = threadIdx.x;
    //unsigned int iy = 0;
    int ibl = threadIdx.y + blockIdx.x * blockDim.y;
    if (ibl < nblk)
    {
        int ib = active[ibl];

        int BL = botleft[ib];
        int BR = botright[ib];
        int TLBL = topleft[BL];
        int LTBL = lefttop[BL];
        int RTBR = righttop[BR];

        int lev = level[ib];
        int levBL = level[BL];
        int levBR = level[BR];
        int levLTBL = level[LTBL];
        int levRTBR = level[RTBR];

        int write = memloc(halowidth, blkmemwidth, ix, -1, ib);
        int read;
        int jj, ii, ir, it, itr;
        T a_read;
        T w1, w2, w3;
        if (BL == ib)
        {
            if (ix < (blockDim.x / 2))
            {
                read = memloc(halowidth, blkmemwidth, ix, 0, ib);
                a_read = a[read];
            }
            else
            {
                if (BR == ib)
                {
                    read = memloc(halowidth, blkmemwidth, ix, 0, ib);
                    a_read = a[read];
                }
                else
                {
                    jj = (ix - (blockDim.x / 2)) * 2;
                    ii = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 1), BR);
                    ir = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 2), BR);
                    it = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 1), BR);
                    itr = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 2), BR);
                    a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
                }
            }
        }
        else if (levBL == lev)
        {
            read = memloc(halowidth, blkmemwidth, ix, (blockDim.x - 1), BL);
            a_read = a[read];
        }
        else if (levBL > lev)
        {
            if (ix < (blockDim.x / 2))
            {
                jj = ix * 2;
                ii = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 1), BL);
                ir = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 2), BL);
                it = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 1), BL);
                itr = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 2), BL);
                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
            else
            {
                if (BR == ib)
                {
                    read = memloc(halowidth, blkmemwidth, ix, 0, ib);
                    a_read = a[read];
                }
                else
                {
                    jj = (ix - (blockDim.x / 2)) * 2;
                    ii = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 1), BR);
                    ir = memloc(halowidth, blkmemwidth, jj, (blockDim.x - 2), BR);
                    it = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 1), BR);
                    itr = memloc(halowidth, blkmemwidth, jj + 1, (blockDim.x - 2), BR);
                    a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
                }
            }
        }
        else if (levBL < lev)
        {
            jj = TLBL == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + blockDim.x / 2;
            w1 = 1.0 / 3.0;
            w2 = ceil(ix * (T)0.5) * 2 > ix ? T(1.0 / 6.0) : T(0.5);
            w3 = ceil(ix * (T)0.5) * 2 > ix ? T(0.5) : T(1.0 / 6.0);

            ii = memloc(halowidth, blkmemwidth, ix, 0, ib);
            ir = memloc(halowidth, blkmemwidth, jj, blockDim.x - 1, BL);
            it = memloc(halowidth, blkmemwidth, jj - 1, blockDim.x - 1, BL);

            if (TLBL == ib)
            {
                if (ix == 0)
                {
                    if (LTBL == BL)
                    {
                        w3 = 0.5 * (1.0 - w1);
                        w2 = w3;
                        it = ir;
                    }
                    else if (levLTBL < levBL)
                    {
                        w1 = 4.0 / 10.0;
                        w2 = 5.0 / 10.0;
                        w3 = 1.0 / 10.0;
                        it = memloc(halowidth, blkmemwidth, blockDim.x - 1, blockDim.x - 1, LTBL);
                    }
                    else if (levLTBL == levBL)
                    {
                        it = memloc(halowidth, blkmemwidth, blockDim.x - 1, blockDim.x - 1, LTBL);
                    }
                    else if (levLTBL > levBL)
                    {
                        w1 = 1.0 / 4.0;
                        w2 = 1.0 / 2.0;
                        w3 = 1.0 / 4.0;
                        it = memloc(halowidth, blkmemwidth, blockDim.x - 1, blockDim.x - 1, LTBL);
                    }
                }
            }
            else
            {
                if (ix == (blockDim.x - 1))
                {
                    if (RTBR == BR)
                    {
                        w3 = 0.5 * (1.0 - w1);
                        w2 = w3;
                        ir = it;
                    }
                    else if (levRTBR < levBR)
                    {
                        w1 = 4.0 / 10.0;
                        w2 = 1.0 / 10.0;
                        w3 = 5.0 / 10.0;
                        ir = memloc(halowidth, blkmemwidth, 0, blockDim.x - 1, RTBR);
                    }
                    else if (levRTBR == levBR)
                    {
                        ir = memloc(halowidth, blkmemwidth, 0, blockDim.x - 1, RTBR);
                    }
                    else if (levRTBR > levBR)
                    {
                        w1 = 1.0 / 4.0;
                        w2 = 1.0 / 2.0;
                        w3 = 1.0 / 4.0;
                        ir = memloc(halowidth, blkmemwidth, 0, blockDim.x - 1, RTBR);
                    }
                }
            }
            a_read = w1 * a[ii] + w2 * a[ir] + w3 * a[it];
        }
        a[write] = a_read;
    }

}
template __global__ void fillBotnew<float>(int halowidth, int nblk, int* active, int* level, int* botleft, int* botright, int* topleft, int* lefttop, int* righttop, float* a);
template __global__ void fillBotnew<double>(int halowidth, int nblk, int* active, int* level, int* botleft, int* botright, int* topleft, int* lefttop, int* righttop, double* a);

template <class T> void fillBotFlux(Param XParam, bool doProlongation, int ib, BlockP<T> XBlock, T*& z)
{
    int jj, bb;
    int read, write;
    int ii, ir, it;


    if (XBlock.BotLeft[ib] == ib)//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {

            read = memloc(XParam, j, 0, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            write = memloc(XParam, j, -1, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            z[write] = z[read];
        }

        if (XBlock.BotRight[ib] == ib) // boundary on the top half too
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, j, 0, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, -1, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
        {

            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                write = memloc(XParam, j, -1, ib);
                jj = (j - 8) * 2;
                ii = memloc(XParam, jj, (XParam.blkwidth - 1), XBlock.BotRight[ib]);
                //ir = memloc(XParam, jj, (XParam.blkwidth - 2), XBlock.BotRight[ib]);
                it = memloc(XParam, jj + 1, (XParam.blkwidth - 1), XBlock.BotRight[ib]);
                //itr = memloc(XParam, jj + 1, (XParam.blkwidth - 2), XBlock.BotRight[ib]);

                z[write] = T(0.5) * (z[ii] + z[it] );

            }
        }
    }
    else if (XBlock.level[ib] == XBlock.level[XBlock.BotLeft[ib]]) // LeftTop block does not exist
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            //

            write = memloc(XParam, j, -1, ib);
            read = memloc(XParam, j, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
            z[write] = z[read];
        }
    }
    else if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])
    {

        for (int j = 0; j < XParam.blkwidth / 2; j++)
        {

            write = memloc(XParam, j, -1, ib);

            jj = j * 2;
            bb = XBlock.BotLeft[ib];

            ii = memloc(XParam, jj, (XParam.blkwidth - 1), bb);
            //ir = memloc(XParam, jj, (XParam.blkwidth - 2), bb);
            it = memloc(XParam, jj + 1, (XParam.blkwidth - 1), bb);
            //itr = memloc(XParam, jj + 1, (XParam.blkwidth - 2), bb);

            z[write] = T(0.5) * (z[ii] + z[it]);
        }
        //now find out aboy botright block
        if (XBlock.BotRight[ib] == ib)
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, j, 0, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, -1, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //
                jj = (j - 8) * 2;
                bb = XBlock.BotRight[ib];

                //read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, -1, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                //z[write] = z[read];
                ii = memloc(XParam, jj, (XParam.blkwidth - 1), bb);
                //ir = memloc(XParam, jj, (XParam.blkwidth - 2), bb);
                it = memloc(XParam, jj + 1, (XParam.blkwidth - 1), bb);
                //itr = memloc(XParam, jj + 1, (XParam.blkwidth - 2), bb);

                z[write] = T(0.5) * (z[ii]  + z[it] );
            }
        }

    }
    else if (XBlock.level[XBlock.BotLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo 
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            write = memloc(XParam, j, -1, ib);

            //T w1, w2, w3;


            int jj = XBlock.TopLeft[XBlock.BotLeft[ib]] == ib ? ftoi(ceil(j * (T)0.5)) : ftoi(ceil(j * (T)0.5) + XParam.blkwidth / 2);


            //ii = memloc(XParam, j, 0, ib);
            ir = memloc(XParam, jj, XParam.blkwidth - 1, XBlock.BotLeft[ib]);
            if(doProlongation)
                z[write] = z[ir];
        }
    }



}

template <class T> void fillTop(Param XParam, int ib, BlockP<T> XBlock, T*& z)
{
    int jj, bb;
    int read, write;
    int ii, ir, it, itr;


    if (XBlock.TopLeft[ib] == ib)//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {

            read = memloc(XParam,j, XParam.blkwidth - 1, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            write = memloc(XParam,j, XParam.blkwidth, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            z[write] = z[read];
        }

        if (XBlock.TopRight[ib] == ib) // boundary on the top half too
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, j, XParam.blkwidth - 1, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, XParam.blkwidth, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
        {

            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                write = memloc(XParam, j, XParam.blkwidth, ib);
                jj = (j - (XParam.blkwidth / 2)) * 2;
                ii = memloc(XParam, jj, 0, XBlock.TopRight[ib]);
                ir = memloc(XParam, jj, 1, XBlock.TopRight[ib]);
                it = memloc(XParam, jj + 1, 0, XBlock.TopRight[ib]);
                itr = memloc(XParam, jj + 1, 1, XBlock.TopRight[ib]);

                z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);

            }
        }
    }
    else if (XBlock.level[ib] == XBlock.level[XBlock.TopLeft[ib]]) // LeftTop block does not exist
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            //

            write = memloc(XParam, j, XParam.blkwidth, ib);
            read = memloc(XParam, j, 0, XBlock.TopLeft[ib]);
            z[write] = z[read];
        }
    }
    else if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])
    {

        for (int j = 0; j < XParam.blkwidth / 2; j++)
        {

            write = memloc(XParam, j, XParam.blkwidth, ib);

            jj = j * 2;
            bb = XBlock.TopLeft[ib];

            ii = memloc(XParam,jj, 0, bb);
            ir = memloc(XParam,jj, 1, bb);
            it = memloc(XParam,jj + 1, 0, bb);
            itr = memloc(XParam,jj + 1, 1, bb);

            z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
        }
        //now find out aboy lefttop block
        if (XBlock.TopRight[ib] == ib)
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam,j, XParam.blkwidth - 1, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, XParam.blkwidth, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //
                jj = (j - (XParam.blkwidth / 2)) * 2;
                bb = XBlock.TopRight[ib];

                //read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j , XParam.blkwidth, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                //z[write] = z[read];
                ii = memloc(XParam,jj, 0, bb);
                ir = memloc(XParam,jj, 1, bb);
                it = memloc(XParam,jj + 1, 0, bb);
                itr = memloc(XParam,jj + 1, 1, bb);

                z[write] = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
            }
        }

    }
    else if (XBlock.level[XBlock.TopLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo 
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            write = memloc(XParam,j, XParam.blkwidth, ib);

            T w1, w2, w3;


            int jj = XBlock.BotLeft[XBlock.TopLeft[ib]] == ib ? ftoi(ceil(j * (T)0.5)) : ftoi(ceil(j * (T)0.5) + XParam.blkwidth / 2);
            w1 = T(1.0 / 3.0);
            w2 = ceil(j * (T)0.5) * 2 > j ? T(1.0 / 6.0) : T(0.5);
            w3 = ceil(j * (T)0.5) * 2 > j ? T(0.5) : T(1.0 / 6.0);

            ii = memloc(XParam,j, XParam.blkwidth - 1, ib);
            ir = memloc(XParam,jj, 0,  XBlock.TopLeft[ib]);
            it = memloc(XParam,jj-1, 0, XBlock.TopLeft[ib]);
            //2 scenarios here ib is the leftbot neighbour of the rightbot block or ib is the lefttop neighbour
            if (XBlock.BotLeft[XBlock.TopLeft[ib]] == ib)
            {
                if (j == 0)
                {
                    if (XBlock.LeftBot[XBlock.TopLeft[ib]] == XBlock.TopLeft[ib]) // no botom of leftbot block
                    {
                        w3 = T(0.5 * (1.0 - w1));
                        w2 = w3;
                        it = ir;

                    }
                    else if (XBlock.level[XBlock.LeftBot[XBlock.TopLeft[ib]]] < XBlock.level[XBlock.TopLeft[ib]]) // exists but is coarser
                    {
                        w1 = T(4.0 / 10.0);
                        w2 = T(5.0 / 10.0);
                        w3 = T(1.0 / 10.0);
                        it = memloc(XParam, XParam.blkwidth - 1,0, XBlock.LeftBot[XBlock.TopLeft[ib]]);
                    }
                    else if (XBlock.level[XBlock.LeftBot[XBlock.TopLeft[ib]]] == XBlock.level[XBlock.TopLeft[ib]]) // exists with same level
                    {
                        it = memloc(XParam,  XParam.blkwidth - 1,0, XBlock.LeftBot[XBlock.TopLeft[ib]]);
                    }
                    else if (XBlock.level[XBlock.LeftBot[XBlock.TopLeft[ib]]] > XBlock.level[XBlock.TopLeft[ib]]) // exists with higher level
                    {
                        w1 = T(1.0 / 4.0);
                        w2 = T(1.0 / 2.0);
                        w3 = T(1.0 / 4.0);
                        it = memloc(XParam, XParam.blkwidth - 1, 0, XBlock.LeftBot[XBlock.TopLeft[ib]]);
                    }


                }


            }
            else//
            {
                if (j == (XParam.blkwidth - 1))
                {
                    if (XBlock.RightBot[XBlock.TopRight[ib]] == XBlock.TopRight[ib]) // no botom of leftbot block
                    {
                        w3 = T(0.5 * (1.0 - w1));
                        w2 = w3;
                        ir = it;

                    }
                    else if (XBlock.level[XBlock.RightBot[XBlock.TopRight[ib]]] < XBlock.level[XBlock.TopRight[ib]]) // exists but is coarser
                    {
                        w1 = T(4.0 / 10.0);
                        w2 = T(1.0 / 10.0);
                        w3 = T(5.0 / 10.0);
                        ir = memloc(XParam, 0, 0, XBlock.RightBot[XBlock.TopRight[ib]]);
                    }
                    else if (XBlock.level[XBlock.RightBot[XBlock.TopRight[ib]]] == XBlock.level[XBlock.TopRight[ib]]) // exists with same level
                    {
                        ir = memloc(XParam, 0, 0, XBlock.RightBot[XBlock.TopRight[ib]]);
                    }
                    else if (XBlock.level[XBlock.RightBot[XBlock.TopRight[ib]]] > XBlock.level[XBlock.TopRight[ib]]) // exists with higher level
                    {
                        w1 = T(1.0 / 4.0);
                        w2 = T(1.0 / 2.0);
                        w3 = T(1.0 / 4.0);
                        ir = memloc(XParam, 0,0, XBlock.RightBot[XBlock.TopRight[ib]]);
                    }
                }
                //
            }


            z[write] = w1 * z[ii] + w2 * z[ir] + w3 * z[it];
        }
    }



}

template <class T> __global__ void fillTop(int halowidth, int* active, int* level,int * topleft, int * topright,int * botleft, int* leftbot, int* rightbot,  T* a)
{
    int blkmemwidth = blockDim.x + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    int ix = threadIdx.x;
    //unsigned int iy = blockDim.x-1;
    int ibl = blockIdx.x;
    int ib = active[ibl];

    int TL = topleft[ib];
    int TR = topright[ib];
    int LBTL = leftbot[TL];
    int BLTL = botleft[TL];
    int RBTR = rightbot[TR];


    int lev = level[ib];
    int levTL = level[TL];
    int levTR = level[TR];
    int levLBTL = level[LBTL];
    int levRBTR = level[RBTR];

    int write = memloc(halowidth, blkmemwidth, ix, blockDim.x, ib);
    int read;
    int jj, ii, ir, it, itr;
    T a_read;
    T w1, w2, w3;

    if (TL == ib)
    {
        if (ix < (blockDim.x / 2))
        {
            read = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
            a_read = a[read];
        }
        else
        {
            if (TR == ib)
            {
                read = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
                a_read = a[read];
            }
            else
            {
                jj = (ix - (blockDim.x / 2)) * 2;
                ii = memloc(halowidth, blkmemwidth, jj, 0, TR);
                ir = memloc(halowidth, blkmemwidth, jj, 1, TR);
                it = memloc(halowidth, blkmemwidth, jj + 1, 0, TR);
                itr = memloc(halowidth, blkmemwidth, jj + 1, 1, TR);

                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
        }
    }
    else if (levTL == lev)
    {
        read = memloc(halowidth, blkmemwidth, ix, 0, TL);
        a_read = a[read];
    }
    else if (levTL > lev)
    {
        if (ix < (blockDim.x / 2))
        {
            jj = ix * 2;


            ii = memloc(halowidth, blkmemwidth, jj, 0, TL);
            ir = memloc(halowidth, blkmemwidth, jj, 1, TL);
            it = memloc(halowidth, blkmemwidth, jj + 1, 0, TL);
            itr = memloc(halowidth, blkmemwidth, jj + 1, 1, TL);
            a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
        }
        else
        {
            if (TR == ib)
            {
                read = memloc(halowidth, blkmemwidth, ix, blockDim.x-1, ib);
                a_read = a[read];
            }
            else
            {
                jj = (ix - (blockDim.x / 2)) * 2;
                ii = memloc(halowidth, blkmemwidth, jj, 0, TR);
                ir = memloc(halowidth, blkmemwidth, jj, 1, TR);
                it = memloc(halowidth, blkmemwidth, jj + 1, 0, TR);
                itr = memloc(halowidth, blkmemwidth, jj + 1, 1, TR);
                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
        }
    }
    else if (levTL < lev)
    {
        jj = BLTL == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + blockDim.x / 2;
        w1 = 1.0 / 3.0;
        w2 = ceil(ix * (T)0.5) * 2 > ix ? T(1.0 / 6.0) : T(0.5);
        w3 = ceil(ix * (T)0.5) * 2 > ix ? T(0.5) : T(1.0 / 6.0);
        ii = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
        ir = memloc(halowidth, blkmemwidth, jj, 0, TL);
        it = memloc(halowidth, blkmemwidth, jj - 1, 0, TL);
        if (BLTL == ib)
        {
            if (ix == 0)
            {
                if (LBTL == TL)
                {
                    w3 = 0.5 * (1.0 - w1);
                    w2 = w3;
                    it = ir;
                }
                else if (levLBTL < levTL)
                {
                    w1 = 4.0 / 10.0;
                    w2 = 5.0 / 10.0;
                    w3 = 1.0 / 10.0;
                    it = memloc(halowidth, blkmemwidth, blockDim.x - 1, 0, LBTL);
                }
                else if (levLBTL == levTL)
                {
                    it = memloc(halowidth, blkmemwidth, blockDim.x - 1, 0, LBTL);
                }
                else if (levLBTL > levTL)
                {
                    w1 = 1.0 / 4.0;
                    w2 = 1.0 / 2.0;
                    w3 = 1.0 / 4.0;
                    it = memloc(halowidth, blkmemwidth, blockDim.x - 1, 0, LBTL);
                }
            }
        }
        else
        {
            if (ix == blockDim.x - 1)
            {
                if (RBTR == TR)
                {
                    w3 = 0.5 * (1.0 - w1);
                    w2 = w3;
                    ir = it;
                }
                else if (levRBTR < levTR)
                {
                    w1 = 4.0 / 10.0;
                    w2 = 1.0 / 10.0;
                    w3 = 5.0 / 10.0;
                    ir = memloc(halowidth, blkmemwidth, 0, 0, RBTR);
                }
                else if (levRBTR == levTR)
                {
                    ir = memloc(halowidth, blkmemwidth, 0, 0, RBTR);
                }
                else if (levRBTR > levTR)
                {
                    w1 = 1.0 / 4.0;
                    w2 = 1.0 / 2.0;
                    w3 = 1.0 / 4.0;
                    ir = memloc(halowidth, blkmemwidth,0, 0, RBTR);
                }
            }
        }
        a_read= w1 * a[ii] + w2 * a[ir] + w3 * a[it];
    }
    a[write] = a_read;
}

template __global__ void fillTop<float>(int halowidth, int* active, int* level, int* topleft, int* topright, int* botleft, int* leftbot, int* rightbot, float* a);
template __global__ void fillTop<double>(int halowidth, int* active, int* level, int* topleft, int* topright, int* botleft, int* leftbot, int* rightbot, double* a);

/***
 * @brief CUDA kernel to fill the top halo region of blocks in parallel for new refinement, handling various neighbor configurations, new version
 * @param halowidth The width of the halo region
 * @param nblk The number of active blocks
 * @param active The array of active block indices
 * @param level The array of block levels
 * @param topleft The array of top left neighbor block indices
 * @param topright The array of top right neighbor block indices
 * @param botleft The array of bottom left neighbor block indices
 * @param leftbot The array of left bottom neighbor block indices
 * @param rightbot The array of right bottom neighbor block indices
 * @param a The variable to be refined
 * 
 */
template <class T> __global__ void fillTopnew(int halowidth, int nblk, int* active, int* level, int* topleft, int* topright, int* botleft, int* leftbot, int* rightbot, T* a)
{
    int blkmemwidth = blockDim.x + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    int ix = threadIdx.x;
    //unsigned int iy = blockDim.x-1;
    int ibl = threadIdx.y + blockIdx.x * blockDim.y;
    if (ibl < nblk)
    {
        int ib = active[ibl];

        int TL = topleft[ib];
        int TR = topright[ib];
        int LBTL = leftbot[TL];
        int BLTL = botleft[TL];
        int RBTR = rightbot[TR];


        int lev = level[ib];
        int levTL = level[TL];
        int levTR = level[TR];
        int levLBTL = level[LBTL];
        int levRBTR = level[RBTR];

        int write = memloc(halowidth, blkmemwidth, ix, blockDim.x, ib);
        int read;
        int jj, ii, ir, it, itr;
        T a_read;
        T w1, w2, w3;

        if (TL == ib)
        {
            if (ix < (blockDim.x / 2))
            {
                read = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
                a_read = a[read];
            }
            else
            {
                if (TR == ib)
                {
                    read = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
                    a_read = a[read];
                }
                else
                {
                    jj = (ix - (blockDim.x / 2)) * 2;
                    ii = memloc(halowidth, blkmemwidth, jj, 0, TR);
                    ir = memloc(halowidth, blkmemwidth, jj, 1, TR);
                    it = memloc(halowidth, blkmemwidth, jj + 1, 0, TR);
                    itr = memloc(halowidth, blkmemwidth, jj + 1, 1, TR);

                    a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
                }
            }
        }
        else if (levTL == lev)
        {
            read = memloc(halowidth, blkmemwidth, ix, 0, TL);
            a_read = a[read];
        }
        else if (levTL > lev)
        {
            if (ix < (blockDim.x / 2))
            {
                jj = ix * 2;


                ii = memloc(halowidth, blkmemwidth, jj, 0, TL);
                ir = memloc(halowidth, blkmemwidth, jj, 1, TL);
                it = memloc(halowidth, blkmemwidth, jj + 1, 0, TL);
                itr = memloc(halowidth, blkmemwidth, jj + 1, 1, TL);
                a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
            }
            else
            {
                if (TR == ib)
                {
                    read = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
                    a_read = a[read];
                }
                else
                {
                    jj = (ix - (blockDim.x / 2)) * 2;
                    ii = memloc(halowidth, blkmemwidth, jj, 0, TR);
                    ir = memloc(halowidth, blkmemwidth, jj, 1, TR);
                    it = memloc(halowidth, blkmemwidth, jj + 1, 0, TR);
                    itr = memloc(halowidth, blkmemwidth, jj + 1, 1, TR);
                    a_read = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
                }
            }
        }
        else if (levTL < lev)
        {
            jj = BLTL == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + blockDim.x / 2;
            w1 = 1.0 / 3.0;
            w2 = ceil(ix * (T)0.5) * 2 > ix ? T(1.0 / 6.0) : T(0.5);
            w3 = ceil(ix * (T)0.5) * 2 > ix ? T(0.5) : T(1.0 / 6.0);
            ii = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
            ir = memloc(halowidth, blkmemwidth, jj, 0, TL);
            it = memloc(halowidth, blkmemwidth, jj - 1, 0, TL);
            if (BLTL == ib)
            {
                if (ix == 0)
                {
                    if (LBTL == TL)
                    {
                        w3 = 0.5 * (1.0 - w1);
                        w2 = w3;
                        it = ir;
                    }
                    else if (levLBTL < levTL)
                    {
                        w1 = 4.0 / 10.0;
                        w2 = 5.0 / 10.0;
                        w3 = 1.0 / 10.0;
                        it = memloc(halowidth, blkmemwidth, blockDim.x - 1, 0, LBTL);
                    }
                    else if (levLBTL == levTL)
                    {
                        it = memloc(halowidth, blkmemwidth, blockDim.x - 1, 0, LBTL);
                    }
                    else if (levLBTL > levTL)
                    {
                        w1 = 1.0 / 4.0;
                        w2 = 1.0 / 2.0;
                        w3 = 1.0 / 4.0;
                        it = memloc(halowidth, blkmemwidth, blockDim.x - 1, 0, LBTL);
                    }
                }
            }
            else
            {
                if (ix == blockDim.x - 1)
                {
                    if (RBTR == TR)
                    {
                        w3 = 0.5 * (1.0 - w1);
                        w2 = w3;
                        ir = it;
                    }
                    else if (levRBTR < levTR)
                    {
                        w1 = 4.0 / 10.0;
                        w2 = 1.0 / 10.0;
                        w3 = 5.0 / 10.0;
                        ir = memloc(halowidth, blkmemwidth, 0, 0, RBTR);
                    }
                    else if (levRBTR == levTR)
                    {
                        ir = memloc(halowidth, blkmemwidth, 0, 0, RBTR);
                    }
                    else if (levRBTR > levTR)
                    {
                        w1 = 1.0 / 4.0;
                        w2 = 1.0 / 2.0;
                        w3 = 1.0 / 4.0;
                        ir = memloc(halowidth, blkmemwidth, 0, 0, RBTR);
                    }
                }
            }
            a_read = w1 * a[ii] + w2 * a[ir] + w3 * a[it];
        }
        a[write] = a_read;
    }
}

template __global__ void fillTopnew<float>(int halowidth, int nblk, int* active, int* level, int* topleft, int* topright, int* botleft, int* leftbot, int* rightbot, float* a);
template __global__ void fillTopnew<double>(int halowidth, int nblk, int* active, int* level, int* topleft, int* topright, int* botleft, int* leftbot, int* rightbot, double* a);

template <class T> void fillTopFlux(Param XParam, bool doProlongation, int ib, BlockP<T> XBlock, T*& z)
{
    int jj, bb;
    int read, write;
    int ii, ir, it;


    if (XBlock.TopLeft[ib] == ib)//The lower half is a boundary 
    {
        for (int j = 0; j < (XParam.blkwidth / 2); j++)
        {

            read = memloc(XParam, j, XParam.blkwidth - 1, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            write = memloc(XParam, j, XParam.blkwidth, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
            z[write] = z[read];
        }

        if (XBlock.TopRight[ib] == ib) // boundary on the top half too
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, j, XParam.blkwidth - 1, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, XParam.blkwidth, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
        {

            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                write = memloc(XParam, j, XParam.blkwidth, ib);
                jj = (j - (XParam.blkwidth / 2)) * 2;
                ii = memloc(XParam, jj, 0, XBlock.TopRight[ib]);
                //ir = memloc(XParam, jj, 1, XBlock.TopRight[ib]);
                it = memloc(XParam, jj + 1, 0, XBlock.TopRight[ib]);
                //itr = memloc(XParam, jj + 1, 1, XBlock.TopRight[ib]);

                z[write] = T(0.5) * (z[ii] + z[it] );

            }
        }
    }
    else if (XBlock.level[ib] == XBlock.level[XBlock.TopLeft[ib]]) // LeftTop block does not exist
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            //

            write = memloc(XParam, j, XParam.blkwidth, ib);
            read = memloc(XParam, j, 0, XBlock.TopLeft[ib]);
            z[write] = z[read];
        }
    }
    else if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])
    {

        for (int j = 0; j < XParam.blkwidth / 2; j++)
        {

            write = memloc(XParam, j, XParam.blkwidth, ib);

            jj = j * 2;
            bb = XBlock.TopLeft[ib];

            ii = memloc(XParam, jj, 0, bb);
            //ir = memloc(XParam, jj, 1, bb);
            it = memloc(XParam, jj + 1, 0, bb);
            //itr = memloc(XParam, jj + 1, 1, bb);

            z[write] = T(0.5) * (z[ii]  + z[it]);
        }
        //now find out aboy lefttop block
        if (XBlock.TopRight[ib] == ib)
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //

                read = memloc(XParam, j, XParam.blkwidth - 1, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, XParam.blkwidth, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                z[write] = z[read];
            }
        }
        else
        {
            for (int j = (XParam.blkwidth / 2); j < (XParam.blkwidth); j++)
            {
                //
                jj = (j - (XParam.blkwidth / 2)) * 2;
                bb = XBlock.TopRight[ib];

                //read = memloc(XParam, 0, j, ib);// 1 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                write = memloc(XParam, j, XParam.blkwidth, ib); //0 + (j + XParam.halowidth) * XParam.blkmemwidth + ib * XParam.blksize;
                //z[write] = z[read];
                ii = memloc(XParam, jj, 0, bb);
                //ir = memloc(XParam, jj, 1, bb);
                it = memloc(XParam, jj + 1, 0, bb);
                //itr = memloc(XParam, jj + 1, 1, bb);

                z[write] = T(0.5) * (z[ii]  + z[it]);
            }
        }

    }
    else if (XBlock.level[XBlock.TopLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo 
    {
        for (int j = 0; j < XParam.blkwidth; j++)
        {
            write = memloc(XParam, j, XParam.blkwidth, ib);
            int jj = XBlock.BotLeft[XBlock.TopLeft[ib]] == ib ? ftoi(floor(j * (T)0.5)) : ftoi(floor(j * (T)0.5) + XParam.blkwidth / 2);

            ir = memloc(XParam, jj, 0, XBlock.TopLeft[ib]);

            if (doProlongation)
                z[write] = z[ir];

        }
    }



}
template void fillTopFlux<float>(Param XParam, bool doProlongation, int ib, BlockP<float> XBlock, float*& z);
template void fillTopFlux<double>(Param XParam, bool doProlongation, int ib, BlockP<double> XBlock, double*& z);

template <class T> __global__ void fillTopFlux(int halowidth, bool doProlongation, int* active, int* level, int* topleft, int* topright, int* botleft, int* leftbot, int* rightbot, T* a)
{
    unsigned int blkmemwidth = blockDim.x + halowidth * 2;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    unsigned int ix = threadIdx.x;
    //unsigned int iy = blockDim.x - 1;
    unsigned int ibl = blockIdx.x;
    unsigned int ib = active[ibl];

    int TL = topleft[ib];
    int TR = topright[ib];
    //int LBTL = leftbot[TL];
    int BLTL = botleft[TL];
    //int RBTR = rightbot[TR];


    int lev = level[ib];
    int levTL = level[TL];
    //int levTR = level[TR];
    //int levLBTL = level[LBTL];
    //int levRBTR = level[RBTR];

    int write = memloc(halowidth, blkmemwidth, ix, blockDim.x, ib);
    int read;
    int jj, ii, ir, it;
    T a_read;
    //T w1, w2, w3;

    if (TL == ib)
    {
        if (ix < (blockDim.x / 2))
        {
            read = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
            a_read = a[read];
        }
        else
        {
            if (TR == ib)
            {
                read = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
                a_read = a[read];
            }
            else
            {
                jj = (ix - (blockDim.x / 2)) * 2;
                ii = memloc(halowidth, blkmemwidth, jj, 0, TR);
                //ir = memloc(halowidth, blkmemwidth, jj, 1, TR);
                it = memloc(halowidth, blkmemwidth, jj + 1, 0, TR);
                //itr = memloc(halowidth, blkmemwidth, jj + 1, 1, TR);

                a_read = T(0.5) * (a[ii]  + a[it] );
            }
        }
    }
    else if (levTL == lev)
    {
        read = memloc(halowidth, blkmemwidth, ix, 0, TL);
        a_read = a[read];
    }
    else if (levTL > lev)
    {
        if (ix < (blockDim.x / 2))
        {
            jj = ix * 2;


            ii = memloc(halowidth, blkmemwidth, jj, 0, TL);
            //ir = memloc(halowidth, blkmemwidth, jj, 1, TL);
            it = memloc(halowidth, blkmemwidth, jj + 1, 0, TL);
            //itr = memloc(halowidth, blkmemwidth, jj + 1, 1, TL);
            a_read = T(0.5) * (a[ii] + a[it]);
        }
        else
        {
            if (TR == ib)
            {
                read = memloc(halowidth, blkmemwidth, ix, blockDim.x - 1, ib);
                a_read = a[read];
            }
            else
            {
                jj = (ix - (blockDim.x / 2)) * 2;
                ii = memloc(halowidth, blkmemwidth, jj, 0, TR);
                //ir = memloc(halowidth, blkmemwidth, jj, 1, TR);
                it = memloc(halowidth, blkmemwidth, jj + 1, 0, TR);
                //itr = memloc(halowidth, blkmemwidth, jj + 1, 1, TR);
                a_read = T(0.5) * (a[ii] + a[it]);
            }
        }
    }
    else if (levTL < lev)
    {
        jj = BLTL == ib ? floor(ix * (T)0.5) : floor(ix * (T)0.5) + blockDim.x / 2;

        ir = memloc(halowidth, blkmemwidth, jj, 0, TL);
        if (doProlongation)
            a_read = a[ir];
        else
            a_read =  a[write];
    }
    a[write] = a_read;
}

template __global__ void fillTopFlux<float>(int halowidth, bool doProlongation, int* active, int* level, int* topleft, int* topright, int* botleft, int* leftbot, int* rightbot, float* a);
template __global__ void fillTopFlux<double>(int halowidth, bool doProlongation, int* active, int* level, int* topleft, int* topright, int* botleft, int* leftbot, int* rightbot, double* a);


template <class T> void fillCorners(Param XParam, BlockP<T> XBlock, T*& z)
{
    int ib;

    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        ib = XBlock.active[ibl];
        fillCorners(XParam, ib, XBlock, z);

    }

}
template void fillCorners<float>(Param XParam, BlockP<float> XBlock, float*& z);
template void fillCorners<double>(Param XParam, BlockP<double> XBlock, double*& z);


template <class T> void fillCorners(Param XParam, BlockP<T> XBlock, EvolvingP<T>& Xev)
{
    int ib;

    for (int ibl = 0; ibl < XParam.nblk; ibl++)
    {
        ib = XBlock.active[ibl];
        fillCorners(XParam, ib, XBlock, Xev.h);
        fillCorners(XParam, ib, XBlock, Xev.zs);
        fillCorners(XParam, ib, XBlock, Xev.u);
        fillCorners(XParam, ib, XBlock, Xev.v);
    }

}
template void fillCorners<float>(Param XParam, BlockP<float> XBlock, EvolvingP<float>& Xev);
template void fillCorners<double>(Param XParam, BlockP<double> XBlock, EvolvingP<double>& Xev);


template <class T> void fillCorners(Param XParam, int ib, BlockP<T> XBlock, T*& z)
{
    // Run only this function after the filling the other bit of halo (i.e. fctn fillleft...)
    // Most of the time the cormers are not needed. they are when refining a cell! 


    T zz;
    int write;
    int ii, ir, it, itr;


    // Bottom left corner
    write = memloc(XParam, -1, -1, ib);
    //check that there is a block there and if there is calculate the value depending on the level of that block
    if (XBlock.LeftTop[XBlock.BotLeft[ib]] == XBlock.BotLeft[ib]) // There is no block
    {
        zz = T(0.5) * (z[memloc(XParam, -1, 0, ib)] + z[memloc(XParam, 0, -1, ib)]);
    }
    else if (XBlock.level[XBlock.LeftTop[XBlock.BotLeft[ib]]] == XBlock.level[ib])
    {
        zz = z[memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth - 1, XBlock.LeftTop[XBlock.BotLeft[ib]])];
    }
    else if (XBlock.level[XBlock.LeftTop[XBlock.BotLeft[ib]]] > XBlock.level[ib])
    {
        ii = memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth - 1, XBlock.LeftTop[XBlock.BotLeft[ib]]);
        ir = memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth - 2, XBlock.LeftTop[XBlock.BotLeft[ib]]);
        it = memloc(XParam, XParam.blkwidth - 2, XParam.blkwidth - 1, XBlock.LeftTop[XBlock.BotLeft[ib]]);
        itr = memloc(XParam, XParam.blkwidth - 2, XParam.blkwidth - 2, XBlock.LeftTop[XBlock.BotLeft[ib]]);

        zz = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
    }
    else if (XBlock.level[XBlock.LeftTop[XBlock.BotLeft[ib]]] < XBlock.level[ib])
    {
        ii = memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth - 1, XBlock.LeftTop[XBlock.BotLeft[ib]]);
        ir = memloc(XParam, - 1, 0, ib);
        it = memloc(XParam,0, - 1, ib);
        zz = T(0.5) * z[ii] + T(0.25) * (z[ir] + z[it]);
    }

    z[write] = zz;

    // Top Left corner
    write = memloc(XParam, -1, XParam.blkwidth, ib);
    //check that there is a block there and if there is calculate the value depending on the level of that block
    if (XBlock.LeftBot[XBlock.TopLeft[ib]] == XBlock.TopLeft[ib]) // There is no block
    {
        zz = T(0.5) * (z[memloc(XParam, -1, XParam.blkwidth-1, ib)] + z[memloc(XParam, 0, XParam.blkwidth, ib)]);
    }
    else if (XBlock.level[XBlock.LeftBot[XBlock.TopLeft[ib]]] == XBlock.level[ib])
    {
        zz = z[memloc(XParam, XParam.blkwidth - 1, 0, XBlock.LeftBot[XBlock.TopLeft[ib]])];
    }
    else if (XBlock.level[XBlock.LeftBot[XBlock.TopLeft[ib]]] > XBlock.level[ib])
    {
        ii = memloc(XParam, XParam.blkwidth - 1, 0, XBlock.LeftBot[XBlock.TopLeft[ib]]);
        ir = memloc(XParam, XParam.blkwidth - 1, 1, XBlock.LeftBot[XBlock.TopLeft[ib]]);
        it = memloc(XParam, XParam.blkwidth - 2, 0, XBlock.LeftBot[XBlock.TopLeft[ib]]);
        itr = memloc(XParam, XParam.blkwidth - 2, 1, XBlock.LeftBot[XBlock.TopLeft[ib]]);

        zz = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
    }
    else if (XBlock.level[XBlock.LeftBot[XBlock.TopLeft[ib]]] < XBlock.level[ib])
    {
        ii = memloc(XParam, XParam.blkwidth - 1, 0, XBlock.LeftBot[XBlock.TopLeft[ib]]);
        ir = memloc(XParam, -1, XParam.blkwidth - 1, ib);
        it = memloc(XParam, 0, XParam.blkwidth, ib);
        zz = T(0.5) * z[ii] + T(0.25) * (z[ir] + z[it]);
    }

    z[write] = zz;

    //Top Right corner
    write = memloc(XParam, XParam.blkwidth, XParam.blkwidth, ib);
    //check that there is a block there and if there is calculate the value depending on the level of that block
    if (XBlock.RightBot[XBlock.TopRight[ib]] == XBlock.TopRight[ib]) // There is no block
    {
        zz = T(0.5) * (z[memloc(XParam, XParam.blkwidth, XParam.blkwidth - 1, ib)] + z[memloc(XParam, XParam.blkwidth - 1, XParam.blkwidth, ib)]);
    }
    else if (XBlock.level[XBlock.RightBot[XBlock.TopRight[ib]]] == XBlock.level[ib])
    {
        zz = z[memloc(XParam, 0, 0, XBlock.RightBot[XBlock.TopRight[ib]])];
    }
    else if (XBlock.level[XBlock.RightBot[XBlock.TopRight[ib]]] > XBlock.level[ib])
    {
        ii = memloc(XParam, 0, 0, XBlock.RightBot[XBlock.TopRight[ib]]);
        ir = memloc(XParam, 0, 1, XBlock.RightBot[XBlock.TopRight[ib]]);
        it = memloc(XParam, 1, 0, XBlock.RightBot[XBlock.TopRight[ib]]);
        itr = memloc(XParam, 1, 1, XBlock.RightBot[XBlock.TopRight[ib]]);

        zz = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
    }
    else if (XBlock.level[XBlock.LeftBot[XBlock.TopLeft[ib]]] < XBlock.level[ib])
    {
        ii = memloc(XParam, 0, 0, XBlock.LeftBot[XBlock.TopLeft[ib]]);
        ir = memloc(XParam, XParam.blkwidth, XParam.blkwidth - 1, ib);
        it = memloc(XParam, XParam.blkwidth-1, XParam.blkwidth, ib);
        zz = T(0.5) * z[ii] + T(0.25) * ( z[ir] +  z[it]);
    }

    z[write] = zz;

    //Bot Right corner
    write = memloc(XParam, XParam.blkwidth, -1, ib);
    //check that there is a block there and if there is calculate the value depending on the level of that block
    if (XBlock.RightBot[XBlock.BotRight[ib]] == XBlock.BotRight[ib]) // There is no block
    {
        zz = T(0.5) * (z[memloc(XParam, XParam.blkwidth-1, - 1, ib)] + z[memloc(XParam, XParam.blkwidth , 0, ib)]);
    }
    else if (XBlock.level[XBlock.RightBot[XBlock.BotRight[ib]]] == XBlock.level[ib])
    {
        zz = z[memloc(XParam, 0, XParam.blkwidth - 1, XBlock.RightBot[XBlock.BotRight[ib]])];
    }
    else if (XBlock.level[XBlock.RightBot[XBlock.BotRight[ib]]] > XBlock.level[ib])
    {
        ii = memloc(XParam, 0, XParam.blkwidth - 1, XBlock.RightBot[XBlock.BotRight[ib]]);
        ir = memloc(XParam, 0, XParam.blkwidth - 2, XBlock.RightBot[XBlock.BotRight[ib]]);
        it = memloc(XParam, 1, XParam.blkwidth - 1, XBlock.RightBot[XBlock.BotRight[ib]]);
        itr = memloc(XParam, 1, XParam.blkwidth - 2, XBlock.RightBot[XBlock.BotRight[ib]]);

        zz = T(0.25) * (z[ii] + z[ir] + z[it] + z[itr]);
    }
    else if (XBlock.level[XBlock.RightBot[XBlock.BotRight[ib]]] < XBlock.level[ib])
    {
        ii = memloc(XParam, 0, XParam.blkwidth - 1, XBlock.LeftBot[XBlock.TopLeft[ib]]);
        ir = memloc(XParam, XParam.blkwidth - 1, -1, ib);
        it = memloc(XParam, XParam.blkwidth, 0, ib);
        zz = T(0.5) * z[ii] + T(0.25) * (z[ir] + z[it]);
    }

    z[write] = zz;

}
template void fillCorners<float>(Param XParam, int ib, BlockP<float> XBlock, float*& z);
template void fillCorners<double>(Param XParam, int ib, BlockP<double> XBlock, double*& z);

template <class T> __global__ void fillCornersGPU(Param XParam, BlockP<T> XBlock, T* z)
{
    int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
    int halowidth = XParam.halowidth;
    //unsigned int blksize = blkmemwidth * blkmemwidth;
    int ix = threadIdx.x;
    //int iy = threadIdx.y;
    //unsigned int iy = blockDim.x-1;
    int ibl = blockIdx.x;
    int ib = XBlock.active[ibl];

    int TL = XBlock.TopLeft[ib];
    int TR = XBlock.TopRight[ib];
    int LB = XBlock.LeftBot[ib];
    int LT = XBlock.LeftTop[ib];
    int BL = XBlock.BotLeft[ib];
    int BR = XBlock.BotRight[ib];
    int RB = XBlock.RightBot[ib];
    int RT = XBlock.RightTop[ib];

    //int LBTL = XBlock.leftbot[TL];
    //int BLTL = XBlock.botleft[TL];
    //int RBTR = XBlock.rightbot[TR];

    int iout, ii;


    if (ix == 0)
    {
        // Bot left corner

        iout = memloc(halowidth, blkmemwidth, -1, -1, ib);


        if (BL == ib && LB == ib)//
        {
            ii = memloc(halowidth, blkmemwidth, 0, 0, ib);
        }
        else
        {
            if (BL != ib)
            {
                ii = memloc(halowidth, blkmemwidth, -1, XParam.blkwidth - 1, BL);
            }
            else
            {
                ii = memloc(halowidth, blkmemwidth, XParam.blkwidth - 1, -1, LB);
            }

        }
        z[iout] = z[ii];
    }
    if (ix == 1)
    {

        // Top left corner
        iout = memloc(halowidth, blkmemwidth, -1, XParam.blkwidth, ib);
        if (TL == ib && LT == ib)//
        {
            ii = memloc(halowidth, blkmemwidth, 0, XParam.blkwidth - 1, ib);
        }
        else
        {
            if (TL != ib)
            {
                ii = memloc(halowidth, blkmemwidth, -1, 0, TL);
            }
            else
            {
                ii = memloc(halowidth, blkmemwidth, XParam.blkwidth, XParam.blkwidth - 1, LT);
            }

        }
        z[iout] = z[ii];

    }
    if (ix == 2)
    {
        // Top right corner
        iout = memloc(halowidth, blkmemwidth, XParam.blkwidth, XParam.blkwidth, ib);
        if (TR == ib && RT == ib)//
        {
            ii = memloc(halowidth, blkmemwidth, XParam.blkwidth - 1, XParam.blkwidth - 1, ib);
        }
        else
        {
            if (TR != ib)
            {
                ii = memloc(halowidth, blkmemwidth, XParam.blkwidth, 0, TR);
            }
            else
            {
                ii = memloc(halowidth, blkmemwidth, 0, XParam.blkwidth, RT);
            }

        }
        z[iout] = z[ii];

    }
    if (ix == 3)
    {
        // Bot right corner
        iout = memloc(halowidth, blkmemwidth, XParam.blkwidth, -1, ib);
        if (BR == ib && RB == ib)//
        {
            ii = memloc(halowidth, blkmemwidth, XParam.blkwidth - 1, 0, ib);
        }
        else
        {
            if (BR != ib)
            {
                ii = memloc(halowidth, blkmemwidth, XParam.blkwidth, XParam.blkwidth - 1, BR);
            }
            else
            {
                ii = memloc(halowidth, blkmemwidth, 0, -1, RB);
            }

        }
        z[iout] = z[ii];
    }

}
template __global__ void fillCornersGPU<float>(Param XParam, BlockP<float> XBlock, float* z);
template __global__ void fillCornersGPU<double>(Param XParam, BlockP<double> XBlock, double* z);