File Gradients.cu
File List > src > Gradients.cu
Go to the documentation of this file
#include "Gradients.h"
template <class T> void gradientGPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad,T* zb)
{
//const int num_streams = 4;
/*
cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++)
{
CUDA_CHECK(cudaStreamCreate(&streams[i]));
}
*/
dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1);
dim3 blockDimLR(1, XParam.blkwidth, 1);
dim3 blockDimBT(XParam.blkwidth, 1, 1);
dim3 blockDimLR2(2, XParam.blkwidth, 1);
dim3 blockDimBT2(XParam.blkwidth, 2, 1);
dim3 blockDimfull(XParam.blkmemwidth, XParam.blkmemwidth, 1);
dim3 gridDim(XParam.nblk, 1, 1);
//gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy);
gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy);
gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy);
CUDA_CHECK(cudaDeviceSynchronize());
//CUDA_CHECK(cudaDeviceSynchronize());
/*
for (int i = 0; i < num_streams; i++)
{
cudaStreamDestroy(streams[i]);
}
*/
//fillHaloGPU(XParam, XBlock, XGrad);
gradientHaloGPU(XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradientHaloGPU(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradientHaloGPU(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
gradientHaloGPU(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
if (XParam.conserveElevation)
{
conserveElevationGradHaloGPU(XParam, XBlock, XEv.h, XEv.zs, zb, XGrad.dhdx, XGrad.dzsdx, XGrad.dhdy, XGrad.dzsdy);
}
//else
{
if (XParam.maxlevel > XParam.minlevel)
{
refine_linearGPU(XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
//refine_linearGPU(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
refine_linearGPU(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
refine_linearGPU(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
if (XParam.conserveElevation)
{
conserveElevationGPU(XParam, XBlock, XEv, zb);
}
RecalculateZsGPU <<< gridDim, blockDimfull, 0 >>> (XParam, XBlock, XEv, zb);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy);
gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy);
gradient <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy);
CUDA_CHECK(cudaDeviceSynchronize());
/*
gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeY <<< gridDim, blockDimBT, 0>>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdy);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdy);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy);
CUDA_CHECK(cudaDeviceSynchronize());
*/
gradientHaloGPU(XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradientHaloGPU(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradientHaloGPU(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
gradientHaloGPU(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
if (XParam.conserveElevation)
{
conserveElevationGradHaloGPU(XParam, XBlock, XEv.h, XEv.zs, zb, XGrad.dhdx, XGrad.dzsdx, XGrad.dhdy, XGrad.dzsdy);
}
}
if (XParam.engine == 1)
{
// wet slope limiter
WetsloperesetXGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
WetsloperesetYGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
// ALso do the slope limiter on the halo
WetsloperesetHaloLeftGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
WetsloperesetHaloRightGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
WetsloperesetHaloBotGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
WetsloperesetHaloTopGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
}
}
//conserveElevationGradHaloGPU(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
//conserveElevationGradHaloGPU(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
//conserveElevationGradHaloGPU(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
}
template void gradientGPU<float>(Param XParam, BlockP<float>XBlock, EvolvingP<float> XEv, GradientsP<float> XGrad, float * zb);
template void gradientGPU<double>(Param XParam, BlockP<double>XBlock, EvolvingP<double> XEv, GradientsP<double> XGrad, double * zb);
template <class T> void gradientGPUnew(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
dim3 blockDim(XParam.blkwidth, XParam.blkwidth, 1);
dim3 blockDimLR(1, XParam.blkwidth, 1);
dim3 blockDimBT(XParam.blkwidth, 1, 1);
dim3 blockDimLR2(2, XParam.blkwidth, 1);
dim3 blockDimBT2(XParam.blkwidth, 2, 1);
dim3 blockDimfull(XParam.blkmemwidth, XParam.blkmemwidth, 1);
dim3 gridDim(XParam.nblk, 1, 1);
const int num_streams = 4;
cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++)
{
CUDA_CHECK(cudaStreamCreate(&streams[i]));
}
//gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradientSMC <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx, XGrad.dhdy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradientSMC <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy);
gradientSMC <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx, XGrad.dudy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy);
gradientSMC <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx, XGrad.dvdy);
//CUDA_CHECK(cudaDeviceSynchronize());
//CUDA_CHECK(cudaDeviceSynchronize());
for (int i = 0; i < num_streams; i++)
{
cudaStreamDestroy(streams[i]);
}
//fillHaloGPU(XParam, XBlock, XGrad);
gradientHaloGPUnew(XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradientHaloGPUnew(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradientHaloGPUnew(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
gradientHaloGPUnew(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
if (XParam.maxlevel > XParam.minlevel)
{
if (XParam.conserveElevation)
{
conserveElevationGradHaloGPU(XParam, XBlock, XEv.h, XEv.zs, zb, XGrad.dhdx, XGrad.dzsdx, XGrad.dhdy, XGrad.dzsdy);
}
refine_linearGPU(XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
//refine_linearGPU(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
refine_linearGPU(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
refine_linearGPU(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
if (XParam.conserveElevation)
{
conserveElevationGPU(XParam, XBlock, XEv, zb);
}
else if (XParam.wetdryfix)
{
WetDryProlongationGPU(XParam, XBlock, XEv, zb);
}
RecalculateZsGPU <<< gridDim, blockDimfull, 0 >>> (XParam, XBlock, XEv, zb);
CUDA_CHECK(cudaDeviceSynchronize());
/*
//gradient <<< gridDim, blockDim, 0, streams[1] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx, XGrad.dhdy);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[2] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[3] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy);
gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx, XGrad.dudy);
CUDA_CHECK(cudaDeviceSynchronize());
//gradient <<< gridDim, blockDim, 0, streams[0] >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy);
gradientSM <<< gridDim, blockDim, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx, XGrad.dvdy);
CUDA_CHECK(cudaDeviceSynchronize());
*/
/*
const int num_streams = 16;
cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++)
{
CUDA_CHECK(cudaStreamCreate(&streams[i]));
}
*/
gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdx);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdx);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.h, XGrad.dhdy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeY <<< gridDim, blockDimBT, 0>>> (XParam.halowidth, XParam.blkwidth-1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.h, XGrad.dhdy);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdx);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdx);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.zs, XGrad.dzsdy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.zs, XGrad.dzsdy);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudx);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudx);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.u, XGrad.dudy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.u, XGrad.dudy);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeX <<< gridDim, blockDimLR2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdx);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeX <<< gridDim, blockDimLR, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdx);
//CUDA_CHECK(cudaDeviceSynchronize());
gradientedgeY <<< gridDim, blockDimBT2, 0 >>> (XParam.halowidth, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.delta, XEv.v, XGrad.dvdy);
//CUDA_CHECK(cudaDeviceSynchronize());
//gradientedgeY <<< gridDim, blockDimBT, 0 >>> (XParam.halowidth, XParam.blkwidth - 1, XBlock.active, XBlock.level, (T)XParam.theta, (T)XParam.dx, XEv.v, XGrad.dvdy);
CUDA_CHECK(cudaDeviceSynchronize());
/*
for (int i = 0; i < num_streams; i++)
{
cudaStreamDestroy(streams[i]);
}
*/
gradientHaloGPUnew(XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradientHaloGPUnew(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradientHaloGPUnew(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
gradientHaloGPUnew(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
if (XParam.conserveElevation)
{
conserveElevationGradHaloGPU(XParam, XBlock, XEv.h, XEv.zs, zb, XGrad.dhdx, XGrad.dzsdx, XGrad.dhdy, XGrad.dzsdy);
}
}
if (XParam.engine == 1)
{
// wet slope limiter
WetsloperesetXGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
WetsloperesetYGPU <<< gridDim, blockDim, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
// ALso do the slope limiter on the halo
WetsloperesetHaloLeftGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
WetsloperesetHaloRightGPU <<< gridDim, blockDimLR, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
WetsloperesetHaloBotGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
WetsloperesetHaloTopGPU <<< gridDim, blockDimBT, 0 >>> (XParam, XBlock, XEv, XGrad, zb);
CUDA_CHECK(cudaDeviceSynchronize());
}
//conserveElevationGradHaloGPU(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
//conserveElevationGradHaloGPU(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
//conserveElevationGradHaloGPU(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
}
template void gradientGPUnew<float>(Param XParam, BlockP<float>XBlock, EvolvingP<float> XEv, GradientsP<float> XGrad, float* zb);
template void gradientGPUnew<double>(Param XParam, BlockP<double>XBlock, EvolvingP<double> XEv, GradientsP<double> XGrad, double* zb);
template <class T> __global__ void gradient(int halowidth, int* active, int* level, T theta, T dx, T* a, T* dadx, T* dady)
{
//int *leftblk,int *rightblk,int* topblk, int * botblk,
//int ix = threadIdx.x+1;
//int iy = threadIdx.y+1;
int blkmemwidth = blockDim.x + halowidth * 2;
//unsigned int blksize = blkmemwidth * blkmemwidth;
int ix = threadIdx.x;
int iy = threadIdx.y;
int ibl = blockIdx.x;
int ib = active[ibl];
int lev = level[ib];
T delta = calcres(dx, lev);
int i = memloc(halowidth, blkmemwidth, ix, iy, ib);
//int iright, ileft, ibot;
// shared array index to make the code bit more readable
//unsigned int sx = ix + halowidth;
//unsigned int sy = iy + halowidth;
T a_l, a_t, a_r, a_b,a_i;
a_i = a[i];
a_l = a[memloc(halowidth, blkmemwidth, ix - 1, iy, ib)];
a_t = a[memloc(halowidth, blkmemwidth, ix , iy + 1, ib)];
a_r = a[memloc(halowidth, blkmemwidth, ix + 1, iy, ib)];
a_b = a[memloc(halowidth, blkmemwidth, ix, iy - 1, ib)];
//__shared__ T a_s[18][18];
//__syncthreads();
//__syncwarp;
dadx[i] = minmod2(theta, a_l, a_i, a_r) / delta;
dady[i] = minmod2(theta, a_b, a_i, a_t) / delta;
}
template __global__ void gradient<float>(int halowidth, int* active, int* level, float theta, float dx, float* a, float* dadx, float* dady);
template __global__ void gradient<double>(int halowidth, int* active, int* level, double theta, double dx, double* a, double* dadx, double* dady);
template <class T> __global__ void gradientSM(int halowidth, int* active, int* level, T theta, T dx, T* a, T* dadx, T* dady)
{
//int *leftblk,int *rightblk,int* topblk, int * botblk,
//int ix = threadIdx.x+1;
//int iy = threadIdx.y+1;
int blkmemwidth = blockDim.x + halowidth * 2;
int blksize = blkmemwidth * blkmemwidth;
int ix = threadIdx.x;
int iy = threadIdx.y;
int ibl = blockIdx.x;
int ib = active[ibl];
int lev = level[ib];
T delta = calcres(dx, lev);
int i = memloc(halowidth, blkmemwidth, ix, iy, ib);
int iright, ileft, itop, ibot;
// shared array index to make the code bit more readable
int sx = ix + halowidth;
int sy = iy + halowidth;
__shared__ T a_s[18][18];
a_s[sx][sy] = a[i];
//syncthread is needed here ?
// read the halo around the tile
if (threadIdx.x == blockDim.x - 1)
{
iright = memloc(halowidth, blkmemwidth, ix + 1, iy, ib);
a_s[sx + 1][sy] = a[iright];
}
if (threadIdx.x == 0)
{
ileft = memloc(halowidth, blkmemwidth, ix - 1, iy, ib);;
a_s[sx - 1][sy] = a[ileft];
}
if (threadIdx.y == blockDim.y - 1)
{
itop = memloc(halowidth, blkmemwidth, ix, iy + 1, ib);;
a_s[sx][sy + 1] = a[itop];
}
if (threadIdx.y == 0)
{
ibot = memloc(halowidth, blkmemwidth, ix, iy - 1, ib);
a_s[sx][sy - 1] = a[ibot];
}
__syncthreads();
dadx[i] = minmod2(theta, a_s[sx - 1][sy], a_s[sx][sy], a_s[sx + 1][sy]) / delta;
dady[i] = minmod2(theta, a_s[sx][sy - 1], a_s[sx][sy], a_s[sx][sy + 1]) / delta;
}
template __global__ void gradientSM<float>(int halowidth, int* active, int* level, float theta, float dx, float* a, float* dadx, float* dady);
template __global__ void gradientSM<double>(int halowidth, int* active, int* level, double theta, double dx, double* a, double* dadx, double* dady);
template <class T> __global__ void gradientSMB(int halowidth, int* active, int* level, T theta, T dx, T* a, T* dadx, T* dady)
{
//int *leftblk,int *rightblk,int* topblk, int * botblk,
//int ix = threadIdx.x+1;
//int iy = threadIdx.y+1;
int blkmemwidth = 18;
int blksize = blkmemwidth * blkmemwidth;
int ix = threadIdx.x-1;
int iy = threadIdx.y-1;
int ibl = blockIdx.x;
int ib = active[ibl];
int lev = level[ib];
T delta = calcres(dx, lev);
int iright, ileft, itop, ibot;
// shared array index to make the code bit more readable
int sx = ix + 1;
int sy = iy + 1;
int i = memloc(halowidth, blkmemwidth, ix, iy, ib);
//int o = memloc(halowidth, blkmemwidth, sx, sy, ib);
__shared__ T a_s[18][18];
a_s[sx][sy] = a[i];
__syncthreads();
//syncthread is needed here ?
T aleft, aright, atop, abot;
aleft = a_s[sx - 1][sy];
aright = a_s[sx + 1][sy];
atop = a_s[sx][sy + 1];
abot = a_s[sx][sy - 1];
if (ix >= 0 && ix < 16 && iy >=0 && iy < 16)
{
dadx[i] = minmod2(theta, aleft, a_s[sx][sy], aright) / delta;
dady[i] = minmod2(theta, abot, a_s[sx][sy], atop) / delta;
}
}
template __global__ void gradientSMB<float>(int halowidth, int* active, int* level, float theta, float dx, float* a, float* dadx, float* dady);
template __global__ void gradientSMB<double>(int halowidth, int* active, int* level, double theta, double dx, double* a, double* dadx, double* dady);
template <class T> __global__ void gradientSMC(int halowidth, int* active, int* level, T theta, T dx, T* a, T* dadx, T* dady)
{
//int *leftblk,int *rightblk,int* topblk, int * botblk,
//int ix = threadIdx.x+1;
//int iy = threadIdx.y+1;
int blkmemwidth = blockDim.x + halowidth * 2;
int flatenwidth = blockDim.x * blockDim.y;
int blksize = blkmemwidth * blkmemwidth;
int ix = threadIdx.x;
int iy = threadIdx.y;
int ibl = blockIdx.x;
int ib = active[ibl];
int lev = level[ib];
T delta = calcres(dx, lev);
int ism =ix + iy * blockDim.x;
int istore = ism + ib * (blkmemwidth * blkmemwidth);
//(i + halowidth) + (j + halowidth) * blkmemwidth + ib * (blkmemwidth * blkmemwidth);
//memloc(halowidth, blkmemwidth, ix, iy, ib);
int ileft, iright, itop, ibot, i, iwrite;
// shared array index to make the code bit more readable
int sx = ix + halowidth;
int sy = iy + halowidth;
__shared__ T a_s[324];
//__shared__ T dadx_out[18 * 18];
//__shared__ T dady_out[18 * 18];
//__shared__ T a_s[324];
//__shared__ T a_left[324];
//__shared__ T a_right[324];
//__shared__ T a_top[324];
//__shared__ T a_bot[324];
//(i + halowidth) + (j + halowidth) * blkmemwidth + ib * (blkmemwidth * blkmemwidth);
//memloc(halowidth, blkmemwidth, ix, iy, ib);
//
a_s[ism] = a[istore];
if (ism < (324 - (flatenwidth)))
{
a_s[ism + flatenwidth] = a[istore + flatenwidth];
}
__syncthreads();
i = memloc(halowidth, blkmemwidth, ix, iy, 0);
ileft = memloc(halowidth, blkmemwidth, ix - 1, iy, 0);
iright = memloc(halowidth, blkmemwidth, ix + 1, iy, 0);
itop = memloc(halowidth, blkmemwidth, ix, iy + 1, 0);
ibot = memloc(halowidth, blkmemwidth, ix, iy - 1, 0);
iwrite = memloc(halowidth, blkmemwidth, ix, iy, ib);
//a_left[i] = a_s[ileft];
//a_right[i] = a_s[iright];
//a_top[i] = a_s[itop];
//a_bot[i] = a_s[ibot];
//dadx[iwrite] = minmod2(theta, a_left[i], a_s[i], a_right[i]) / delta;
//dady[iwrite] = minmod2(theta, a_bot[i], a_s[i], a_top[i]) / delta;
dadx[iwrite] = minmod2(theta, a_s[ileft], a_s[i], a_s[iright]) / delta;
dady[iwrite] = minmod2(theta, a_s[ibot], a_s[i], a_s[itop]) / delta;
/*
dadx_out[i] = minmod2(theta, a_s[ileft], a_s[i], a_s[iright]) / delta;
dady_out[i] = minmod2(theta, a_s[ibot], a_s[i], a_s[itop]) / delta;
dadx[istore] = dadx_out[ism];// = a[istore];
dady[istore] = dady_out[ism];
if (ism < (324 - (flatenwidth)))
{
dadx[istore + flatenwidth] = dadx_out[ism + flatenwidth];
dady[istore + flatenwidth] = dady_out[ism + flatenwidth];
}
*/
}
template __global__ void gradientSMC<float>(int halowidth, int* active, int* level, float theta, float dx, float* a, float* dadx, float* dady);
template __global__ void gradientSMC<double>(int halowidth, int* active, int* level, double theta, double dx, double* a, double* dadx, double* dady);
template <class T> __global__ void gradientedgeX(int halowidth, int* active, int* level, T theta, T dx, T* a, T* dadx)
{
//int *leftblk,int *rightblk,int* topblk, int * botblk,
//int ix = threadIdx.x+1;
//int iy = threadIdx.y+1;
int blkmemwidth = blockDim.y + halowidth * 2;
//unsigned int blksize = blkmemwidth * blkmemwidth;
//unsigned int ix = threadIdx.x;
int ix;
int iy = threadIdx.y;
int ibl = blockIdx.x;
int ib = active[ibl];
if (threadIdx.x == 0)
{
ix = 0;
}
else
{
ix = blockDim.y - 1;
}
int lev = level[ib];
T delta = calcres(dx, lev);
int i = memloc(halowidth, blkmemwidth, ix, iy, ib);
//int iright, ileft, ibot;
// shared array index to make the code bit more readable
//unsigned int sx = ix + halowidth;
//unsigned int sy = iy + halowidth;
T a_l, a_r, a_i;
a_i = a[i];
a_l = a[memloc(halowidth, blkmemwidth, ix - 1, iy, ib)];
//a_t = a[memloc(halowidth, blkmemwidth, ix, iy + 1, ib)];
a_r = a[memloc(halowidth, blkmemwidth, ix + 1, iy, ib)];
//a_b = a[memloc(halowidth, blkmemwidth, ix, iy - 1, ib)];
//__shared__ T a_s[18][18];
//__syncthreads();
//__syncwarp;
dadx[i] = minmod2(theta, a_l, a_i, a_r) / delta;
}
template __global__ void gradientedgeX<float>(int halowidth, int* active, int* level, float theta, float dx, float* a, float* dadx);
template __global__ void gradientedgeX<double>(int halowidth, int* active, int* level, double theta, double dx, double* a, double* dadx);
template <class T> __global__ void gradientedgeY(int halowidth, int* active, int* level, T theta, T dx, T* a, T* dady)
{
//int *leftblk,int *rightblk,int* topblk, int * botblk,
//int ix = threadIdx.x+1;
//int iy = threadIdx.y+1;
int blkmemwidth = blockDim.x + halowidth * 2;
//unsigned int blksize = blkmemwidth * blkmemwidth;
int ix = threadIdx.x;
int iy;
//unsigned int iy = threadIdx.y;
int ibl = blockIdx.x;
int ib = active[ibl];
int lev = level[ib];
T delta = calcres(dx, lev);
if (threadIdx.y == 0)
{
iy = 0;
}
else
{
iy = blockDim.x - 1;
}
int i = memloc(halowidth, blkmemwidth, ix, iy, ib);
//int iright, ileft, ibot;
// shared array index to make the code bit more readable
//unsigned int sx = ix + halowidth;
//unsigned int sy = iy + halowidth;
T a_t, a_b, a_i;
a_i = a[i];
//a_l = a[memloc(halowidth, blkmemwidth, ix - 1, iy, ib)];
a_t = a[memloc(halowidth, blkmemwidth, ix, iy + 1, ib)];
//a_r = a[memloc(halowidth, blkmemwidth, ix + 1, iy, ib)];
a_b = a[memloc(halowidth, blkmemwidth, ix, iy - 1, ib)];
//__shared__ T a_s[18][18];
//__syncthreads();
//__syncwarp;
//dadx[i] = minmod2(theta, a_l, a_i, a_r) / delta;
dady[i] = minmod2(theta, a_b, a_i, a_t) / delta;
}
template __global__ void gradientedgeY<float>(int halowidth, int* active, int* level, float theta, float dx, float* a, float* dady);
template __global__ void gradientedgeY<double>(int halowidth, int* active, int* level, double theta, double dx, double* a, double* dady);
template <class T> void gradientC(Param XParam, BlockP<T> XBlock, T* a, T* dadx, T* dady)
{
int i,ib;
int xplus, xminus, yplus, yminus;
T delta;
for (int ibl = 0; ibl < XParam.nblk; ibl++)
{
ib = XBlock.active[ibl];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
for (int iy = 0; iy < XParam.blkwidth; iy++)
{
for (int ix = 0; ix < XParam.blkwidth; ix++)
{
i = memloc(XParam, ix,iy,ib);
//
xplus = memloc(XParam, ix+1, iy, ib);
xminus = memloc(XParam, ix-1, iy, ib);
yplus = memloc(XParam, ix, iy+1, ib);
yminus = memloc(XParam, ix, iy-1, ib);
dadx[i] = minmod2(T(XParam.theta), a[xminus], a[i], a[xplus]) / delta;
dady[i] = minmod2(T(XParam.theta), a[yminus], a[i], a[yplus]) / delta;
}
}
}
}
template void gradientC<float>(Param XParam, BlockP<float> XBlock, float* a, float* dadx, float* dady);
template void gradientC<double>(Param XParam, BlockP<double> XBlock, double* a, double* dadx, double* dady);
template <class T> void gradientCPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
std::thread t0(&gradientC<T>, XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
std::thread t1(&gradientC<T>, XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
std::thread t2(&gradientC<T>, XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
std::thread t3(&gradientC<T>, XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
t0.join();
t1.join();
t2.join();
t3.join();
//fillHalo(XParam, XBlock, XGrad);
gradientHalo(XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradientHalo(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradientHalo(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
gradientHalo(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
if (XParam.conserveElevation)
{
conserveElevationGradHalo(XParam, XBlock, XEv.h, XEv.zs, zb, XGrad.dhdx, XGrad.dzsdx, XGrad.dhdy, XGrad.dzsdy);
}
refine_linear(XParam,XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
refine_linear(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
refine_linear(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
if (XParam.conserveElevation)
{
conserveElevation(XParam, XBlock, XEv, zb);
}
else if (XParam.wetdryfix)
{
WetDryProlongation(XParam, XBlock, XEv, zb);
}
RecalculateZs(XParam, XBlock, XEv, zb);
std::thread t4(&gradientC<T>, XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
std::thread t5(&gradientC<T>, XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
std::thread t6(&gradientC<T>, XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
std::thread t7(&gradientC<T>, XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
t4.join();
t5.join();
t6.join();
t7.join();
gradientHalo(XParam, XBlock, XEv.h, XGrad.dhdx, XGrad.dhdy);
gradientHalo(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
gradientHalo(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
gradientHalo(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdy);
if (XParam.conserveElevation)
{
conserveElevationGradHalo(XParam, XBlock, XEv.h, XEv.zs, zb, XGrad.dhdx, XGrad.dzsdx, XGrad.dhdy, XGrad.dzsdy);
}
if (XParam.engine == 1)
{
WetsloperesetCPU(XParam, XBlock, XEv, XGrad, zb);
WetsloperesetHaloLeftCPU(XParam, XBlock, XEv, XGrad, zb);
WetsloperesetHaloRightCPU(XParam, XBlock, XEv, XGrad, zb);
WetsloperesetHaloBotCPU(XParam, XBlock, XEv, XGrad, zb);
WetsloperesetHaloTopCPU(XParam, XBlock, XEv, XGrad, zb);
}
//conserveElevationGradHalo(XParam, XBlock, XEv.zs, XGrad.dzsdx, XGrad.dzsdy);
//conserveElevationGradHalo(XParam, XBlock, XEv.u, XGrad.dudx, XGrad.dudy);
//conserveElevationGradHalo(XParam, XBlock, XEv.v, XGrad.dvdx, XGrad.dvdyythhhhhhhhhg);
}
template void gradientCPU<float>(Param XParam, BlockP<float>XBlock, EvolvingP<float> XEv, GradientsP<float> XGrad, float * zb);
template void gradientCPU<double>(Param XParam, BlockP<double>XBlock, EvolvingP<double> XEv, GradientsP<double> XGrad, double * zb);
template <class T> void WetsloperesetCPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
int i, ib;
int xplus, xminus, yminus;
T delta;
for (int ibl = 0; ibl < XParam.nblk; ibl++)
{
ib = XBlock.active[ibl];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
for (int iy = 0; iy < XParam.blkwidth; iy++)
{
for (int ix = 0; ix < XParam.blkwidth; ix++)
{
i = memloc(XParam, ix, iy, ib);
//
xplus = memloc(XParam, ix + 1, iy, ib);
xminus = memloc(XParam, ix - 1, iy, ib);
//yplus = memloc(XParam, ix, iy + 1, ib);
yminus = memloc(XParam, ix, iy - 1, ib);
T dzsdxi = XGrad.dzsdx[i];
T dzsdyi = XGrad.dzsdy[i];
//Do X axis
if (utils::sq(dzsdxi) > utils::sq(XGrad.dzbdx[i]))
{
T leftzs, rightzs;
leftzs = XEv.zs[i] - XEv.h[i] - delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
rightzs = XEv.zs[i] - XEv.h[i] + delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
if (leftzs > XEv.zs[xminus] || rightzs > XEv.zs[xplus])
{
XGrad.dzsdx[i] = XGrad.dhdx[i] + XGrad.dzbdx[i];
}
}
//Do Y axis
if (utils::sq(dzsdyi) > utils::sq(XGrad.dzbdy[i]))
{
T botzs, topzs;
botzs = XEv.zs[i] - XEv.h[i] - delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
topzs = XEv.zs[i] - XEv.h[i] + delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
if (botzs > XEv.zs[yminus] || topzs > XEv.zs[yminus])
{
XGrad.dzsdy[i] = XGrad.dhdy[i] + XGrad.dzbdy[i];
}
}
}
}
}
}
template <class T> __global__ void WetsloperesetXGPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = blockDim.x + XParam.halowidth * 2;
//unsigned int blksize = blkmemwidth * blkmemwidth;
unsigned int ix = threadIdx.x;
unsigned int iy = threadIdx.y;
unsigned int ibl = blockIdx.x;
unsigned int ib = XBlock.active[ibl];
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
int i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
int iright, ileft;
iright = memloc(XParam.halowidth, blkmemwidth, ix + 1, iy, ib);
ileft = memloc(XParam.halowidth, blkmemwidth, ix - 1, iy, ib);
T dzsdxi = XGrad.dzsdx[i];
if (utils::sq(dzsdxi) > utils::sq(XGrad.dzbdx[i]))
{
T leftzs, rightzs;
leftzs = XEv.zs[i] - XEv.h[i] - delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
rightzs = XEv.zs[i] - XEv.h[i] + delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
if (leftzs > XEv.zs[ileft] || rightzs > XEv.zs[iright])
{
XGrad.dzsdx[i] = XGrad.dhdx[i] + XGrad.dzbdx[i];
}
}
}
template <class T> __global__ void WetsloperesetYGPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = blockDim.x + XParam.halowidth * 2;
//unsigned int blksize = blkmemwidth * blkmemwidth;
unsigned int ix = threadIdx.x;
unsigned int iy = threadIdx.y;
unsigned int ibl = blockIdx.x;
unsigned int ib = XBlock.active[ibl];
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
int i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
int itop, ibot;
itop = memloc(XParam.halowidth, blkmemwidth, ix, iy + 1, ib);
ibot = memloc(XParam.halowidth, blkmemwidth, ix, iy - 1, ib);
T dzsdyi = XGrad.dzsdy[i];
if (utils::sq(dzsdyi) > utils::sq(XGrad.dzbdy[i]))
{
T botzs, topzs;
botzs = XEv.zs[i] - XEv.h[i] - delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
topzs = XEv.zs[i] - XEv.h[i] + delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
if (botzs > XEv.zs[ibot] || topzs > XEv.zs[itop])
{
XGrad.dzsdy[i] = XGrad.dhdy[i] + XGrad.dzbdy[i];
}
}
}
template <class T> __global__ void WetsloperesetHaloLeftGPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int ix = -1;
int iy = threadIdx.y;
unsigned int ibl = blockIdx.x;
unsigned int ib = XBlock.active[ibl];
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
T zsi, zsright, zsleft;
int i, iright;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
iright = memloc(XParam.halowidth, blkmemwidth, ix + 1, iy, ib);
zsi = XEv.zs[i];
zsright = XEv.zs[iright];
int read, jj, ii, ir, it, itr;
if (XBlock.LeftBot[ib] == ib)//The lower half is a boundary
{
if (iy < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
zsleft = XEv.zs[read];
}
if (XBlock.LeftTop[ib] == ib) // boundary on the top half too
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);
zsleft = XEv.zs[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
zsleft = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.LeftBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), iy, XBlock.LeftBot[ib]);
zsleft = XEv.zs[read];
}
else if (XBlock.level[XBlock.LeftBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.LeftBot[ib];
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, bb);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, bb);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, bb);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, bb);
zsleft = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
//now find out aboy lefttop block
if (XBlock.LeftTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);
zsleft = XEv.zs[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
zsleft = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[XBlock.LeftBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.RightBot[XBlock.LeftBot[ib]] == ib ? ceil(iy * (T)0.5) : ceil(iy * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 1), jj, XBlock.LeftBot[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), jj, XBlock.LeftBot[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 1), jj - 1, XBlock.LeftBot[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), jj - 1, XBlock.LeftBot[ib]);
zsleft = BilinearInterpolation(XEv.zs[itr], XEv.zs[ir], XEv.zs[it], XEv.zs[ii], T(0.0), T(1.0), T(0.0), T(1.0), T(0.75), jr);
}
T dzsdxi = XGrad.dzsdx[i];
if (utils::sq(dzsdxi) > utils::sq(XGrad.dzbdx[i]))
{
T leftzs, rightzs;
leftzs = zsi - XEv.h[i] - delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
rightzs = zsi - XEv.h[i] + delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
if (leftzs > zsleft || rightzs > zsright)
{
XGrad.dzsdx[i] = XGrad.dhdx[i] + XGrad.dzbdx[i];
}
}
}
template <class T> void WetsloperesetHaloLeftCPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int ix = -1;
for (int ibl = 0; ibl < XParam.nblk; ibl++)
{
unsigned int ib = XBlock.active[ibl];
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
T zsi, zsright, zsleft;
for (int iy = 0; iy <= XParam.blkwidth; iy++)
{
int i, iright;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
iright = memloc(XParam.halowidth, blkmemwidth, ix + 1, iy, ib);
zsi = XEv.zs[i];
zsright = XEv.zs[iright];
int read, jj, ii, ir, it, itr;
if (XBlock.LeftBot[ib] == ib)//The lower half is a boundary
{
if (iy < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
zsleft = XEv.zs[read];
}
if (XBlock.LeftTop[ib] == ib) // boundary on the top half too
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);
zsleft = XEv.zs[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
zsleft = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.LeftBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), iy, XBlock.LeftBot[ib]);
zsleft = XEv.zs[read];
}
else if (XBlock.level[XBlock.LeftBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.LeftBot[ib];
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, bb);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, bb);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, bb);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, bb);
zsleft = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
//now find out aboy lefttop block
if (XBlock.LeftTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);
zsleft = XEv.zs[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
zsleft = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[XBlock.LeftBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.RightBot[XBlock.LeftBot[ib]] == ib ? ftoi(ceil(iy * (T)0.5)) : ftoi(ceil(iy * (T)0.5) + XParam.blkwidth / 2);
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 1), jj, XBlock.LeftBot[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), jj, XBlock.LeftBot[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 1), jj - 1, XBlock.LeftBot[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), jj - 1, XBlock.LeftBot[ib]);
zsleft = BilinearInterpolation(XEv.zs[itr], XEv.zs[ir], XEv.zs[it], XEv.zs[ii], T(0.0), T(1.0), T(0.0), T(1.0), T(0.75), jr);
}
T dzsdxi = XGrad.dzsdx[i];
if (utils::sq(dzsdxi) > utils::sq(XGrad.dzbdx[i]))
{
T leftzs, rightzs;
leftzs = zsi - XEv.h[i] - delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
rightzs = zsi - XEv.h[i] + delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
if (leftzs > zsleft || rightzs > zsright)
{
XGrad.dzsdx[i] = XGrad.dhdx[i] + XGrad.dzbdx[i];
}
}
}
}
}
template <class T> __global__ void WetsloperesetHaloRightGPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int ix = XParam.blkwidth;
int iy = threadIdx.y;
unsigned int ibl = blockIdx.x;
unsigned int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int read;
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
int ileft;
ileft = memloc(XParam.halowidth, blkmemwidth, ix - 1, iy, ib);
T zsi, zsleft, zsright;
zsi = XEv.zs[i];
zsleft = XEv.zs[ileft];
T dzsdxi = XGrad.dzsdx[i];
if (XBlock.RightBot[ib] == ib)//The lower half is a boundary
{
if (iy < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
zsright = XEv.zs[read];;
}
if (XBlock.RightTop[ib] == ib) // boundary on the top half too
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);
zsright = XEv.zs[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, XBlock.RightTop[ib]);
zsright = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.RightBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, 1, iy, XBlock.RightBot[ib]);
zsright = XEv.zs[read];
}
else if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.RightBot[ib];
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, bb);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, bb);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, bb);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, bb);
zsright = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
//now find out aboy lefttop block
if (XBlock.RightTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);
zsright = XEv.zs[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, XBlock.RightTop[ib]);
zsright = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[XBlock.RightBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.LeftBot[XBlock.RightBot[ib]] == ib ? ceil(iy * (T)0.5) : ceil(iy * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, 0, jj, XBlock.RightBot[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 1, jj, XBlock.RightBot[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 0, jj - 1, XBlock.RightBot[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 1, jj - 1, XBlock.RightBot[ib]);
zsright = BilinearInterpolation(XEv.zs[it], XEv.zs[ii], XEv.zs[itr], XEv.zs[ir], T(0.0), T(1.0), T(0.0), T(1.0), T(0.25), jr);
}
if (utils::sq(dzsdxi) > utils::sq(XGrad.dzbdx[i]))
{
T leftzs, rightzs;
leftzs = zsi - XEv.h[i] - delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
rightzs = zsi - XEv.h[i] + delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
if (leftzs > zsleft || rightzs > zsright)
{
XGrad.dzsdx[i] = XGrad.dhdx[i] + XGrad.dzbdx[i];
}
}
}
template <class T> void WetsloperesetHaloRightCPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int ix = XParam.blkwidth;
for (int ibl = 0; ibl < XParam.nblk; ibl++)
{
for (int iy = 0; iy < XParam.blkwidth; iy++)
{
unsigned int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int read;
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
int ileft;
ileft = memloc(XParam.halowidth, blkmemwidth, ix - 1, iy, ib);
T zsi, zsleft, zsright;
zsi = XEv.zs[i];
zsleft = XEv.zs[ileft];
T dzsdxi = XGrad.dzsdx[i];
if (XBlock.RightBot[ib] == ib)//The lower half is a boundary
{
if (iy < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
zsright = XEv.zs[read];;
}
if (XBlock.RightTop[ib] == ib) // boundary on the top half too
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);
zsright = XEv.zs[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, XBlock.RightTop[ib]);
zsright = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.RightBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, 1, iy, XBlock.RightBot[ib]);
zsright = XEv.zs[read];
}
else if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.RightBot[ib];
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, bb);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, bb);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, bb);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, bb);
zsright = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
//now find out aboy lefttop block
if (XBlock.RightTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);
zsright = XEv.zs[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, XBlock.RightTop[ib]);
zsright = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[XBlock.RightBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.LeftBot[XBlock.RightBot[ib]] == ib ? ftoi(ceil(iy * (T)0.5)) : ftoi(ceil(iy * (T)0.5) + XParam.blkwidth / 2);
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, 0, jj, XBlock.RightBot[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 1, jj, XBlock.RightBot[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 0, jj - 1, XBlock.RightBot[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 1, jj - 1, XBlock.RightBot[ib]);
zsright = BilinearInterpolation(XEv.zs[it], XEv.zs[ii], XEv.zs[itr], XEv.zs[ir], T(0.0), T(1.0), T(0.0), T(1.0), T(0.25), jr);
}
if (utils::sq(dzsdxi) > utils::sq(XGrad.dzbdx[i]))
{
T leftzs, rightzs;
leftzs = zsi - XEv.h[i] - delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
rightzs = zsi - XEv.h[i] + delta * T(0.5) * (dzsdxi - XGrad.dhdx[i]);
if (leftzs > zsleft || rightzs > zsright)
{
XGrad.dzsdx[i] = XGrad.dhdx[i] + XGrad.dzbdx[i];
}
}
}
}
}
template <class T> __global__ void WetsloperesetHaloBotGPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int iy = -1;
int ix = threadIdx.x;
unsigned int ibl = blockIdx.x;
unsigned int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
int itop,read;
itop = memloc(XParam.halowidth, blkmemwidth, ix, iy + 1, ib);
T zsi, zstop, zsbot;
T dzsdyi = XGrad.dzsdy[i];
zsi = XEv.zs[i];
zstop = XEv.zs[itop];
if (XBlock.BotLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);// or memloc(XParam, -1, j, ib) but they should be the same
zsbot = XEv.zs[read];
}
if (XBlock.BotRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);
zsbot = XEv.zs[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
zsbot = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.BotLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, ix, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
zsbot = XEv.zs[read];
}
else if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.BotLeft[ib];
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), bb);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), bb);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), bb);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), bb);
zsbot = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
//now find out aboy lefttop block
if (XBlock.BotRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);
zsbot = XEv.zs[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
zsbot = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[XBlock.BotLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.TopLeft[XBlock.BotLeft[ib]] == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj - 1, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj - 1, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
zsbot = BilinearInterpolation(XEv.zs[itr], XEv.zs[it], XEv.zs[ir], XEv.zs[ii], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.75));
}
if (utils::sq(dzsdyi) > utils::sq(XGrad.dzbdy[i]))
{
T botzs, topzs;
botzs = zsi - XEv.h[i] - delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
topzs = zsi - XEv.h[i] + delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
if (botzs > zsbot || topzs > zstop)
{
XGrad.dzsdy[i] = XGrad.dhdy[i] + XGrad.dzbdy[i];
}
}
}
template <class T> void WetsloperesetHaloBotCPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int iy = -1;
for (int ibl = 0; ibl < XParam.nblk; ibl++)
{
unsigned int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
for (int ix = 0; ix < XParam.blkwidth; ix++)
{
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
int itop, read;
itop = memloc(XParam.halowidth, blkmemwidth, ix, iy + 1, ib);
T zsi, zstop, zsbot;
T dzsdyi = XGrad.dzsdy[i];
zsi = XEv.zs[i];
zstop = XEv.zs[itop];
if (XBlock.BotLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);// or memloc(XParam, -1, j, ib) but they should be the same
zsbot = XEv.zs[read];
}
if (XBlock.BotRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);
zsbot = XEv.zs[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
zsbot = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.BotLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, ix, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
zsbot = XEv.zs[read];
}
else if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.BotLeft[ib];
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), bb);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), bb);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), bb);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), bb);
zsbot = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
//now find out aboy lefttop block
if (XBlock.BotRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);
zsbot = XEv.zs[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
zsbot = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[XBlock.BotLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.TopLeft[XBlock.BotLeft[ib]] == ib ? ftoi(ceil(ix * (T)0.5)) : ftoi(ceil(ix * (T)0.5) + XParam.blkwidth / 2);
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj - 1, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj - 1, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
zsbot = BilinearInterpolation(XEv.zs[itr], XEv.zs[it], XEv.zs[ir], XEv.zs[ii], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.75));
}
if (utils::sq(dzsdyi) > utils::sq(XGrad.dzbdy[i]))
{
T botzs, topzs;
botzs = zsi - XEv.h[i] - delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
topzs = zsi - XEv.h[i] + delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
if (botzs > zsbot || topzs > zstop)
{
XGrad.dzsdy[i] = XGrad.dhdy[i] + XGrad.dzbdy[i];
}
}
}
}
}
template <class T> __global__ void WetsloperesetHaloTopGPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int iy = XParam.blkwidth;
int ix = threadIdx.x;
unsigned int ibl = blockIdx.x;
unsigned int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
int ibot, read;
ibot = memloc(XParam.halowidth, blkmemwidth, ix, iy - 1, ib);
T zsi, zstop, zsbot;
zsi = XEv.zs[i];
zsbot = XEv.zs[ibot];
T dzsdyi = XGrad.dzsdy[i];
if (XBlock.TopLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);// or memloc(XParam, -1, j, ib) but they should be the same
zstop = XEv.zs[read];
}
if (XBlock.TopRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
zstop = XEv.zs[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, XBlock.TopRight[ib]);
zstop = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.TopLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, ix, 1, XBlock.TopLeft[ib]);
zstop = XEv.zs[read];
}
else if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.TopLeft[ib];;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, bb);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, bb);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, bb);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, bb);
zstop = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
//now find out aboy lefttop block
if (XBlock.TopRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
zstop = XEv.zs[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, XBlock.TopRight[ib]);
zstop = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[XBlock.TopLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.BotLeft[XBlock.TopLeft[ib]] == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, jj, 0, XBlock.TopLeft[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 1, XBlock.TopLeft[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj - 1, 0, XBlock.TopLeft[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj - 1, 1, XBlock.TopLeft[ib]);
zstop = BilinearInterpolation(XEv.zs[it], XEv.zs[itr], XEv.zs[ii], XEv.zs[ir], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.25));
}
if (utils::sq(dzsdyi) > utils::sq(XGrad.dzbdy[i]))
{
T botzs, topzs;
botzs = zsi - XEv.h[i] - delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
topzs = zsi - XEv.h[i] + delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
if (botzs > zsbot || topzs > zstop)
{
XGrad.dzsdy[i] = XGrad.dhdy[i] + XGrad.dzbdy[i];
}
}
}
template <class T> void WetsloperesetHaloTopCPU(Param XParam, BlockP<T>XBlock, EvolvingP<T> XEv, GradientsP<T> XGrad, T* zb)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int iy = XParam.blkwidth;
for (int ibl = 0; ibl < XParam.nblk; ibl++)
{
unsigned int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int lev = XBlock.level[ib];
T delta = calcres(XParam.delta, lev);
for (int ix = 0; ix < XParam.blkwidth; ix++)
{
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
int ibot, read;
ibot = memloc(XParam.halowidth, blkmemwidth, ix, iy - 1, ib);
T zsi, zstop, zsbot;
zsi = XEv.zs[i];
zsbot = XEv.zs[ibot];
T dzsdyi = XGrad.dzsdy[i];
if (XBlock.TopLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);// or memloc(XParam, -1, j, ib) but they should be the same
zstop = XEv.zs[read];
}
if (XBlock.TopRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
zstop = XEv.zs[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, XBlock.TopRight[ib]);
zstop = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.TopLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, ix, 1, XBlock.TopLeft[ib]);
zstop = XEv.zs[read];
}
else if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.TopLeft[ib];;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, bb);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, bb);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, bb);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, bb);
zstop = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
//now find out aboy lefttop block
if (XBlock.TopRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
zstop = XEv.zs[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, XBlock.TopRight[ib]);
zstop = T(0.25) * (XEv.zs[ii] + XEv.zs[ir] + XEv.zs[it] + XEv.zs[itr]);
}
}
}
else if (XBlock.level[XBlock.TopLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.BotLeft[XBlock.TopLeft[ib]] == ib ? ftoi(ceil(ix * (T)0.5)) : ftoi(ceil(ix * (T)0.5) + XParam.blkwidth / 2);
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, jj, 0, XBlock.TopLeft[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 1, XBlock.TopLeft[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj - 1, 0, XBlock.TopLeft[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj - 1, 1, XBlock.TopLeft[ib]);
zstop = BilinearInterpolation(XEv.zs[it], XEv.zs[itr], XEv.zs[ii], XEv.zs[ir], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.25));
}
if (utils::sq(dzsdyi) > utils::sq(XGrad.dzbdy[i]))
{
T botzs, topzs;
botzs = zsi - XEv.h[i] - delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
topzs = zsi - XEv.h[i] + delta * T(0.5) * (dzsdyi - XGrad.dhdy[i]);
if (botzs > zsbot || topzs > zstop)
{
XGrad.dzsdy[i] = XGrad.dhdy[i] + XGrad.dzbdy[i];
}
}
}
}
}
template <class T> void gradientHalo(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
int ib;
//int xplus;
//T delta;
for (int ibl = 0; ibl < XParam.nblk; ibl++)
{
ib = XBlock.active[ibl];
for (int iy = 0; iy < XParam.blkwidth; iy++)
{
gradientHaloLeft(XParam, XBlock, ib, iy, a, dadx, dady);
gradientHaloRight(XParam, XBlock, ib, iy, a, dadx, dady);
}
for (int ix = 0; ix < XParam.blkwidth; ix++)
{
gradientHaloBot(XParam, XBlock, ib, ix, a, dadx, dady);
gradientHaloTop(XParam, XBlock, ib, ix, a, dadx, dady);
}
}
}
template <class T> void gradientHaloGPU(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
dim3 blockDimL(1, XParam.blkwidth, 1);
dim3 blockDimB(XParam.blkwidth, 1, 1);
dim3 gridDim(XParam.nblk, 1, 1);
gradientHaloLeftGPU <<< gridDim, blockDimL, 0 >>> (XParam, XBlock, a, dadx, dady);
CUDA_CHECK(cudaDeviceSynchronize());
gradientHaloRightGPU <<< gridDim, blockDimL, 0 >>> (XParam, XBlock, a, dadx, dady);
CUDA_CHECK(cudaDeviceSynchronize());
gradientHaloBotGPU <<< gridDim, blockDimB, 0 >>> (XParam, XBlock, a, dadx, dady);
CUDA_CHECK(cudaDeviceSynchronize());
gradientHaloTopGPU <<< gridDim, blockDimB, 0 >>> (XParam, XBlock, a, dadx, dady);
CUDA_CHECK(cudaDeviceSynchronize());
}
template <class T> void gradientHaloGPUnew(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
const int num_streams = 4;
cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++)
{
CUDA_CHECK(cudaStreamCreate(&streams[i]));
}
dim3 blockDimL(2, XParam.blkwidth, 1);
dim3 blockDimB(XParam.blkwidth, 2, 1);
dim3 gridDim(ceil(XParam.nblk/2), 1, 1);
gradientHaloLeftGPUnew <<< gridDim, blockDimL, 0, streams[0] >>> (XParam, XBlock, a, dadx, dady);
gradientHaloRightGPUnew <<< gridDim, blockDimL, 0, streams[1] >>> (XParam, XBlock, a, dadx, dady);
gradientHaloBotGPUnew <<< gridDim, blockDimB, 0, streams[2] >>> (XParam, XBlock, a, dadx, dady);
gradientHaloTopGPUnew <<< gridDim, blockDimB, 0, streams[3] >>> (XParam, XBlock, a, dadx, dady);
//CUDA_CHECK(cudaDeviceSynchronize());
for (int i = 0; i < num_streams; i++)
{
cudaStreamDestroy(streams[i]);
}
}
template <class T> void gradientHaloLeft(Param XParam, BlockP<T>XBlock, int ib, int iy, T* a, T* dadx, T* dady)
{
int i, ix, jj, ii, ir, it, itr;
int xplus, read;
T delta, aright, aleft;
ix = -1;
i = memloc(XParam, ix, iy, ib);
xplus = memloc(XParam, ix + 1, iy, ib);
aright = a[xplus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.LeftBot[ib] == ib)//The lower half is a boundary
{
if ( iy < (XParam.blkwidth / 2))
{
read = memloc(XParam, 0, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
aleft = a[read];
}
if (XBlock.LeftTop[ib] == ib) // boundary on the top half too
{
if ( iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam, 0, iy, ib);
aleft = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
aleft = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.LeftBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam, (XParam.blkwidth - 2), iy, XBlock.LeftBot[ib]);
aleft = a[read];
}
else if (XBlock.level[XBlock.LeftBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.LeftBot[ib];
ii = memloc(XParam, (XParam.blkwidth - 3), jj, bb);
ir = memloc(XParam, (XParam.blkwidth - 4), jj, bb);
it = memloc(XParam, (XParam.blkwidth - 3), jj + 1, bb);
itr = memloc(XParam, (XParam.blkwidth - 4), jj + 1, bb);
aleft = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.LeftTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam, 0, iy, ib);
aleft = a[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
aleft = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.LeftBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.RightBot[XBlock.LeftBot[ib]] == ib ? ftoi(ceil(iy * (T)0.5)) : ftoi(ceil(iy * (T)0.5) + XParam.blkwidth / 2);
//T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.75) : T(0.25);// This is the wrong way around
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75); // This is right (e.g. at iy==0 use 0.75 at iy==1 use 0.25)
ii = memloc(XParam, (XParam.blkwidth - 1), jj, XBlock.LeftBot[ib]);
ir = memloc(XParam, (XParam.blkwidth - 2), jj, XBlock.LeftBot[ib]);
it = memloc(XParam, (XParam.blkwidth - 1), jj - 1, XBlock.LeftBot[ib]);
itr = memloc(XParam, (XParam.blkwidth - 2), jj - 1, XBlock.LeftBot[ib]);
aleft = BilinearInterpolation(a[itr], a[ir], a[it], a[ii], T(0.0), T(1.0), T(0.0), T(1.0), T(0.75), jr);
}
dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
//dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
template <class T> void gradientHaloRight(Param XParam, BlockP<T>XBlock, int ib, int iy, T* a, T* dadx, T* dady)
{
int i, ix, jj, ii, ir, it, itr;
int xminus, read;
T delta, aright, aleft;
ix = 16;
i = memloc(XParam, ix, iy, ib);
xminus = memloc(XParam, ix - 1, iy, ib);
aleft = a[xminus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.RightBot[ib] == ib)//The lower half is a boundary
{
if (iy < (XParam.blkwidth / 2))
{
read = memloc(XParam, XParam.blkwidth -1, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
aright = a[read];
}
if (XBlock.RightTop[ib] == ib) // boundary on the top half too
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam, XParam.blkwidth - 1, iy, ib);
aright = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of righttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam, 2, jj + 1, XBlock.RightTop[ib]);
aright = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.RightBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam, 1, iy, XBlock.RightBot[ib]);
aright = a[read];
}
else if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.RightBot[ib];
ii = memloc(XParam, 3, jj, bb);
ir = memloc(XParam, 2, jj, bb);
it = memloc(XParam, 3, jj + 1, bb);
itr = memloc(XParam, 2, jj + 1, bb);
aright = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.RightTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam, XParam.blkwidth - 1, iy, ib);
aright = a[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam, 2, jj + 1, XBlock.RightTop[ib]);
aright = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.RightBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.LeftBot[XBlock.RightBot[ib]] == ib ? ftoi(ceil(iy * (T)0.5)) : ftoi(ceil(iy * (T)0.5) + XParam.blkwidth / 2);
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75);
ii = memloc(XParam, 0, jj, XBlock.RightBot[ib]);
ir = memloc(XParam, 1, jj, XBlock.RightBot[ib]);
it = memloc(XParam, 0, jj - 1, XBlock.RightBot[ib]);
itr = memloc(XParam, 1, jj - 1, XBlock.RightBot[ib]);
aright = BilinearInterpolation(a[it], a[ii], a[itr], a[ir], T(0.0), T(1.0), T(0.0), T(1.0), T(0.25), jr);
}
dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
//dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
template <class T> void gradientHaloBot(Param XParam, BlockP<T>XBlock, int ib, int ix, T* a, T* dadx, T* dady)
{
int i, iy, jj, ii, ir, it, itr;
int yplus, read;
T delta, atop, abot;
iy = -1;
i = memloc(XParam, ix, iy, ib);
yplus = memloc(XParam, ix , iy + 1, ib);
atop = a[yplus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.BotLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam, ix, 0, ib);// or memloc(XParam, -1, j, ib) but they should be the same
abot = a[read];
}
if (XBlock.BotRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam, ix, 0, ib);
abot = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
abot = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.BotLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam, ix, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
abot = a[read];
}
else if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.BotLeft[ib];
ii = memloc(XParam, jj, (XParam.blkwidth - 3), bb);
ir = memloc(XParam, jj, (XParam.blkwidth - 4), bb);
it = memloc(XParam, jj + 1, (XParam.blkwidth - 3), bb);
itr = memloc(XParam, jj + 1, (XParam.blkwidth - 4), bb);
abot = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.BotRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam, ix, 0, ib);
abot = a[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
abot = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.BotLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.TopLeft[XBlock.BotLeft[ib]] == ib ? ftoi(ceil(ix * (T)0.5)) : ftoi(ceil(ix * (T)0.5) + XParam.blkwidth / 2);
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam, jj, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
ir = memloc(XParam, jj, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
it = memloc(XParam, jj - 1, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
itr = memloc(XParam, jj - 1, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
abot = BilinearInterpolation(a[itr], a[it], a[ir], a[ii], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.75));
}
//dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
template <class T> void gradientHaloTop(Param XParam, BlockP<T>XBlock, int ib, int ix, T* a, T* dadx, T* dady)
{
int i, iy, jj, ii, ir, it, itr;
int yminus, read;
T delta, atop, abot;
iy = XParam.blkwidth;
i = memloc(XParam, ix, iy, ib);
yminus = memloc(XParam, ix, XParam.blkwidth-1, ib);
abot = a[yminus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.TopLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam, ix, XParam.blkwidth - 1, ib);// or memloc(XParam, -1, j, ib) but they should be the same
atop = a[read];
}
if (XBlock.TopRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam, ix, XParam.blkwidth - 1, ib);
atop = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam, jj + 1, 2, XBlock.TopRight[ib]);
atop = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.TopLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam, ix, 1, XBlock.TopLeft[ib]);
atop = a[read];
}
else if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.TopLeft[ib];
ii = memloc(XParam, jj, 3, bb);
ir = memloc(XParam, jj, 2, bb);
it = memloc(XParam, jj + 1, 3, bb);
itr = memloc(XParam, jj + 1, 2, bb);
atop = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.TopRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam, ix, XParam.blkwidth - 1, ib);
atop = a[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam, jj + 1, 2, XBlock.TopRight[ib]);
atop = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.TopLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.BotLeft[XBlock.TopLeft[ib]] == ib ? ftoi(ceil(ix * (T)0.5)) : ftoi(ceil(ix * (T)0.5) + XParam.blkwidth / 2);
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam, jj, 0, XBlock.TopLeft[ib]);
ir = memloc(XParam, jj, 1, XBlock.TopLeft[ib]);
it = memloc(XParam, jj - 1, 0, XBlock.TopLeft[ib]);
itr = memloc(XParam, jj - 1, 1, XBlock.TopLeft[ib]);
atop = BilinearInterpolation(a[it], a[itr], a[ii], a[ir], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.25));
}
//dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
template <class T> __global__ void gradientHaloLeftGPU(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int ix = -1;
int iy = threadIdx.y;
unsigned int ibl = blockIdx.x;
unsigned int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int xplus, read;
T delta, aright, aleft;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
xplus = memloc(XParam.halowidth, blkmemwidth, ix + 1, iy, ib);
aright = a[xplus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.LeftBot[ib] == ib)//The lower half is a boundary
{
if (iy < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
aleft = a[read];
}
if (XBlock.LeftTop[ib] == ib) // boundary on the top half too
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);
aleft = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
aleft = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.LeftBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), iy, XBlock.LeftBot[ib]);
aleft = a[read];
}
else if (XBlock.level[XBlock.LeftBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.LeftBot[ib];
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, bb);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, bb);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, bb);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, bb);
aleft = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.LeftTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);
aleft = a[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
aleft = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.LeftBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.RightBot[XBlock.LeftBot[ib]] == ib ? ceil(iy * (T)0.5) : ceil(iy * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 1), jj, XBlock.LeftBot[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), jj, XBlock.LeftBot[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 1), jj - 1, XBlock.LeftBot[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), jj - 1, XBlock.LeftBot[ib]);
aleft = BilinearInterpolation(a[itr], a[ir], a[it], a[ii], T(0.0), T(1.0), T(0.0), T(1.0), T(0.75), jr);
}
dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
//dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
template <class T> __global__ void gradientHaloLeftGPUnew(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int ix = -1;
int iy = threadIdx.y;
int ibl = threadIdx.x + blockIdx.x * blockDim.x;;
if (ibl < XParam.nblk)
{
int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int xplus, read;
T delta, aright, aleft;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
xplus = memloc(XParam.halowidth, blkmemwidth, ix + 1, iy, ib);
aright = a[xplus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.LeftBot[ib] == ib)//The lower half is a boundary
{
if (iy < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
aleft = a[read];
}
if (XBlock.LeftTop[ib] == ib) // boundary on the top half too
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);
aleft = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
aleft = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.LeftBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), iy, XBlock.LeftBot[ib]);
aleft = a[read];
}
else if (XBlock.level[XBlock.LeftBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.LeftBot[ib];
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, bb);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, bb);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, bb);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, bb);
aleft = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.LeftTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, 0, iy, ib);
aleft = a[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj, XBlock.LeftTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj, XBlock.LeftTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 3), jj + 1, XBlock.LeftTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 4), jj + 1, XBlock.LeftTop[ib]);
aleft = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.LeftBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.RightBot[XBlock.LeftBot[ib]] == ib ? ceil(iy * (T)0.5) : ceil(iy * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 1), jj, XBlock.LeftBot[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), jj, XBlock.LeftBot[ib]);
it = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 1), jj - 1, XBlock.LeftBot[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, (XParam.blkwidth - 2), jj - 1, XBlock.LeftBot[ib]);
aleft = BilinearInterpolation(a[itr], a[ir], a[it], a[ii], T(0.0), T(1.0), T(0.0), T(1.0), T(0.75), jr);
}
dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
//dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
}
template <class T> __global__ void gradientHaloRightGPU(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int ix = XParam.blkwidth;
int iy = threadIdx.y;
unsigned int ibl = blockIdx.x;
unsigned int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int xminus, read;
T delta, aright, aleft;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
xminus = memloc(XParam.halowidth, blkmemwidth, ix - 1, iy, ib);
aleft = a[xminus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.RightBot[ib] == ib)//The lower half is a boundary
{
if (iy < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
aright = a[read];;
}
if (XBlock.RightTop[ib] == ib) // boundary on the top half too
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);
aright = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, XBlock.RightTop[ib]);
aright = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.RightBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, 1, iy, XBlock.RightBot[ib]);
aright = a[read];
}
else if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.RightBot[ib];
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, bb);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, bb);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, bb);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, bb);
aright = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.RightTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);
aright = a[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, XBlock.RightTop[ib]);
aright = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.RightBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.LeftBot[XBlock.RightBot[ib]] == ib ? ceil(iy * (T)0.5) : ceil(iy * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, 0, jj, XBlock.RightBot[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 1, jj, XBlock.RightBot[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 0, jj - 1, XBlock.RightBot[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 1, jj - 1, XBlock.RightBot[ib]);
aright = BilinearInterpolation(a[it], a[ii], a[itr], a[ir], T(0.0), T(1.0), T(0.0), T(1.0), T(0.25), jr);
}
dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
//dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
template <class T> __global__ void gradientHaloRightGPUnew(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int ix = XParam.blkwidth;
int iy = threadIdx.y;
int ibl = threadIdx.x + blockIdx.x * blockDim.x;;
if (ibl < XParam.nblk)
{
int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int xminus, read;
T delta, aright, aleft;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
xminus = memloc(XParam.halowidth, blkmemwidth, ix - 1, iy, ib);
aleft = a[xminus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.RightBot[ib] == ib)//The lower half is a boundary
{
if (iy < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);// or memloc(XParam, -1, j, ib) but they should be the same
aright = a[read];;
}
if (XBlock.RightTop[ib] == ib) // boundary on the top half too
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);
aright = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (iy >= (XParam.blkwidth / 2))
{
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, XBlock.RightTop[ib]);
aright = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.RightBot[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, 1, iy, XBlock.RightBot[ib]);
aright = a[read];
}
else if (XBlock.level[XBlock.RightBot[ib]] > XBlock.level[ib])
{
if (iy < (XParam.blkwidth / 2))
{
jj = iy * 2;
int bb = XBlock.RightBot[ib];
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, bb);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, bb);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, bb);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, bb);
aright = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.RightTop[ib] == ib)
{
if (iy >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, XParam.blkwidth - 1, iy, ib);
aright = a[read];
}
}
else
{
if (iy >= (XParam.blkwidth / 2))
{
//
jj = (iy - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, 3, jj, XBlock.RightTop[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 2, jj, XBlock.RightTop[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 3, jj + 1, XBlock.RightTop[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 2, jj + 1, XBlock.RightTop[ib]);
aright = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.RightBot[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.LeftBot[XBlock.RightBot[ib]] == ib ? ceil(iy * (T)0.5) : ceil(iy * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(iy * (T)0.5) * 2 > iy ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, 0, jj, XBlock.RightBot[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, 1, jj, XBlock.RightBot[ib]);
it = memloc(XParam.halowidth, blkmemwidth, 0, jj - 1, XBlock.RightBot[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, 1, jj - 1, XBlock.RightBot[ib]);
aright = BilinearInterpolation(a[it], a[ii], a[itr], a[ir], T(0.0), T(1.0), T(0.0), T(1.0), T(0.25), jr);
}
dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
//dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
}
template <class T> __global__ void gradientHaloBotGPU(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int iy = -1;
int ix = threadIdx.x;
unsigned int ibl = blockIdx.x;
unsigned int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int yplus, read;
T delta, atop, abot;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
yplus = memloc(XParam.halowidth, blkmemwidth, ix, iy + 1, ib);
atop = a[yplus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.BotLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);// or memloc(XParam, -1, j, ib) but they should be the same
abot = a[read];
}
if (XBlock.BotRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);
abot = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
abot = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.BotLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, ix, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
abot = a[read];
}
else if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.BotLeft[ib];
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), bb);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), bb);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), bb);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), bb);
abot = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.BotRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);
abot = a[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
abot = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.BotLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.TopLeft[XBlock.BotLeft[ib]] == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj - 1, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj - 1, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
abot = BilinearInterpolation(a[itr], a[it], a[ir], a[ii], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.75));
}
//dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
template <class T> __global__ void gradientHaloBotGPUnew(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int iy = -1;
int ix = threadIdx.x;
int ibl = threadIdx.y + blockIdx.x * blockDim.y;
if (ibl < XParam.nblk)
{
int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int yplus, read;
T delta, atop, abot;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
yplus = memloc(XParam.halowidth, blkmemwidth, ix, iy + 1, ib);
atop = a[yplus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.BotLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);// or memloc(XParam, -1, j, ib) but they should be the same
abot = a[read];
}
if (XBlock.BotRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);
abot = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
abot = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.BotLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, ix, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
abot = a[read];
}
else if (XBlock.level[XBlock.BotLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.BotLeft[ib];
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), bb);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), bb);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), bb);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), bb);
abot = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.BotRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, 0, ib);
abot = a[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 3), XBlock.BotRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, (XParam.blkwidth - 4), XBlock.BotRight[ib]);
abot = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.BotLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.TopLeft[XBlock.BotLeft[ib]] == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj - 1, (XParam.blkwidth - 1), XBlock.BotLeft[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj - 1, (XParam.blkwidth - 2), XBlock.BotLeft[ib]);
abot = BilinearInterpolation(a[itr], a[it], a[ir], a[ii], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.75));
}
//dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
}
template <class T> __global__ void gradientHaloTopGPU(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int iy = XParam.blkwidth;
int ix = threadIdx.x;
int ibl = blockIdx.x;
int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int yminus, read;
T delta, atop, abot;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
yminus = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
abot = a[yminus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.TopLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);// or memloc(XParam, -1, j, ib) but they should be the same
atop = a[read];
}
if (XBlock.TopRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
atop = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, XBlock.TopRight[ib]);
atop = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.TopLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, ix, 1, XBlock.TopLeft[ib]);
atop = a[read];
}
else if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.TopLeft[ib];;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, bb);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, bb);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, bb);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, bb);
atop = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.TopRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
atop = a[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, XBlock.TopRight[ib]);
atop = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.TopLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.BotLeft[XBlock.TopLeft[ib]] == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, jj, 0, XBlock.TopLeft[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 1, XBlock.TopLeft[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj - 1, 0, XBlock.TopLeft[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj - 1, 1, XBlock.TopLeft[ib]);
atop = BilinearInterpolation(a[it], a[itr], a[ii], a[ir], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.25));
}
//dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
template <class T> __global__ void gradientHaloTopGPUnew(Param XParam, BlockP<T>XBlock, T* a, T* dadx, T* dady)
{
unsigned int blkmemwidth = XParam.blkwidth + XParam.halowidth * 2;
//unsigned int blksize = XParam.blkmemwidth * XParam.blkmemwidth;
int iy = XParam.blkwidth;
int ix = threadIdx.x;
int ibl = threadIdx.y + blockIdx.x * blockDim.y;
if (ibl < XParam.nblk)
{
int ib = XBlock.active[ibl];
int i, jj, ii, ir, it, itr;
int yminus, read;
T delta, atop, abot;
i = memloc(XParam.halowidth, blkmemwidth, ix, iy, ib);
yminus = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
abot = a[yminus];
delta = calcres(T(XParam.delta), XBlock.level[ib]);
if (XBlock.TopLeft[ib] == ib)//The lower half is a boundary
{
if (ix < (XParam.blkwidth / 2))
{
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);// or memloc(XParam, -1, j, ib) but they should be the same
atop = a[read];
}
if (XBlock.TopRight[ib] == ib) // boundary on the top half too
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
atop = a[read];
}
}
else // boundary is only on the bottom half and implicitely level of lefttopib is levelib+1
{
if (ix >= (XParam.blkwidth / 2))
{
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, XBlock.TopRight[ib]);
atop = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[ib] == XBlock.level[XBlock.TopLeft[ib]]) // LeftTop block does not exist
{
read = memloc(XParam.halowidth, blkmemwidth, ix, 1, XBlock.TopLeft[ib]);
atop = a[read];
}
else if (XBlock.level[XBlock.TopLeft[ib]] > XBlock.level[ib])
{
if (ix < (XParam.blkwidth / 2))
{
jj = ix * 2;
int bb = XBlock.TopLeft[ib];;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, bb);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, bb);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, bb);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, bb);
atop = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
//now find out aboy lefttop block
if (XBlock.TopRight[ib] == ib)
{
if (ix >= (XParam.blkwidth / 2))
{
//
read = memloc(XParam.halowidth, blkmemwidth, ix, XParam.blkwidth - 1, ib);
atop = a[read];
}
}
else
{
if (ix >= (XParam.blkwidth / 2))
{
//
jj = (ix - XParam.blkwidth / 2) * 2;
ii = memloc(XParam.halowidth, blkmemwidth, jj, 3, XBlock.TopRight[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 2, XBlock.TopRight[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj + 1, 3, XBlock.TopRight[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj + 1, 2, XBlock.TopRight[ib]);
atop = T(0.25) * (a[ii] + a[ir] + a[it] + a[itr]);
}
}
}
else if (XBlock.level[XBlock.TopLeft[ib]] < XBlock.level[ib]) // Neighbour is coarser; using barycentric interpolation (weights are precalculated) for the Halo
{
jj = XBlock.BotLeft[XBlock.TopLeft[ib]] == ib ? ceil(ix * (T)0.5) : ceil(ix * (T)0.5) + XParam.blkwidth / 2;
T jr = ceil(ix * (T)0.5) * 2 > ix ? T(0.25) : T(0.75);
ii = memloc(XParam.halowidth, blkmemwidth, jj, 0, XBlock.TopLeft[ib]);
ir = memloc(XParam.halowidth, blkmemwidth, jj, 1, XBlock.TopLeft[ib]);
it = memloc(XParam.halowidth, blkmemwidth, jj - 1, 0, XBlock.TopLeft[ib]);
itr = memloc(XParam.halowidth, blkmemwidth, jj - 1, 1, XBlock.TopLeft[ib]);
atop = BilinearInterpolation(a[it], a[itr], a[ii], a[ir], T(0.0), T(1.0), T(0.0), T(1.0), jr, T(0.25));
}
//dadx[i] = minmod2(T(XParam.theta), aleft, a[i], aright) / delta;
dady[i] = minmod2(T(XParam.theta), abot, a[i], atop) / delta;
}
}