Gromacs  2026.2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Macros | Functions | Variables
pme_gather_hip.cpp File Reference
#include "gmxpre.h"
#include <cassert>
#include "gromacs/gpu_utils/hip_kernel_utils.h"
#include "gromacs/gpu_utils/hip_sycl_kernel_utils.h"
#include "gromacs/gpu_utils/typecasts_cuda_hip.h"
#include "gromacs/math/functions.h"
#include "pme_gpu_calculate_splines_hip.h"
#include "pme_gpu_constants.h"
#include "pme_grid.h"
+ Include dependency graph for pme_gather_hip.cpp:

Description

Implements PME force gathering in HIP.

Author
Aleksei Iupinov a.yup.nosp@m.inov.nosp@m.@gmai.nosp@m.l.co.nosp@m.m

Macros

#define INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, parallelExecutionWidth)
 Kernel instantiations. More...
 
#define INSTANTIATE_2(order, numGrids, threadsPerAtom, parallelExecutionWidth)
 
#define INSTANTIATE(order, parallelExecutionWidth)
 

Functions

static __device__ float readGridSize (const float *realGridSizeFP, const int dimIndex)
 An inline HIP function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 
template<int order, int atomDataSize, int blockSize, int parallelExecutionWidth>
static __device__ void reduceAtomForces (float3 *__restrict__ sm_forces, const int atomIndexLocal, const int splineIndex, const float *realGridSizeFP, float &fx, float &fy, float &fz)
 Reduce the partial force contributions. More...
 
template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
static __device__ void sumForceComponents (float *__restrict__ fx, float *__restrict__ fy, float *__restrict__ fz, const int ithyMin, const int ithyMax, const int ixBase, const int iz, const int nx, const int ny, const int pny, const int pnz, const int atomIndexLocal, const int splineIndexBase, const float2 tdz, const int *__restrict__ sm_gridlineIndices, const float *__restrict__ sm_theta, const float *__restrict__ sm_dtheta, const float *__restrict__ gm_grid)
 Calculate the sum of the force partial components (in X, Y and Z) More...
 
static __device__ void calculateAndStoreGridForces (float3 *__restrict__ sm_forces, const int forceIndexLocal, const int forceIndexGlobal, const float recipBox[DIM][DIM], const float scale, const float *__restrict__ gm_coefficients)
 Calculate the grid forces and store them in shared memory. More...
 
template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int parallelExecutionWidth>
 __launch_bounds__ (sc_gatherMaxThreadsPerBlock< parallelExecutionWidth >, sc_gatherMinBlocksPerMP< parallelExecutionWidth >) __global__ void pmeGatherKernel(const PmeGpuKernelParams kernelParams)
 A HIP kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z. More...
 
template __global__ void pmeGatherKernel< 4, true, true, 1, true, ThreadsPerAtom::Order, 32 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 1, false, ThreadsPerAtom::Order, 32 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 1, true, ThreadsPerAtom::OrderSquared, 32 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 1, false, ThreadsPerAtom::OrderSquared, 32 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 2, true, ThreadsPerAtom::Order, 32 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 2, false, ThreadsPerAtom::Order, 32 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 2, true, ThreadsPerAtom::OrderSquared, 32 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 2, false, ThreadsPerAtom::OrderSquared, 32 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 1, true, ThreadsPerAtom::Order, 64 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 1, false, ThreadsPerAtom::Order, 64 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 1, true, ThreadsPerAtom::OrderSquared, 64 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 1, false, ThreadsPerAtom::OrderSquared, 64 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 2, true, ThreadsPerAtom::Order, 64 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 2, false, ThreadsPerAtom::Order, 64 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 2, true, ThreadsPerAtom::OrderSquared, 64 > (PmeGpuKernelParams kernelParams)
 
template __global__ void pmeGatherKernel< 4, true, true, 2, false, ThreadsPerAtom::OrderSquared, 64 > (PmeGpuKernelParams kernelParams)
 

Variables

template<int parallelExecutionWidth>
static constexpr int sc_gatherMaxThreadsPerBlock = c_gatherMaxWarpsPerBlock * parallelExecutionWidth
 
template<int parallelExecutionWidth>
static constexpr int sc_gatherMinBlocksPerMP
 

Macro Definition Documentation

#define INSTANTIATE (   order,
  parallelExecutionWidth 
)
Value:
INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, parallelExecutionWidth); \
INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, parallelExecutionWidth); \
INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, parallelExecutionWidth); \
INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, parallelExecutionWidth);
Use a number of threads equal to the PME order (ie. 4)
Use a number of threads equal to the square of the PME order (ie. 16)
#define INSTANTIATE_2 (   order,
  numGrids,
  threadsPerAtom,
  parallelExecutionWidth 
)
Value:
INSTANTIATE_3(order, numGrids, true, threadsPerAtom, parallelExecutionWidth); \
INSTANTIATE_3(order, numGrids, false, threadsPerAtom, parallelExecutionWidth);
#define INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, parallelExecutionWidth)
Kernel instantiations.
Definition: pme_gather_hip.cpp:553
static int numGrids(const GridSet::DomainSetup &domainSetup)
Returns the number of search grids.
Definition: gridset.cpp:67
#define INSTANTIATE_3 (   order,
  numGrids,
  readGlobal,
  threadsPerAtom,
  parallelExecutionWidth 
)
Value:
template __global__ void pmeGatherKernel<order, true, true, numGrids, readGlobal, threadsPerAtom, parallelExecutionWidth>( \
PmeGpuKernelParams kernelParams);
int PmeGpuKernelParams
A dummy typedef for the GPU kernel arguments data placeholder on non-GPU builds.
Definition: pme_gpu_types_host.h:79

Kernel instantiations.

Function Documentation

template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int parallelExecutionWidth>
__launch_bounds__ ( sc_gatherMaxThreadsPerBlock< parallelExecutionWidth >  ,
sc_gatherMinBlocksPerMP< parallelExecutionWidth >   
) const

A HIP kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z.

Template Parameters
orderThe PME order (must be 4 currently).
wrapXTells if the grid is wrapped in the X dimension.
wrapYTells if the grid is wrapped in the Y dimension.
numGridsThe number of grids to use in the kernel. Can be 1 or 2.
readGlobalTells if we should read spline values from global memory
threadsPerAtomHow many threads work on each atom
parallelExecutionWidthHow large the wave size is.

param[in] kernelParams All the PME GPU data.

static __device__ void calculateAndStoreGridForces ( float3 *__restrict__  sm_forces,
const int  forceIndexLocal,
const int  forceIndexGlobal,
const float  recipBox[DIM][DIM],
const float  scale,
const float *__restrict__  gm_coefficients 
)
inlinestatic

Calculate the grid forces and store them in shared memory.

Parameters
[in,out]sm_forcesShared memory array with the output forces.
[in]forceIndexLocalThe local (per thread) index in the sm_forces array.
[in]forceIndexGlobalThe index of the thread in the gm_coefficients array.
[in]recipBoxThe reciprocal box.
[in]scaleThe scale to use when calculating the forces. For gm_coefficientsB (when using multiple coefficients on a single grid) the scale will be (1.0 - scale).
[in]gm_coefficientsGlobal memory array of the coefficients to use for an unperturbed or FEP in state A if a single grid is used (multiCoefficientsSingleGrid == true).If two separate grids are used this should be the coefficients of the grid in question.
template<int order, int atomDataSize, int blockSize, int parallelExecutionWidth>
static __device__ void reduceAtomForces ( float3 *__restrict__  sm_forces,
const int  atomIndexLocal,
const int  splineIndex,
const float *  realGridSizeFP,
float &  fx,
float &  fy,
float &  fz 
)
inlinestatic

Reduce the partial force contributions.

Template Parameters
orderThe PME order (must be 4).
atomDataSizeThe number of partial force contributions for each atom (currently order^2 == 16)
blockSizeThe HIP block size
Parameters
[out]sm_forcesShared memory array with the output forces (number of elements is number of atoms per block)
[in]atomIndexLocalLocal atom index
[in]splineIndexSpline index
[in]realGridSizeFPLocal grid size constant
[in]fxInput force partial component X
[in]fyInput force partial component Y
[in]fzInput force partial component Z
template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
static __device__ void sumForceComponents ( float *__restrict__  fx,
float *__restrict__  fy,
float *__restrict__  fz,
const int  ithyMin,
const int  ithyMax,
const int  ixBase,
const int  iz,
const int  nx,
const int  ny,
const int  pny,
const int  pnz,
const int  atomIndexLocal,
const int  splineIndexBase,
const float2  tdz,
const int *__restrict__  sm_gridlineIndices,
const float *__restrict__  sm_theta,
const float *__restrict__  sm_dtheta,
const float *__restrict__  gm_grid 
)
inlinestatic

Calculate the sum of the force partial components (in X, Y and Z)

Template Parameters
orderThe PME order (must be 4).
atomsPerWarpThe number of atoms per GPU warp.
wrapXTells if the grid is wrapped in the X dimension.
wrapYTells if the grid is wrapped in the Y dimension.
Parameters
[out]fxThe force partial component in the X dimension.
[out]fyThe force partial component in the Y dimension.
[out]fzThe force partial component in the Z dimension.
[in]ithyMinThe thread minimum index in the Y dimension.
[in]ithyMaxThe thread maximum index in the Y dimension.
[in]ixBaseThe grid line index base value in the X dimension.
[in]izThe grid line index in the Z dimension.
[in]nxThe grid real size in the X dimension.
[in]nyThe grid real size in the Y dimension.
[in]pnyThe padded grid real size in the Y dimension.
[in]pnzThe padded grid real size in the Z dimension.
[in]atomIndexLocalThe atom index for this thread.
[in]splineIndexBaseThe base value of the spline parameter index.
[in]tdzThe theta and dtheta in the Z dimension.
[in]sm_gridlineIndicesShared memory array of grid line indices.
[in]sm_thetaShared memory array of atom theta values.
[in]sm_dthetaShared memory array of atom dtheta values.
[in]gm_gridGlobal memory array of the grid to use.

Variable Documentation

template<int parallelExecutionWidth>
constexpr int sc_gatherMinBlocksPerMP
static
Initial value:
=
1024 / sc_gatherMaxThreadsPerBlock<parallelExecutionWidth>