|
Gromacs
2026.2
|
#include "gmxpre.h"#include <cassert>#include "gromacs/gpu_utils/hip_kernel_utils.h"#include "gromacs/gpu_utils/hip_sycl_kernel_utils.h"#include "gromacs/gpu_utils/typecasts_cuda_hip.h"#include "gromacs/math/functions.h"#include "pme_gpu_calculate_splines_hip.h"#include "pme_gpu_constants.h"#include "pme_grid.h"
Include dependency graph for pme_gather_hip.cpp:Implements PME force gathering in HIP.
Macros | |
| #define | INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, parallelExecutionWidth) |
| Kernel instantiations. More... | |
| #define | INSTANTIATE_2(order, numGrids, threadsPerAtom, parallelExecutionWidth) |
| #define | INSTANTIATE(order, parallelExecutionWidth) |
Functions | |
| static __device__ float | readGridSize (const float *realGridSizeFP, const int dimIndex) |
| An inline HIP function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations. | |
| template<int order, int atomDataSize, int blockSize, int parallelExecutionWidth> | |
| static __device__ void | reduceAtomForces (float3 *__restrict__ sm_forces, const int atomIndexLocal, const int splineIndex, const float *realGridSizeFP, float &fx, float &fy, float &fz) |
| Reduce the partial force contributions. More... | |
| template<int order, int atomsPerWarp, bool wrapX, bool wrapY> | |
| static __device__ void | sumForceComponents (float *__restrict__ fx, float *__restrict__ fy, float *__restrict__ fz, const int ithyMin, const int ithyMax, const int ixBase, const int iz, const int nx, const int ny, const int pny, const int pnz, const int atomIndexLocal, const int splineIndexBase, const float2 tdz, const int *__restrict__ sm_gridlineIndices, const float *__restrict__ sm_theta, const float *__restrict__ sm_dtheta, const float *__restrict__ gm_grid) |
| Calculate the sum of the force partial components (in X, Y and Z) More... | |
| static __device__ void | calculateAndStoreGridForces (float3 *__restrict__ sm_forces, const int forceIndexLocal, const int forceIndexGlobal, const float recipBox[DIM][DIM], const float scale, const float *__restrict__ gm_coefficients) |
| Calculate the grid forces and store them in shared memory. More... | |
| template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int parallelExecutionWidth> | |
| __launch_bounds__ (sc_gatherMaxThreadsPerBlock< parallelExecutionWidth >, sc_gatherMinBlocksPerMP< parallelExecutionWidth >) __global__ void pmeGatherKernel(const PmeGpuKernelParams kernelParams) | |
| A HIP kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z. More... | |
| template __global__ void | pmeGatherKernel< 4, true, true, 1, true, ThreadsPerAtom::Order, 32 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 1, false, ThreadsPerAtom::Order, 32 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 1, true, ThreadsPerAtom::OrderSquared, 32 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 1, false, ThreadsPerAtom::OrderSquared, 32 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 2, true, ThreadsPerAtom::Order, 32 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 2, false, ThreadsPerAtom::Order, 32 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 2, true, ThreadsPerAtom::OrderSquared, 32 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 2, false, ThreadsPerAtom::OrderSquared, 32 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 1, true, ThreadsPerAtom::Order, 64 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 1, false, ThreadsPerAtom::Order, 64 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 1, true, ThreadsPerAtom::OrderSquared, 64 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 1, false, ThreadsPerAtom::OrderSquared, 64 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 2, true, ThreadsPerAtom::Order, 64 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 2, false, ThreadsPerAtom::Order, 64 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 2, true, ThreadsPerAtom::OrderSquared, 64 > (PmeGpuKernelParams kernelParams) |
| template __global__ void | pmeGatherKernel< 4, true, true, 2, false, ThreadsPerAtom::OrderSquared, 64 > (PmeGpuKernelParams kernelParams) |
Variables | |
| template<int parallelExecutionWidth> | |
| static constexpr int | sc_gatherMaxThreadsPerBlock = c_gatherMaxWarpsPerBlock * parallelExecutionWidth |
| template<int parallelExecutionWidth> | |
| static constexpr int | sc_gatherMinBlocksPerMP |
| #define INSTANTIATE | ( | order, | |
| parallelExecutionWidth | |||
| ) |
| #define INSTANTIATE_2 | ( | order, | |
| numGrids, | |||
| threadsPerAtom, | |||
| parallelExecutionWidth | |||
| ) |
| #define INSTANTIATE_3 | ( | order, | |
| numGrids, | |||
| readGlobal, | |||
| threadsPerAtom, | |||
| parallelExecutionWidth | |||
| ) |
Kernel instantiations.
| __launch_bounds__ | ( | sc_gatherMaxThreadsPerBlock< parallelExecutionWidth > | , |
| sc_gatherMinBlocksPerMP< parallelExecutionWidth > | |||
| ) | const |
A HIP kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z.
| order | The PME order (must be 4 currently). |
| wrapX | Tells if the grid is wrapped in the X dimension. |
| wrapY | Tells if the grid is wrapped in the Y dimension. |
| numGrids | The number of grids to use in the kernel. Can be 1 or 2. |
| readGlobal | Tells if we should read spline values from global memory |
| threadsPerAtom | How many threads work on each atom |
| parallelExecutionWidth | How large the wave size is. |
param[in] kernelParams All the PME GPU data.
|
inlinestatic |
Calculate the grid forces and store them in shared memory.
| [in,out] | sm_forces | Shared memory array with the output forces. |
| [in] | forceIndexLocal | The local (per thread) index in the sm_forces array. |
| [in] | forceIndexGlobal | The index of the thread in the gm_coefficients array. |
| [in] | recipBox | The reciprocal box. |
| [in] | scale | The scale to use when calculating the forces. For gm_coefficientsB (when using multiple coefficients on a single grid) the scale will be (1.0 - scale). |
| [in] | gm_coefficients | Global memory array of the coefficients to use for an unperturbed or FEP in state A if a single grid is used (multiCoefficientsSingleGrid == true).If two separate grids are used this should be the coefficients of the grid in question. |
|
inlinestatic |
Reduce the partial force contributions.
| order | The PME order (must be 4). |
| atomDataSize | The number of partial force contributions for each atom (currently order^2 == 16) |
| blockSize | The HIP block size |
| [out] | sm_forces | Shared memory array with the output forces (number of elements is number of atoms per block) |
| [in] | atomIndexLocal | Local atom index |
| [in] | splineIndex | Spline index |
| [in] | realGridSizeFP | Local grid size constant |
| [in] | fx | Input force partial component X |
| [in] | fy | Input force partial component Y |
| [in] | fz | Input force partial component Z |
|
inlinestatic |
Calculate the sum of the force partial components (in X, Y and Z)
| order | The PME order (must be 4). |
| atomsPerWarp | The number of atoms per GPU warp. |
| wrapX | Tells if the grid is wrapped in the X dimension. |
| wrapY | Tells if the grid is wrapped in the Y dimension. |
| [out] | fx | The force partial component in the X dimension. |
| [out] | fy | The force partial component in the Y dimension. |
| [out] | fz | The force partial component in the Z dimension. |
| [in] | ithyMin | The thread minimum index in the Y dimension. |
| [in] | ithyMax | The thread maximum index in the Y dimension. |
| [in] | ixBase | The grid line index base value in the X dimension. |
| [in] | iz | The grid line index in the Z dimension. |
| [in] | nx | The grid real size in the X dimension. |
| [in] | ny | The grid real size in the Y dimension. |
| [in] | pny | The padded grid real size in the Y dimension. |
| [in] | pnz | The padded grid real size in the Z dimension. |
| [in] | atomIndexLocal | The atom index for this thread. |
| [in] | splineIndexBase | The base value of the spline parameter index. |
| [in] | tdz | The theta and dtheta in the Z dimension. |
| [in] | sm_gridlineIndices | Shared memory array of grid line indices. |
| [in] | sm_theta | Shared memory array of atom theta values. |
| [in] | sm_dtheta | Shared memory array of atom dtheta values. |
| [in] | gm_grid | Global memory array of the grid to use. |
|
static |
1.8.5