Gromacs  2025-dev-20240613-b88f907
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Macros | Functions
pme_gather_sycl.cpp File Reference
#include "gmxpre.h"
#include "pme_gather_sycl.h"
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/gpu_utils/gputraits_sycl.h"
#include "gromacs/gpu_utils/sycl_kernel_utils.h"
#include "gromacs/gpu_utils/syclutils.h"
#include "gromacs/math/functions.h"
#include "gromacs/utility/basedefinitions.h"
#include "pme_gpu_calculate_splines_sycl.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_types_host.h"
#include "pme_grid.h"
+ Include dependency graph for pme_gather_sycl.cpp:

Description

Implements PME force gathering in SYCL.

Author
Andrey Alekseenko al42a.nosp@m.nd@g.nosp@m.mail..nosp@m.com

Macros

#define INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, subGroupSize)   template class PmeGatherKernel<order, true, true, numGrids, readGlobal, threadsPerAtom, subGroupSize>;
 Kernel instantiations.
 
#define INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)
 
#define INSTANTIATE(order, subGroupSize)
 

Functions

float readGridSize (const float *realGridSizeFP, const int dimIndex)
 Use loads from constant address space indexed by constant offsets rather than dynamic index-based accesses to the grid size data to avoid local memory operations and related large overhead. More...
 
template<int order, int atomDataSize, int workGroupSize, int subGroupSize>
void reduceAtomForces (sycl::nd_item< 3 > itemIdx, sycl::local_ptr< Float3 > sm_forces, const int atomIndexLocal, const int splineIndex, const int gmx_unused lineIndex, const float realGridSizeFP[3], float &fx, float &fy, float &fz)
 Reduce the partial force contributions. More...
 
template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
void sumForceComponents (sycl::private_ptr< float > fx, sycl::private_ptr< float > fy, sycl::private_ptr< float > fz, const int ithyMin, const int ithyMax, const int ixBase, const int iz, const int nx, const int ny, const int pny, const int pnz, const int atomIndexLocal, const int splineIndexBase, const sycl::float2 tdz, const sycl::local_ptr< int > sm_gridlineIndices, const sycl::local_ptr< float > sm_theta, const sycl::local_ptr< float > sm_dtheta, const sycl::global_ptr< const float > gm_grid)
 Calculate the sum of the force partial components (in X, Y and Z) More...
 
void calculateAndStoreGridForces (sycl::local_ptr< Float3 > sm_forces, const int forceIndexLocal, const int forceIndexGlobal, const Float3 &recipBox0, const Float3 &recipBox1, const Float3 &recipBox2, const float scale, const sycl::global_ptr< const float > gm_coefficients)
 Calculate the grid forces and store them in shared memory. More...
 
template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
auto pmeGatherKernel (sycl::handler &cgh, const int nAtoms, const float *__restrict__ gm_gridA, const float *__restrict__ gm_gridB, const float *__restrict__ gm_coefficientsA, const float *__restrict__ gm_coefficientsB, const Float3 *__restrict__ gm_coordinates, Float3 *__restrict__ gm_forces, const float *__restrict__ gm_theta, const float *__restrict__ gm_dtheta, const int *__restrict__ gm_gridlineIndices, const float *__restrict__ gm_fractShiftsTable, const int *__restrict__ gm_gridlineIndicesTable, const gmx::IVec tablesOffsets, const gmx::IVec realGridSize, const gmx::RVec realGridSizeFP, const gmx::IVec realGridSizePadded, const gmx::RVec currentRecipBox0, const gmx::RVec currentRecipBox1, const gmx::RVec currentRecipBox2, const float scale)
 A SYCL kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z. More...
 

Macro Definition Documentation

#define INSTANTIATE (   order,
  subGroupSize 
)
Value:
INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize); \
INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize); \
INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
Use a number of threads equal to the PME order (ie. 4)
Use a number of threads equal to the square of the PME order (ie. 16)
#define INSTANTIATE_2 (   order,
  numGrids,
  threadsPerAtom,
  subGroupSize 
)
Value:
INSTANTIATE_3(order, numGrids, true, threadsPerAtom, subGroupSize); \
INSTANTIATE_3(order, numGrids, false, threadsPerAtom, subGroupSize);
static int numGrids(const GridSet::DomainSetup &domainSetup)
Returns the number of search grids.
Definition: gridset.cpp:57
#define INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, subGroupSize)
Kernel instantiations.
Definition: pme_gather_sycl.cpp:694

Function Documentation

void calculateAndStoreGridForces ( sycl::local_ptr< Float3 sm_forces,
const int  forceIndexLocal,
const int  forceIndexGlobal,
const Float3 recipBox0,
const Float3 recipBox1,
const Float3 recipBox2,
const float  scale,
const sycl::global_ptr< const float >  gm_coefficients 
)
inline

Calculate the grid forces and store them in shared memory.

Parameters
[in,out]sm_forcesShared memory array with the output forces.
[in]forceIndexLocalThe local (per thread) index in the sm_forces array.
[in]forceIndexGlobalThe index of the thread in the gm_coefficients array.
[in]recipBox0The reciprocal box (first vector).
[in]recipBox1The reciprocal box (second vector).
[in]recipBox2The reciprocal box (third vector).
[in]scaleThe scale to use when calculating the forces. For gm_coefficientsB (when using multiple coefficients on a single grid) the scale will be (1.0 - scale).
[in]gm_coefficientsGlobal memory array of the coefficients to use for an unperturbed or FEP in state A if a single grid is used (multiCoefficientsSingleGrid == true).If two separate grids are used this should be the coefficients of the grid in question.
template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
auto pmeGatherKernel ( sycl::handler &  cgh,
const int  nAtoms,
const float *__restrict__  gm_gridA,
const float *__restrict__  gm_gridB,
const float *__restrict__  gm_coefficientsA,
const float *__restrict__  gm_coefficientsB,
const Float3 *__restrict__  gm_coordinates,
Float3 *__restrict__  gm_forces,
const float *__restrict__  gm_theta,
const float *__restrict__  gm_dtheta,
const int *__restrict__  gm_gridlineIndices,
const float *__restrict__  gm_fractShiftsTable,
const int *__restrict__  gm_gridlineIndicesTable,
const gmx::IVec  tablesOffsets,
const gmx::IVec  realGridSize,
const gmx::RVec  realGridSizeFP,
const gmx::IVec  realGridSizePadded,
const gmx::RVec  currentRecipBox0,
const gmx::RVec  currentRecipBox1,
const gmx::RVec  currentRecipBox2,
const float  scale 
)

A SYCL kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z.

Template Parameters
orderPME interpolation order.
wrapXA boolean which tells if the grid overlap in dimension X should be wrapped.
wrapYA boolean which tells if the grid overlap in dimension Y should be wrapped.
numGridsThe number of grids to use in the kernel. Can be 1 or 2.
readGlobalTells if we should read spline values from global memory.
threadsPerAtomHow many threads work on each atom.
subGroupSizeSize of the sub-group.
float readGridSize ( const float *  realGridSizeFP,
const int  dimIndex 
)
inline

Use loads from constant address space indexed by constant offsets rather than dynamic index-based accesses to the grid size data to avoid local memory operations and related large overhead.

Drastically reduces register spills on AMD via hipSYCL, and improves performance 10x.

Parameters
[in]realGridSizeFPLocal grid size constant
[in]dimIndexDimension index (XX, YY, ZZ)
Returns
The grid size of the specified dimension.
template<int order, int atomDataSize, int workGroupSize, int subGroupSize>
void reduceAtomForces ( sycl::nd_item< 3 >  itemIdx,
sycl::local_ptr< Float3 sm_forces,
const int  atomIndexLocal,
const int  splineIndex,
const int gmx_unused  lineIndex,
const float  realGridSizeFP[3],
float &  fx,
float &  fy,
float &  fz 
)
inline

Reduce the partial force contributions.

Template Parameters
orderThe PME order (must be 4).
atomDataSizeThe number of partial force contributions for each atom (currently order^2 == 16).
workGroupSizeThe size of a work-group.
subGroupSizeThe size of a sub-group.
Parameters
[in]itemIdxSYCL thread ID.
[out]sm_forcesShared memory array with the output forces (number of elements is number of atoms per block).
[in]atomIndexLocalLocal atom index.
[in]splineIndexSpline index.
[in]lineIndexLine index (same as threadLocalId)
[in]realGridSizeFPLocal grid size constant
[in]fxInput force partial component X
[in]fyInput force partial component Y
[in]fzInput force partial component Z
template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
void sumForceComponents ( sycl::private_ptr< float >  fx,
sycl::private_ptr< float >  fy,
sycl::private_ptr< float >  fz,
const int  ithyMin,
const int  ithyMax,
const int  ixBase,
const int  iz,
const int  nx,
const int  ny,
const int  pny,
const int  pnz,
const int  atomIndexLocal,
const int  splineIndexBase,
const sycl::float2  tdz,
const sycl::local_ptr< int >  sm_gridlineIndices,
const sycl::local_ptr< float >  sm_theta,
const sycl::local_ptr< float >  sm_dtheta,
const sycl::global_ptr< const float >  gm_grid 
)
inline

Calculate the sum of the force partial components (in X, Y and Z)

Template Parameters
orderThe PME order (must be 4).
atomsPerWarpThe number of atoms per GPU warp.
wrapXTells if the grid is wrapped in the X dimension.
wrapYTells if the grid is wrapped in the Y dimension.
Parameters
[out]fxThe force partial component in the X dimension.
[out]fyThe force partial component in the Y dimension.
[out]fzThe force partial component in the Z dimension.
[in]ithyMinThe thread minimum index in the Y dimension.
[in]ithyMaxThe thread maximum index in the Y dimension.
[in]ixBaseThe grid line index base value in the X dimension.
[in]izThe grid line index in the Z dimension.
[in]nxThe grid real size in the X dimension.
[in]nyThe grid real size in the Y dimension.
[in]pnyThe padded grid real size in the Y dimension.
[in]pnzThe padded grid real size in the Z dimension.
[in]atomIndexLocalThe atom index for this thread.
[in]splineIndexBaseThe base value of the spline parameter index.
[in]tdzThe theta and dtheta in the Z dimension.
[in]sm_gridlineIndicesShared memory array of grid line indices.
[in]sm_thetaShared memory array of atom theta values.
[in]sm_dthetaShared memory array of atom dtheta values.
[in]gm_gridGlobal memory array of the grid to use.