#include "gmxpre.h"
#include "pme_gather_sycl.h"
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/gpu_utils/gputraits_sycl.h"
#include "gromacs/gpu_utils/sycl_kernel_utils.h"
#include "gromacs/gpu_utils/syclutils.h"
#include "gromacs/math/functions.h"
#include "gromacs/utility/basedefinitions.h"
#include "pme_gpu_calculate_splines_sycl.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_types_host.h"
#include "pme_grid.h"

Include dependency graph for pme_gather_sycl.cpp:

Description

Implements PME force gathering in SYCL.

Author: Andrey Alekseenko al42a.nosp@m.nd@g.nosp@m.mail..nosp@m.com

Macros
#define	INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, subGroupSize) template class PmeGatherKernel<order, true, true, numGrids, readGlobal, threadsPerAtom, subGroupSize>;
	Kernel instantiations.

#define	INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)

#define	INSTANTIATE(order, subGroupSize)

Functions
float	readGridSize (const float *realGridSizeFP, const int dimIndex)
	Use loads from constant address space indexed by constant offsets rather than dynamic index-based accesses to the grid size data to avoid local memory operations and related large overhead. More...

template<int order, int atomDataSize, int workGroupSize, int subGroupSize>
void	reduceAtomForces (sycl::nd_item< 3 > itemIdx, sycl::local_ptr< Float3 > sm_forces, const int atomIndexLocal, const int splineIndex, const int gmx_unused lineIndex, const float realGridSizeFP[3], float &fx, float &fy, float &fz)
	Reduce the partial force contributions. More...

template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
void	sumForceComponents (sycl::private_ptr< float > fx, sycl::private_ptr< float > fy, sycl::private_ptr< float > fz, const int ithyMin, const int ithyMax, const int ixBase, const int iz, const int nx, const int ny, const int pny, const int pnz, const int atomIndexLocal, const int splineIndexBase, const sycl::float2 tdz, const sycl::local_ptr< int > sm_gridlineIndices, const sycl::local_ptr< float > sm_theta, const sycl::local_ptr< float > sm_dtheta, const sycl::global_ptr< const float > gm_grid)
	Calculate the sum of the force partial components (in X, Y and Z) More...

void	calculateAndStoreGridForces (sycl::local_ptr< Float3 > sm_forces, const int forceIndexLocal, const int forceIndexGlobal, const Float3 &recipBox0, const Float3 &recipBox1, const Float3 &recipBox2, const float scale, const sycl::global_ptr< const float > gm_coefficients)
	Calculate the grid forces and store them in shared memory. More...

template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
auto	pmeGatherKernel (sycl::handler &cgh, const int nAtoms, const float __restrict__ gm_gridA, const float __restrict__ gm_gridB, const float __restrict__ gm_coefficientsA, const float __restrict__ gm_coefficientsB, const Float3 __restrict__ gm_coordinates, Float3 __restrict__ gm_forces, const float __restrict__ gm_theta, const float __restrict__ gm_dtheta, const int __restrict__ gm_gridlineIndices, const float __restrict__ gm_fractShiftsTable, const int *__restrict__ gm_gridlineIndicesTable, const gmx::IVec tablesOffsets, const gmx::IVec realGridSize, const gmx::RVec realGridSizeFP, const gmx::IVec realGridSizePadded, const gmx::RVec currentRecipBox0, const gmx::RVec currentRecipBox1, const gmx::RVec currentRecipBox2, const float scale)
	A SYCL kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z. More...

Macro Definition Documentation

#define INSTANTIATE	(	order,
		subGroupSize
	)

Value:

INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);

#define INSTANTIATE_2	(	order,
		numGrids,
		threadsPerAtom,
		subGroupSize
	)

Value:

INSTANTIATE_3(order, numGrids, true, threadsPerAtom, subGroupSize); \

INSTANTIATE_3(order, numGrids, false, threadsPerAtom, subGroupSize);

Nbnxm::numGrids

static int numGrids(const GridSet::DomainSetup &domainSetup)

Returns the number of search grids.

Definition: gridset.cpp:57

INSTANTIATE_3

#define INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, subGroupSize)

Kernel instantiations.

Definition: pme_gather_sycl.cpp:694

Function Documentation

void calculateAndStoreGridForces	(	sycl::local_ptr< Float3 >	sm_forces,
		const int	forceIndexLocal,
		const int	forceIndexGlobal,
		const Float3 &	recipBox0,
		const Float3 &	recipBox1,
		const Float3 &	recipBox2,
		const float	scale,
		const sycl::global_ptr< const float >	gm_coefficients
	)

inline

Calculate the grid forces and store them in shared memory.

Parameters

[in,out]	sm_forces	Shared memory array with the output forces.
[in]	forceIndexLocal	The local (per thread) index in the sm_forces array.
[in]	forceIndexGlobal	The index of the thread in the gm_coefficients array.
[in]	recipBox0	The reciprocal box (first vector).
[in]	recipBox1	The reciprocal box (second vector).
[in]	recipBox2	The reciprocal box (third vector).
[in]	scale	The scale to use when calculating the forces. For gm_coefficientsB (when using multiple coefficients on a single grid) the scale will be (1.0 - scale).
[in]	gm_coefficients	Global memory array of the coefficients to use for an unperturbed or FEP in state A if a single grid is used (`multiCoefficientsSingleGrid` == true).If two separate grids are used this should be the coefficients of the grid in question.

template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>

auto pmeGatherKernel	(	sycl::handler &	cgh,
		const int	nAtoms,
		const float *__restrict__	gm_gridA,
		const float *__restrict__	gm_gridB,
		const float *__restrict__	gm_coefficientsA,
		const float *__restrict__	gm_coefficientsB,
		const Float3 *__restrict__	gm_coordinates,
		Float3 *__restrict__	gm_forces,
		const float *__restrict__	gm_theta,
		const float *__restrict__	gm_dtheta,
		const int *__restrict__	gm_gridlineIndices,
		const float *__restrict__	gm_fractShiftsTable,
		const int *__restrict__	gm_gridlineIndicesTable,
		const gmx::IVec	tablesOffsets,
		const gmx::IVec	realGridSize,
		const gmx::RVec	realGridSizeFP,
		const gmx::IVec	realGridSizePadded,
		const gmx::RVec	currentRecipBox0,
		const gmx::RVec	currentRecipBox1,
		const gmx::RVec	currentRecipBox2,
		const float	scale
	)

A SYCL kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z.

Template Parameters

order	PME interpolation order.
wrapX	A boolean which tells if the grid overlap in dimension X should be wrapped.
wrapY	A boolean which tells if the grid overlap in dimension Y should be wrapped.
numGrids	The number of grids to use in the kernel. Can be 1 or 2.
readGlobal	Tells if we should read spline values from global memory.
threadsPerAtom	How many threads work on each atom.
subGroupSize	Size of the sub-group.

float readGridSize	(	const float *	realGridSizeFP,
		const int	dimIndex
	)

inline

Use loads from constant address space indexed by constant offsets rather than dynamic index-based accesses to the grid size data to avoid local memory operations and related large overhead.

Drastically reduces register spills on AMD via hipSYCL, and improves performance 10x.

Parameters

[in]	realGridSizeFP	Local grid size constant
[in]	dimIndex	Dimension index (XX, YY, ZZ)

Returns: The grid size of the specified dimension.

template<int order, int atomDataSize, int workGroupSize, int subGroupSize>

void reduceAtomForces	(	sycl::nd_item< 3 >	itemIdx,
		sycl::local_ptr< Float3 >	sm_forces,
		const int	atomIndexLocal,
		const int	splineIndex,
		const int gmx_unused	lineIndex,
		const float	realGridSizeFP[3],
		float &	fx,
		float &	fy,
		float &	fz
	)

inline

Reduce the partial force contributions.

Template Parameters

order	The PME order (must be 4).
atomDataSize	The number of partial force contributions for each atom (currently order^2 == 16).
workGroupSize	The size of a work-group.
subGroupSize	The size of a sub-group.

Parameters

[in]	itemIdx	SYCL thread ID.
[out]	sm_forces	Shared memory array with the output forces (number of elements is number of atoms per block).
[in]	atomIndexLocal	Local atom index.
[in]	splineIndex	Spline index.
[in]	lineIndex	Line index (same as threadLocalId)
[in]	realGridSizeFP	Local grid size constant
[in]	fx	Input force partial component X
[in]	fy	Input force partial component Y
[in]	fz	Input force partial component Z

template<int order, int atomsPerWarp, bool wrapX, bool wrapY>

void sumForceComponents	(	sycl::private_ptr< float >	fx,
		sycl::private_ptr< float >	fy,
		sycl::private_ptr< float >	fz,
		const int	ithyMin,
		const int	ithyMax,
		const int	ixBase,
		const int	iz,
		const int	nx,
		const int	ny,
		const int	pny,
		const int	pnz,
		const int	atomIndexLocal,
		const int	splineIndexBase,
		const sycl::float2	tdz,
		const sycl::local_ptr< int >	sm_gridlineIndices,
		const sycl::local_ptr< float >	sm_theta,
		const sycl::local_ptr< float >	sm_dtheta,
		const sycl::global_ptr< const float >	gm_grid
	)

inline

Calculate the sum of the force partial components (in X, Y and Z)

Template Parameters

order	The PME order (must be 4).
atomsPerWarp	The number of atoms per GPU warp.
wrapX	Tells if the grid is wrapped in the X dimension.
wrapY	Tells if the grid is wrapped in the Y dimension.

Parameters

[out]	fx	The force partial component in the X dimension.
[out]	fy	The force partial component in the Y dimension.
[out]	fz	The force partial component in the Z dimension.
[in]	ithyMin	The thread minimum index in the Y dimension.
[in]	ithyMax	The thread maximum index in the Y dimension.
[in]	ixBase	The grid line index base value in the X dimension.
[in]	iz	The grid line index in the Z dimension.
[in]	nx	The grid real size in the X dimension.
[in]	ny	The grid real size in the Y dimension.
[in]	pny	The padded grid real size in the Y dimension.
[in]	pnz	The padded grid real size in the Z dimension.
[in]	atomIndexLocal	The atom index for this thread.
[in]	splineIndexBase	The base value of the spline parameter index.
[in]	tdz	The theta and dtheta in the Z dimension.
[in]	sm_gridlineIndices	Shared memory array of grid line indices.
[in]	sm_theta	Shared memory array of atom theta values.
[in]	sm_dtheta	Shared memory array of atom dtheta values.
[in]	gm_grid	Global memory array of the grid to use.

Description

Macros

Functions

Macro Definition Documentation

Function Documentation