#include "gmxpre.h"
#include "pme_spread_sycl.h"
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/gpu_utils/gputraits_sycl.h"
#include "gromacs/gpu_utils/sycl_kernel_utils.h"
#include "gromacs/gpu_utils/syclutils.h"
#include "pme_gpu_calculate_splines_sycl.h"
#include "pme_gpu_types_host.h"
#include "pme_grid.h"

Include dependency graph for pme_spread_sycl.cpp:

Description

Implements PME GPU spline calculation and charge spreading in SYCL.

Author: Andrey Alekseenko al42a.nosp@m.nd@g.nosp@m.mail..nosp@m.com

Macros
#define	INSTANTIATE_3(order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) template class PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, true, true, numGrids, writeGlobal, threadsPerAtom, subGroupSize>;
	Kernel instantiations.

#define	INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)

#define	INSTANTIATE(order, subGroupSize)

Functions
template<int order, bool wrapX, bool wrapY, ThreadsPerAtom threadsPerAtom, int subGroupSize>
void	spread_charges (const float atomCharge, const int realGridSize[DIM], const int realGridSizePadded[DIM], sycl::global_ptr< float > gm_grid, const sycl::local_ptr< int > sm_gridlineIndices, const sycl::local_ptr< float > sm_theta, const sycl::nd_item< 3 > &itemIdx)
	Charge spreading onto the grid. This corresponds to the CPU function spread_coefficients_bsplines_thread(). Optional second stage of the spline_and_spread_kernel. More...

template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
auto	pmeSplineAndSpreadKernel (sycl::handler &cgh, const int nAtoms, float __restrict__ gm_realGrid_0, float __restrict__ gm_realGrid_1, float __restrict__ gm_theta, float __restrict__ gm_dtheta, int __restrict__ gm_gridlineIndices, const float __restrict__ gm_fractShiftsTable, const int __restrict__ gm_gridlineIndicesTable, const float __restrict__ gm_coefficients_0, const float __restrict__ gm_coefficients_1, const Float3 __restrict__ gm_coordinates, const gmx::IVec tablesOffsets, const gmx::IVec realGridSize, const gmx::RVec realGridSizeFP, const gmx::IVec realGridSizePadded, const gmx::RVec currentRecipBox0, const gmx::RVec currentRecipBox1, const gmx::RVec currentRecipBox2, const PmeGpuPipeliningParams pipeliningParams)
	A spline computation and charge spreading kernel function. More...

Macro Definition Documentation

#define INSTANTIATE	(	order,
		subGroupSize
	)

Value:

INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);

#define INSTANTIATE_2	(	order,
		numGrids,
		threadsPerAtom,
		subGroupSize
	)

Value:

INSTANTIATE_3(order, true, true, numGrids, true, threadsPerAtom, subGroupSize);  \
    INSTANTIATE_3(order, true, false, numGrids, true, threadsPerAtom, subGroupSize); \
    INSTANTIATE_3(order, false, true, numGrids, true, threadsPerAtom, subGroupSize); \
    INSTANTIATE_3(order, true, true, numGrids, false, threadsPerAtom, subGroupSize);

Function Documentation

template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>

auto pmeSplineAndSpreadKernel	(	sycl::handler &	cgh,
		const int	nAtoms,
		float *__restrict__	gm_realGrid_0,
		float *__restrict__	gm_realGrid_1,
		float *__restrict__	gm_theta,
		float *__restrict__	gm_dtheta,
		int *__restrict__	gm_gridlineIndices,
		const float *__restrict__	gm_fractShiftsTable,
		const int *__restrict__	gm_gridlineIndicesTable,
		const float *__restrict__	gm_coefficients_0,
		const float *__restrict__	gm_coefficients_1,
		const Float3 *__restrict__	gm_coordinates,
		const gmx::IVec	tablesOffsets,
		const gmx::IVec	realGridSize,
		const gmx::RVec	realGridSizeFP,
		const gmx::IVec	realGridSizePadded,
		const gmx::RVec	currentRecipBox0,
		const gmx::RVec	currentRecipBox1,
		const gmx::RVec	currentRecipBox2,
		const PmeGpuPipeliningParams	pipeliningParams
	)

A spline computation and charge spreading kernel function.

Two tuning parameters can be used for additional performance. For small systems and for debugging writeGlobal should be used removing the need to recalculate the theta values in the gather kernel. Similarly for large systems, with useOrderThreads, using order threads per atom gives higher performance than order*order threads.

Template Parameters

order	PME interpolation order.
computeSplines	A boolean which tells if the spline parameter and gridline indices' computation should be performed.
spreadCharges	A boolean which tells if the charge spreading should be performed.
wrapX	A boolean which tells if the grid overlap in dimension X should be wrapped.
wrapY	A boolean which tells if the grid overlap in dimension Y should be wrapped.
numGrids	The number of grids to use in the kernel. Can be 1 or 2.
writeGlobal	A boolean which tells if the theta values and gridlines should be written to global memory.
threadsPerAtom	How many threads work on each atom.
subGroupSize	Size of the sub-group.

template<int order, bool wrapX, bool wrapY, ThreadsPerAtom threadsPerAtom, int subGroupSize>

void spread_charges	(	const float	atomCharge,
		const int	realGridSize[DIM],
		const int	realGridSizePadded[DIM],
		sycl::global_ptr< float >	gm_grid,
		const sycl::local_ptr< int >	sm_gridlineIndices,
		const sycl::local_ptr< float >	sm_theta,
		const sycl::nd_item< 3 > &	itemIdx
	)

inline

Charge spreading onto the grid. This corresponds to the CPU function spread_coefficients_bsplines_thread(). Optional second stage of the spline_and_spread_kernel.

Template Parameters

order	PME interpolation order.
wrapX	Whether the grid overlap in dimension X should be wrapped.
wrapY	Whether the grid overlap in dimension Y should be wrapped.
threadsPerAtom	How many threads work on each atom.
subGroupSize	Size of the sub-group.

Parameters

[in]	atomCharge	Atom charge/coefficient of atom processed by thread.
[in]	realGridSize	Size of the real grid.
[in]	realGridSizePadded	Padded of the real grid.
[in,out]	gm_grid	Device pointer to the real grid to which charges are added.
[in]	sm_gridlineIndices	Atom gridline indices in the local memory.
[in]	sm_theta	Atom spline values in the local memory.
[in]	itemIdx	Current thread ID.

Description

Macros

Functions

Macro Definition Documentation

Function Documentation