#include "gmxpre.h"
#include <cassert>
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/gpu_utils/gputraits_sycl.h"
#include "gromacs/gpu_utils/sycl_kernel_utils.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_types.h"
#include "pme_grid.h"

Include dependency graph for pme_gpu_calculate_splines_sycl.h:

This graph shows which files directly or indirectly include this file:

Description

Implements helper routines for PME gather and spline routines.

Author: Andrey Alekseenko al42a.nosp@m.nd@g.nosp@m.mail..nosp@m.com

Typedefs
using	mode = sycl::access_mode

Functions
template<typename T >
void	anonymous_namespace{pme_gpu_calculate_splines_sycl.h}::assertIsFinite (T arg)
	Asserts if the argument is finite. More...

template<>
void	anonymous_namespace{pme_gpu_calculate_splines_sycl.h}::assertIsFinite (Float3 gmx_unused arg)

template<typename T >
void	anonymous_namespace{pme_gpu_calculate_splines_sycl.h}::assertIsFinite (T gmx_unused arg)

template<int order, int atomsPerSubGroup>
static int	getSplineParamIndexBase (int subGroupIndex, int atomSubGroupIndex)
	Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block. Feed the result into getSplineParamIndex() to get a full index. TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it. Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme. Removing warp dependency would also be nice (and would probably coincide with removing c_pmeSpreadGatherAtomsPerWarp). More...

template<int order, int atomsPerSubGroup>
static int	getSplineParamIndex (int paramIndexBase, int dimIndex, int splineIndex)
	Gets a unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The index is wrt to the execution block, in range(0, atomsPerBlock * order * DIM). This function consumes result of getSplineParamIndexBase() and adjusts it for `dimIndex` and `splineIndex`. More...

static bool	pmeGpuCheckAtomCharge (const float charge)
	An inline function for skipping the zero-charge atoms when we have `c_skipNeutralAtoms` set to `true`. More...

template<typename T , int atomsPerWorkGroup, int dataCountPerAtom>
static void	pmeGpuStageAtomData (sycl::local_ptr< T > sm_destination, const sycl::global_ptr< const T > gm_source, sycl::nd_item< 3 > itemIdx)
	General purpose function for loading atom-related data from global to shared memory. More...

template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal, int numGrids, int subGroupSize>
static void	calculateSplines (const int atomIndexOffset, const Float3 atomX, const float atomCharge, const gmx::IVec tablesOffsets, const gmx::RVec realGridSizeFP, const gmx::RVec currentRecipBox0, const gmx::RVec currentRecipBox1, const gmx::RVec currentRecipBox2, sycl::global_ptr< float > gm_theta, sycl::global_ptr< float > gm_dtheta, sycl::global_ptr< int > gm_gridlineIndices, const sycl::global_ptr< const float > gm_fractShiftsTable, const sycl::global_ptr< const int > gm_gridlineIndicesTable, sycl::local_ptr< float > sm_theta, sycl::local_ptr< float > sm_dtheta, sycl::local_ptr< int > sm_gridlineIndices, sycl::local_ptr< float > sm_fractCoords, sycl::nd_item< 3 > itemIdx)
	PME GPU spline parameter and gridline indices calculation. This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines(). First stage of the whole kernel. More...

Function Documentation

template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal, int numGrids, int subGroupSize>

static void calculateSplines	(	const int	atomIndexOffset,
		const Float3	atomX,
		const float	atomCharge,
		const gmx::IVec	tablesOffsets,
		const gmx::RVec	realGridSizeFP,
		const gmx::RVec	currentRecipBox0,
		const gmx::RVec	currentRecipBox1,
		const gmx::RVec	currentRecipBox2,
		sycl::global_ptr< float >	gm_theta,
		sycl::global_ptr< float >	gm_dtheta,
		sycl::global_ptr< int >	gm_gridlineIndices,
		const sycl::global_ptr< const float >	gm_fractShiftsTable,
		const sycl::global_ptr< const int >	gm_gridlineIndicesTable,
		sycl::local_ptr< float >	sm_theta,
		sycl::local_ptr< float >	sm_dtheta,
		sycl::local_ptr< int >	sm_gridlineIndices,
		sycl::local_ptr< float >	sm_fractCoords,
		sycl::nd_item< 3 >	itemIdx
	)

inlinestatic

PME GPU spline parameter and gridline indices calculation. This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines(). First stage of the whole kernel.

Template Parameters

order	PME interpolation order.
atomsPerBlock	Number of atoms processed by a block - should be accounted for in the sizes of the shared memory arrays.
atomsPerWarp	Number of atoms processed by a warp
writeSmDtheta	Bool controlling if the theta derivative should be written to shared memory. Enables calculation of dtheta if set.
writeGlobal	A boolean which tells if the theta values and gridlines should be written to global memory. Enables calculation of dtheta if set.
numGrids	The number of grids using the splines.
subGroupSize	The size of a sub-group (warp).

Parameters

[in]	atomIndexOffset	Starting atom index for the execution block in the global memory.
[in]	atomX	Coordinates of atom processed by thread.
[in]	atomCharge	Charge/coefficient of atom processed by thread.
[in]	tablesOffsets	Offsets for X/Y/Z components of `gm_fractShiftsTable` and `gm_gridlineIndicesTable`.
[in]	realGridSizeFP	Real-space grid dimensions, converted to floating point.
[in]	currentRecipBox0	Current reciprocal (inverted unit cell) box, vector 1.
[in]	currentRecipBox1	Current reciprocal (inverted unit cell) box, vector 2.
[in]	currentRecipBox2	Current reciprocal (inverted unit cell) box, vector 3.
[out]	gm_theta	Atom spline values in the global memory. Used only if `writeGlobal` is `true`.
[out]	gm_dtheta	Derivatives of atom spline values in the global memory. Used only if `writeGlobal` is `true`.
[out]	gm_gridlineIndices	Atom gridline indices in the global memory. Used only if `writeGlobal` is `true`.
[in]	gm_fractShiftsTable	Fractional shifts lookup table in the global memory.
[in]	gm_gridlineIndicesTable	Gridline indices lookup table in the global memory.
[out]	sm_theta	Atom spline values in the local memory.
[out]	sm_dtheta	Derivatives of atom spline values in the local memory.
[out]	sm_gridlineIndices	Atom gridline indices in the local memory.
[out]	sm_fractCoords	Fractional coordinates in the local memory.
[in]	itemIdx	SYCL thread ID.

template<int order, int atomsPerSubGroup>

static int getSplineParamIndex	(	int	paramIndexBase,
		int	dimIndex,
		int	splineIndex
	)

inlinestatic

Gets a unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The index is wrt to the execution block, in range(0, atomsPerBlock * order * DIM). This function consumes result of getSplineParamIndexBase() and adjusts it for dimIndex and splineIndex.

Template Parameters

order	PME order
atomsPerSubGroup	Number of atoms processed by a sub group

Parameters

[in]	paramIndexBase	Must be result of getSplineParamIndexBase().
[in]	dimIndex	Dimension index (from 0 to 2)
[in]	splineIndex	Spline contribution index (from 0 to `order` - 1)

Returns: Index into theta or dtheta array using GPU layout.

template<int order, int atomsPerSubGroup>

static int getSplineParamIndexBase	(	int	subGroupIndex,
		int	atomSubGroupIndex
	)

inlinestatic

Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block. Feed the result into getSplineParamIndex() to get a full index. TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it. Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme. Removing warp dependency would also be nice (and would probably coincide with removing c_pmeSpreadGatherAtomsPerWarp).

Template Parameters

order	PME order
atomsPerSubGroup	Number of atoms processed by a sub group

Parameters

[in]	subGroupIndex	Sub group index in the work group.
[in]	atomSubGroupIndex	Atom index in the sub group (from 0 to atomsPerSubGroup - 1).

Returns: Index into theta or dtheta array using GPU layout.

static bool pmeGpuCheckAtomCharge ( const float charge )

inlinestatic

An inline function for skipping the zero-charge atoms when we have c_skipNeutralAtoms set to true.

Returns: true if atom should be processed, false otherwise.

Parameters

[in] charge The atom charge.

template<typename T , int atomsPerWorkGroup, int dataCountPerAtom>

static void pmeGpuStageAtomData	(	sycl::local_ptr< T >	sm_destination,
		const sycl::global_ptr< const T >	gm_source,
		sycl::nd_item< 3 >	itemIdx
	)

inlinestatic

General purpose function for loading atom-related data from global to shared memory.

Template Parameters

T	Data type (float/int/...).
atomsPerWorkGroup	Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
dataCountPerAtom	Number of data elements per single atom (e.g. `DIM` for an rvec coordinates array).

Parameters

[out]	sm_destination	Shared memory array for output.
[in]	gm_source	Global memory array for input.
[in]	itemIdx	SYCL thread ID.

Description

Typedefs

Functions

Function Documentation