#include <string>
#include <vector>
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/gpu_utils/gpu_macros.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/real.h"

Include dependency graph for pme.h:

This graph shows which files directly or indirectly include this file:

Description

This file contains function declarations necessary for computing energies and forces for the PME long-ranged part (Coulomb and LJ).

Author: Berk Hess hess@.nosp@m.kth..nosp@m.se

Classes
class	gmx::ArrayRef< typename >
	STL-like interface to a C array of T (or part of a std container of T). More...

class	gmx::SeparatePmeRanksPermitted
	Class for managing usage of separate PME-only ranks. More...

Enumerations
enum	{ GMX_SUM_GRID_FORWARD, GMX_SUM_GRID_BACKWARD }

enum	PmeRunMode { PmeRunMode::None, PmeRunMode::CPU, PmeRunMode::GPU, PmeRunMode::Mixed }
	Possible PME codepaths on a rank. More...

Functions
int	minimalPmeGridSize (int pmeOrder)
	Return the smallest allowed PME grid size for `pmeOrder`.

int	numGridLinesForExtendedHaloRegion (int pmeOrder, real haloExtentForAtomDisplacement, real gridSpacing)
	Calculates the number of grid lines used as halo region for PME decomposition. More...

real	getGridSpacingFromBox (const matrix box, const ivec gridDim)
	Gets grid spacing from simulation box and grid dimension. More...

bool	gmx_pme_grid_matches (const gmx_pme_t &pme, const ivec grid_size)
	Return whether the grid of `pme` is identical to `grid_size`.

bool	gmx_pme_check_restrictions (int pme_order, int nkx, int nky, int nkz, int numPmeDomainsAlongX, int numPmeDomainsAlongY, int extendedHaloRegion, bool useGpuPme, bool useThreads, bool errorsAreFatal)
	Check restrictions on pme_order and the PME grid nkx,nky,nkz. More...

gmx_pme_t *	gmx_pme_init (const t_commrec cr, const NumPmeDomains &numPmeDomains, const t_inputrec ir, const matrix box, real haloExtentForAtomDisplacement, gmx_bool bFreeEnergy_q, gmx_bool bFreeEnergy_lj, gmx_bool bReproducible, real ewaldcoeff_q, real ewaldcoeff_lj, int nthread, PmeRunMode runMode, PmeGpu pmeGpu, const DeviceContext deviceContext, const DeviceStream deviceStream, const PmeGpuProgram pmeGpuProgram, const gmx::MDLogger &mdlog)
	Construct PME data. More...

void	gmx_pme_reinit (gmx_pme_t *pmedata, const t_commrec cr, gmx_pme_t pme_src, const t_inputrec ir, const ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
	As gmx_pme_init, but takes most settings, except the grid/Ewald coefficients, from pme_src. This is only called when the PME cut-off/grid size changes.

void	gmx_pme_destroy (gmx_pme_t *pme)
	Destroys the PME data structure (including shared data).

void	gmx_pme_destroy (gmx_pme_t *pme, bool destroySharedData)
	Destroys the PME data structure. More...

int	gmx_pme_do (struct gmx_pme_t pme, gmx::ArrayRef< const gmx::RVec > coordinates, gmx::ArrayRef< gmx::RVec > forces, gmx::ArrayRef< const real > chargeA, gmx::ArrayRef< const real > chargeB, gmx::ArrayRef< const real > c6A, gmx::ArrayRef< const real > c6B, gmx::ArrayRef< const real > sigmaA, gmx::ArrayRef< const real > sigmaB, const matrix box, const t_commrec cr, int maxshift_x, int maxshift_y, t_nrnb nrnb, gmx_wallcycle wcycle, matrix vir_q, matrix vir_lj, real energy_q, real energy_lj, real lambda_q, real lambda_lj, real dvdlambda_q, real dvdlambda_lj, const gmx::StepWorkload &stepWork)
	Do a PME calculation on a CPU for the long range electrostatics and/or LJ. More...

real	gmx_pme_calc_energy (gmx_pme_t *pme, gmx::ArrayRef< const gmx::RVec > x, gmx::ArrayRef< const real > q)
	Calculate the PME grid energy V for n charges. More...

void	gmx_pme_reinit_atoms (gmx_pme_t *pme, int numAtoms, gmx::ArrayRef< const real > chargesA, gmx::ArrayRef< const real > chargesB)
	This function updates the local atom data on GPU after DD (charges, coordinates, etc.). TODO: it should update the PME CPU atom data as well. (currently PME CPU call gmx_pme_do() gets passed the input pointers for each computation). More...

bool	pme_gpu_supports_build (std::string *error)
	Checks whether the GROMACS build allows to run PME on GPU. TODO: this partly duplicates an internal PME assert function pme_gpu_check_restrictions(), except that works with a formed gmx_pme_t structure. Should that one go away/work with inputrec? More...

bool	pme_gpu_supports_input (const t_inputrec &ir, std::string *error)
	Checks whether the input system allows to run PME on GPU. TODO: this partly duplicates an internal PME assert function pme_gpu_check_restrictions(), except that works with a formed gmx_pme_t structure. Should that one go away/work with inputrec? More...

bool	pme_gpu_mixed_mode_supports_input (const t_inputrec &ir, std::string *error)
	Checks whether the input system allows to run PME on GPU in Mixed mode. Assumes that the input system is compatible with GPU PME otherwise, that is, before calling this function one should check that pme_gpu_supports_input returns `true`. More...

PmeRunMode	pme_run_mode (const gmx_pme_t *pme)
	Returns the active PME codepath (CPU, GPU, mixed). More...

gmx::PinningPolicy	pme_get_pinning_policy ()
	Return the pinning policy appropriate for this build configuration for relevant buffers used for PME task on this rank (e.g. running on a GPU).

bool	pme_gpu_task_enabled (const gmx_pme_t *pme)
	Tells if PME is enabled to run on GPU (not necessarily active at the moment). More...

void	pme_gpu_use_nvshmem (PmeGpu *pmeGpu, bool useNvshmem)
	Sets the nvshmem usage status, and allocates required structs if NVSHMEM should be used. More...

int	pme_gpu_get_block_size (const gmx_pme_t *pme)
	Returns the block size requirement. More...

void	pme_gpu_reset_timings (const gmx_pme_t *pme)
	Resets the PME GPU timings. To be called at the reset step. More...

void	pme_gpu_get_timings (const gmx_pme_t pme, gmx_wallclock_gpu_pme_t timings)
	Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end. More...

void	pme_gpu_prepare_computation (gmx_pme_t pme, const matrix box, gmx_wallcycle wcycle, const gmx::StepWorkload &stepWork)
	Prepares PME on GPU computation (updating the box if needed) More...

void	pme_gpu_launch_spread (gmx_pme_t pme, GpuEventSynchronizer xReadyOnDevice, gmx_wallcycle wcycle, real lambdaQ, bool useGpuDirectComm, gmx::PmeCoordinateReceiverGpu pmeCoordinateReceiverGpu, bool useMdGpuGraph)
	Launches first stage of PME on GPU - spreading kernel. More...

void	pme_gpu_launch_complex_transforms (gmx_pme_t pme, gmx_wallcycle wcycle, const gmx::StepWorkload &stepWork)
	Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode. More...

void	pme_gpu_launch_gather (const gmx_pme_t pme, gmx_wallcycle wcycle, real lambdaQ, bool computeVirial)
	Launches last stage of PME on GPU - force gathering and D2H force transfer. More...

bool	pme_gpu_try_finish_task (gmx_pme_t pme, const gmx::StepWorkload &stepWork, gmx_wallcycle wcycle, gmx::ForceWithVirial forceWithVirial, gmx_enerdata_t enerd, real lambdaQ, GpuTaskCompletion completionKind)
	Attempts to complete PME GPU tasks. More...

void	pme_gpu_wait_and_reduce (gmx_pme_t pme, const gmx::StepWorkload &stepWork, gmx_wallcycle wcycle, gmx::ForceWithVirial forceWithVirial, gmx_enerdata_t enerd, real lambdaQ)
	Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed). More...

void	pme_gpu_reinit_computation (const gmx_pme_t pme, bool gpuGraphWithSeparatePmeRank, gmx_wallcycle wcycle)
	The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. More...

void	pme_gpu_set_device_x (const gmx_pme_t *pme, DeviceBuffer< gmx::RVec > d_x)
	Set pointer to device copy of coordinate data. More...

DeviceBuffer< gmx::RVec >	pme_gpu_get_device_f (const gmx_pme_t *pme)
	Get pointer to device copy of force data. More...

GpuEventSynchronizer *	pme_gpu_get_f_ready_synchronizer (const gmx_pme_t *pme)
	Get pointer to the device synchronizer object that allows syncing on PME force calculation completion. More...

Enumeration Type Documentation

enum PmeRunMode

strong

Possible PME codepaths on a rank.

Todo:: : make this enum class with gmx_pme_t C++ refactoring

Enumerator
None	No PME task is done.
CPU	Whole PME computation is done on CPU.
GPU	Whole PME computation is done on GPU.
Mixed	Mixed mode: only spread and gather run on GPU; FFT and solving are done on CPU.

Function Documentation

real getGridSpacingFromBox	(	const matrix	box,
		const ivec	gridDim
	)

Gets grid spacing from simulation box and grid dimension.

Parameters

[in]	box	Simulation box
[in]	gridDim	Fourier grid dimension

Returns: Grid spacing value

real gmx_pme_calc_energy	(	gmx_pme_t *	pme,
		gmx::ArrayRef< const gmx::RVec >	x,
		gmx::ArrayRef< const real >	q
	)

Calculate the PME grid energy V for n charges.

The potential (found in pme) must have been found already with a call to gmx_pme_do(). Note that the charges are not spread on the grid in the pme struct. Currently does not work in parallel or with free energy.

bool gmx_pme_check_restrictions	(	int	pme_order,
		int	nkx,
		int	nky,
		int	nkz,
		int	numPmeDomainsAlongX,
		int	numPmeDomainsAlongY,
		int	extendedHaloRegion,
		bool	useGpuPme,
		bool	useThreads,
		bool	errorsAreFatal
	)

Check restrictions on pme_order and the PME grid nkx,nky,nkz.

With errorsAreFatal=true, an exception or fatal error is generated on violation of restrictions. With errorsAreFatal=false, false is returned on violation of restrictions. When all restrictions are obeyed, true is returned. Argument useThreads tells if any MPI rank doing PME uses more than 1 threads. If at calling useThreads is unknown, pass true for conservative checking.

The PME GPU restrictions are checked separately during pme_gpu_init().

void gmx_pme_destroy	(	gmx_pme_t *	pme,
		bool	destroySharedData
	)

Destroys the PME data structure.

Parameters

pme	The data structure to destroy.
destroySharedData	Set to `false` if `pme` is a copy created by gmx_pme_reinit, and only it should be destroyed, while shared data should be preserved. If `true`, the shared data will be destroyed, like in `gmx_pme_destroy(gmx_pme_t* pme)`.

int gmx_pme_do	(	struct gmx_pme_t *	pme,
		gmx::ArrayRef< const gmx::RVec >	coordinates,
		gmx::ArrayRef< gmx::RVec >	forces,
		gmx::ArrayRef< const real >	chargeA,
		gmx::ArrayRef< const real >	chargeB,
		gmx::ArrayRef< const real >	c6A,
		gmx::ArrayRef< const real >	c6B,
		gmx::ArrayRef< const real >	sigmaA,
		gmx::ArrayRef< const real >	sigmaB,
		const matrix	box,
		const t_commrec *	cr,
		int	maxshift_x,
		int	maxshift_y,
		t_nrnb *	nrnb,
		gmx_wallcycle *	wcycle,
		matrix	vir_q,
		matrix	vir_lj,
		real *	energy_q,
		real *	energy_lj,
		real	lambda_q,
		real	lambda_lj,
		real *	dvdlambda_q,
		real *	dvdlambda_lj,
		const gmx::StepWorkload &	stepWork
	)

Do a PME calculation on a CPU for the long range electrostatics and/or LJ.

Computes the PME forces and the energy and viral, when requested, for all atoms in coordinates. Forces, when requested, are added to the buffer forces, which is allowed to contain more elements than the number of elements in coordinates. The meaning of flags is defined above, and determines which parts of the calculation are performed.

Returns: 0 indicates all well, non zero is an error code.

gmx_pme_t* gmx_pme_init	(	const t_commrec *	cr,
		const NumPmeDomains &	numPmeDomains,
		const t_inputrec *	ir,
		const matrix	box,
		real	haloExtentForAtomDisplacement,
		gmx_bool	bFreeEnergy_q,
		gmx_bool	bFreeEnergy_lj,
		gmx_bool	bReproducible,
		real	ewaldcoeff_q,
		real	ewaldcoeff_lj,
		int	nthread,
		PmeRunMode	runMode,
		PmeGpu *	pmeGpu,
		const DeviceContext *	deviceContext,
		const DeviceStream *	deviceStream,
		const PmeGpuProgram *	pmeGpuProgram,
		const gmx::MDLogger &	mdlog
	)

Construct PME data.

Exceptions

gmx::InconsistentInputError if input grid sizes/PME order are inconsistent.

Returns: Pointer to newly allocated and initialized PME data.

Todo:: We should evolve something like a GpuManager that holds DeviceInformation* and PmeGpuProgram* and perhaps other related things whose lifetime can/should exceed that of a task (or perhaps task manager). See Issue #2522.

void gmx_pme_reinit_atoms	(	gmx_pme_t *	pme,
		int	numAtoms,
		gmx::ArrayRef< const real >	chargesA,
		gmx::ArrayRef< const real >	chargesB
	)

This function updates the local atom data on GPU after DD (charges, coordinates, etc.). TODO: it should update the PME CPU atom data as well. (currently PME CPU call gmx_pme_do() gets passed the input pointers for each computation).

Parameters

[in,out]	pme	The PME structure.
[in]	numAtoms	The number of particles.
[in]	chargesA	The pointer to the array of particle charges in the normal state or FEP state A. Can be nullptr if PME is not performed on the GPU.
[in]	chargesB	The pointer to the array of particle charges in state B. Only used if charges are perturbed and can otherwise be nullptr.

int numGridLinesForExtendedHaloRegion	(	int	pmeOrder,
		real	haloExtentForAtomDisplacement,
		real	gridSpacing
	)

Calculates the number of grid lines used as halo region for PME decomposition.

This function is only used for PME GPU decomposition case

Parameters

[in]	pmeOrder	PME interpolation order
[in]	haloExtentForAtomDisplacement	extent of halo region in nm to account for atom displacement
[in]	gridSpacing	spacing between grid lines

Returns: extended halo region used in GPU PME decomposition

int pme_gpu_get_block_size ( const gmx_pme_t * pme )

Returns the block size requirement.

The GPU version of PME requires that the coordinates array have a size divisible by the returned number.

Parameters

[in] pme The PME data structure.

DeviceBuffer<gmx::RVec> pme_gpu_get_device_f ( const gmx_pme_t * pme )

Get pointer to device copy of force data.

Parameters

[in] pme The PME data structure.

Returns: Pointer to force data

GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer ( const gmx_pme_t * pme )

Get pointer to the device synchronizer object that allows syncing on PME force calculation completion.

Parameters

[in] pme The PME data structure.

Returns: Pointer to synchronizer

void pme_gpu_get_timings	(	const gmx_pme_t *	pme,
		gmx_wallclock_gpu_pme_t *	timings
	)

Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end.

Parameters

[in]	pme	The PME structure.
[in]	timings	The gmx_wallclock_gpu_pme_t structure.

void pme_gpu_launch_complex_transforms	(	gmx_pme_t *	pme,
		gmx_wallcycle *	wcycle,
		const gmx::StepWorkload &	stepWork
	)

Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.

Parameters

[in]	pme	The PME data structure.
[in]	wcycle	The wallclock counter.
[in]	stepWork	The required work for this simulation step

void pme_gpu_launch_gather	(	const gmx_pme_t *	pme,
		gmx_wallcycle *	wcycle,
		real	lambdaQ,
		bool	computeVirial
	)

Launches last stage of PME on GPU - force gathering and D2H force transfer.

Parameters

[in]	pme	The PME data structure.
[in]	wcycle	The wallclock counter.
[in]	lambdaQ	The Coulomb lambda to use when calculating the results.
[in]	computeVirial	Whether this is a virial step.

void pme_gpu_launch_spread	(	gmx_pme_t *	pme,
		GpuEventSynchronizer *	xReadyOnDevice,
		gmx_wallcycle *	wcycle,
		real	lambdaQ,
		bool	useGpuDirectComm,
		gmx::PmeCoordinateReceiverGpu *	pmeCoordinateReceiverGpu,
		bool	useMdGpuGraph
	)

Launches first stage of PME on GPU - spreading kernel.

Parameters

[in]	pme	The PME data structure.
[in]	xReadyOnDevice	Event synchronizer indicating that the coordinates are ready in the device memory; nullptr allowed only on separate PME ranks.
[in]	wcycle	The wallclock counter.
[in]	lambdaQ	The Coulomb lambda of the current state of the system. Only used if FEP of Coulomb is active.
[in]	useGpuDirectComm	Whether direct GPU PME-PP communication is active
[in]	pmeCoordinateReceiverGpu	Coordinate receiver object, which must be valid when direct GPU PME-PP communication is active
[in]	useMdGpuGraph	Whether MD GPU Graph is in use.

bool pme_gpu_mixed_mode_supports_input	(	const t_inputrec &	ir,
		std::string *	error
	)

Checks whether the input system allows to run PME on GPU in Mixed mode. Assumes that the input system is compatible with GPU PME otherwise, that is, before calling this function one should check that pme_gpu_supports_input returns true.

Parameters

[in]	ir	Input system.
[out]	error	If non-null, the error message if the input is not supported.

Returns: true if PME can run on GPU in Mixed mode with this input, false otherwise.

void pme_gpu_prepare_computation	(	gmx_pme_t *	pme,
		const matrix	box,
		gmx_wallcycle *	wcycle,
		const gmx::StepWorkload &	stepWork
	)

Prepares PME on GPU computation (updating the box if needed)

Parameters

[in]	pme	The PME data structure.
[in]	box	The unit cell box.
[in]	wcycle	The wallclock counter.
[in]	stepWork	The required work for this simulation step

void pme_gpu_reinit_computation	(	const gmx_pme_t *	pme,
		bool	gpuGraphWithSeparatePmeRank,
		gmx_wallcycle *	wcycle
	)

The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.

Clears the internal grid and energy/virial buffers; it is not safe to start the PME computation without calling this. Note that unlike in the nbnxn module, the force buffer does not need clearing.

Todo:: Rename this function to clear – it clearly only does output resetting and we should be clear about what the function does..

Parameters

[in]	pme	The PME data structure.
[in]	gpuGraphWithSeparatePmeRank	Whether MD GPU Graph with separate PME rank is in use.
[in]	wcycle	The wallclock counter.

void pme_gpu_reset_timings ( const gmx_pme_t * pme )

Resets the PME GPU timings. To be called at the reset step.

Parameters

[in] pme The PME structure.

void pme_gpu_set_device_x	(	const gmx_pme_t *	pme,
		DeviceBuffer< gmx::RVec >	d_x
	)

Set pointer to device copy of coordinate data.

Parameters

[in]	pme	The PME data structure.
[in]	d_x	The pointer to the positions buffer to be set

bool pme_gpu_supports_build ( std::string * error )

Checks whether the GROMACS build allows to run PME on GPU. TODO: this partly duplicates an internal PME assert function pme_gpu_check_restrictions(), except that works with a formed gmx_pme_t structure. Should that one go away/work with inputrec?

Parameters

[out] error If non-null, the error message when PME is not supported on GPU.

Returns: true if PME can run on GPU on this build, false otherwise.

bool pme_gpu_supports_input	(	const t_inputrec &	ir,
		std::string *	error
	)

Checks whether the input system allows to run PME on GPU. TODO: this partly duplicates an internal PME assert function pme_gpu_check_restrictions(), except that works with a formed gmx_pme_t structure. Should that one go away/work with inputrec?

Parameters

[in]	ir	Input system.
[out]	error	If non-null, the error message if the input is not supported on GPU.

Returns: true if PME can run on GPU with this input, false otherwise.

bool pme_gpu_task_enabled ( const gmx_pme_t * pme )

inline

Tells if PME is enabled to run on GPU (not necessarily active at the moment).

Todo:: This is a rather static data that should be managed by the hardware assignment manager. For now, it is synonymous with the active PME codepath (in the absence of dynamic switching).

Parameters

[in] pme The PME data structure.

Returns: true if PME can run on GPU, false otherwise.

bool pme_gpu_try_finish_task	(	gmx_pme_t *	pme,
		const gmx::StepWorkload &	stepWork,
		gmx_wallcycle *	wcycle,
		gmx::ForceWithVirial *	forceWithVirial,
		gmx_enerdata_t *	enerd,
		real	lambdaQ,
		GpuTaskCompletion	completionKind
	)

Attempts to complete PME GPU tasks.

The completionKind argument controls whether the function blocks until all PME GPU tasks enqueued completed (as pme_gpu_wait_finish_task() does) or only checks and returns immediately if they did not. When blocking or the tasks have completed it also gets the output forces by assigning the ArrayRef to the forces pointer passed in. Virial/energy are also outputs if they were to be computed.

Parameters

[in]	pme	The PME data structure.
[in]	stepWork	The required work for this simulation step
[in]	wcycle	The wallclock counter.
[out]	forceWithVirial	The output force and virial
[out]	enerd	The output energies
[in]	lambdaQ	The Coulomb lambda to use when calculating the results.
[in]	completionKind	Indicates whether PME task completion should only be checked rather than waited for

Returns: True if the PME GPU tasks have completed

void pme_gpu_use_nvshmem	(	PmeGpu *	pmeGpu,
		bool	useNvshmem
	)

Sets the nvshmem usage status, and allocates required structs if NVSHMEM should be used.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	useNvshmem	should use NVSHMEM.

void pme_gpu_wait_and_reduce	(	gmx_pme_t *	pme,
		const gmx::StepWorkload &	stepWork,
		gmx_wallcycle *	wcycle,
		gmx::ForceWithVirial *	forceWithVirial,
		gmx_enerdata_t *	enerd,
		real	lambdaQ
	)

Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed).

Parameters

[in]	pme	The PME data structure.
[in]	stepWork	The required work for this simulation step
[in]	wcycle	The wallclock counter.
[out]	forceWithVirial	The output force and virial
[out]	enerd	The output energies
[in]	lambdaQ	The Coulomb lambda to use when calculating the results.

PmeRunMode pme_run_mode ( const gmx_pme_t * pme )

Returns the active PME codepath (CPU, GPU, mixed).

Todo:: This is a rather static data that should be managed by the higher level task scheduler.

Parameters

[in] pme The PME data structure.

Returns: active PME codepath.

Description

Classes

Enumerations

Functions

Enumeration Type Documentation

Function Documentation