#include "gmxpre.h"
#include "config.h"
#include <list>
#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/math/boxmatrix.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/stringutil.h"
#include "pme_gpu_internal.h"
#include "pme_gpu_settings.h"
#include "pme_gpu_timings.h"
#include "pme_gpu_types_host.h"
#include "pme_grid.h"
#include "pme_internal.h"
#include "pme_solve.h"

Include dependency graph for pme_gpu.cpp:

Description

Implements high-level PME GPU functions which do not require GPU framework-specific code.

Author: Aleksei Iupinov a.yup.nosp@m.inov.nosp@m.@gmai.nosp@m.l.co.nosp@m.m

Functions
static bool	pme_gpu_active (const gmx_pme_t *pme)
	Finds out if PME is currently running on GPU. More...

void	pme_gpu_reset_timings (const gmx_pme_t *pme)
	Resets the PME GPU timings. To be called at the reset step. More...

void	pme_gpu_get_timings (const gmx_pme_t pme, gmx_wallclock_gpu_pme_t timings)
	Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end. More...

int	pme_gpu_get_block_size (const gmx_pme_t *pme)
	Returns the block size requirement. More...

void	parallel_3dfft_execute_gpu_wrapper (gmx_pme_t pme, const int gridIndex, enum gmx_fft_direction dir, gmx_wallcycle wcycle)
	A convenience wrapper for launching either the GPU or CPU FFT. More...

void	pme_gpu_prepare_computation (gmx_pme_t pme, const matrix box, gmx_wallcycle wcycle, const gmx::StepWorkload &stepWork)
	Prepares PME on GPU computation (updating the box if needed) More...

void	pme_gpu_launch_spread (gmx_pme_t pme, GpuEventSynchronizer xReadyOnDevice, gmx_wallcycle wcycle, const real lambdaQ, const bool useGpuDirectComm, gmx::PmeCoordinateReceiverGpu pmeCoordinateReceiverGpu, const bool useMdGpuGraph)
	Launches first stage of PME on GPU - spreading kernel. More...

void	pme_gpu_launch_complex_transforms (gmx_pme_t pme, gmx_wallcycle wcycle, const gmx::StepWorkload &stepWork)
	Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode. More...

void	pme_gpu_launch_gather (const gmx_pme_t pme, gmx_wallcycle gmx_unused wcycle, const real lambdaQ, const bool computeVirial)

static void	sum_forces (gmx::ArrayRef< gmx::RVec > f, gmx::ArrayRef< const gmx::RVec > forceToAdd)
	Accumulate the `forcesToAdd` to `f`, using the available threads.

static void	pme_gpu_reduce_outputs (const bool computeEnergyAndVirial, const PmeOutput &output, gmx_wallcycle wcycle, gmx::ForceWithVirial forceWithVirial, gmx_enerdata_t *enerd)
	Reduce quantities from `output` to `forceWithVirial` and `enerd`.

bool	pme_gpu_try_finish_task (gmx_pme_t pme, const gmx::StepWorkload &stepWork, gmx_wallcycle wcycle, gmx::ForceWithVirial forceWithVirial, gmx_enerdata_t enerd, const real lambdaQ, GpuTaskCompletion completionKind)
	Attempts to complete PME GPU tasks. More...

PmeOutput	pme_gpu_wait_finish_task (gmx_pme_t pme, const bool computeEnergyAndVirial, const real lambdaQ, gmx_wallcycle wcycle)
	Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed). More...

void	pme_gpu_wait_and_reduce (gmx_pme_t pme, const gmx::StepWorkload &stepWork, gmx_wallcycle wcycle, gmx::ForceWithVirial forceWithVirial, gmx_enerdata_t enerd, const real lambdaQ)
	Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed). More...

void	pme_gpu_reinit_computation (const gmx_pme_t pme, const bool gpuGraphWithSeparatePmeRank, gmx_wallcycle wcycle)
	The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. More...

DeviceBuffer< gmx::RVec >	pme_gpu_get_device_f (const gmx_pme_t *pme)
	Get pointer to device copy of force data. More...

void	pme_gpu_set_device_x (const gmx_pme_t *pme, DeviceBuffer< gmx::RVec > d_x)
	Set pointer to device copy of coordinate data. More...

GpuEventSynchronizer *	pme_gpu_get_f_ready_synchronizer (const gmx_pme_t *pme)
	Get pointer to the device synchronizer object that allows syncing on PME force calculation completion. More...

void	pme_gpu_use_nvshmem (PmeGpu *pmeGpu, bool useNvshmem)
	Sets the nvshmem usage status, and allocates required structs if NVSHMEM should be used. More...

Function Documentation

void parallel_3dfft_execute_gpu_wrapper	(	gmx_pme_t *	pme,
		const int	gridIndex,
		enum gmx_fft_direction	dir,
		gmx_wallcycle *	wcycle
	)

inline

A convenience wrapper for launching either the GPU or CPU FFT.

Parameters

[in]	pme	The PME structure.
[in]	gridIndex	The grid index - should currently always be 0.
[in]	dir	The FFT direction enum.
[in]	wcycle	The wallclock counter.

static bool pme_gpu_active ( const gmx_pme_t * pme )

inlinestatic

Finds out if PME is currently running on GPU.

Todo:: The GPU module should not be constructed (or at least called) when it is not active, so there should be no need to check whether it is active. An assertion that this is true makes sense.

Parameters

[in] pme The PME structure.

Returns: True if PME runs on GPU currently, false otherwise.

int pme_gpu_get_block_size ( const gmx_pme_t * pme )

Returns the block size requirement.

The GPU version of PME requires that the coordinates array have a size divisible by the returned number.

Parameters

[in] pme The PME data structure.

DeviceBuffer<gmx::RVec> pme_gpu_get_device_f ( const gmx_pme_t * pme )

Get pointer to device copy of force data.

Parameters

[in] pme The PME data structure.

Returns: Pointer to force data

GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer ( const gmx_pme_t * pme )

Get pointer to the device synchronizer object that allows syncing on PME force calculation completion.

Parameters

[in] pme The PME data structure.

Returns: Pointer to synchronizer

void pme_gpu_get_timings	(	const gmx_pme_t *	pme,
		gmx_wallclock_gpu_pme_t *	timings
	)

Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end.

Parameters

[in]	pme	The PME structure.
[in]	timings	The gmx_wallclock_gpu_pme_t structure.

void pme_gpu_launch_complex_transforms	(	gmx_pme_t *	pme,
		gmx_wallcycle *	wcycle,
		const gmx::StepWorkload &	stepWork
	)

Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.

Parameters

[in]	pme	The PME data structure.
[in]	wcycle	The wallclock counter.
[in]	stepWork	The required work for this simulation step

void pme_gpu_launch_spread	(	gmx_pme_t *	pme,
		GpuEventSynchronizer *	xReadyOnDevice,
		gmx_wallcycle *	wcycle,
		real	lambdaQ,
		bool	useGpuDirectComm,
		gmx::PmeCoordinateReceiverGpu *	pmeCoordinateReceiverGpu,
		bool	useMdGpuGraph
	)

Launches first stage of PME on GPU - spreading kernel.

Parameters

[in]	pme	The PME data structure.
[in]	xReadyOnDevice	Event synchronizer indicating that the coordinates are ready in the device memory; nullptr allowed only on separate PME ranks.
[in]	wcycle	The wallclock counter.
[in]	lambdaQ	The Coulomb lambda of the current state of the system. Only used if FEP of Coulomb is active.
[in]	useGpuDirectComm	Whether direct GPU PME-PP communication is active
[in]	pmeCoordinateReceiverGpu	Coordinate receiver object, which must be valid when direct GPU PME-PP communication is active
[in]	useMdGpuGraph	Whether MD GPU Graph is in use.

void pme_gpu_prepare_computation	(	gmx_pme_t *	pme,
		const matrix	box,
		gmx_wallcycle *	wcycle,
		const gmx::StepWorkload &	stepWork
	)

Prepares PME on GPU computation (updating the box if needed)

Parameters

[in]	pme	The PME data structure.
[in]	box	The unit cell box.
[in]	wcycle	The wallclock counter.
[in]	stepWork	The required work for this simulation step

void pme_gpu_reinit_computation	(	const gmx_pme_t *	pme,
		bool	gpuGraphWithSeparatePmeRank,
		gmx_wallcycle *	wcycle
	)

The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.

Clears the internal grid and energy/virial buffers; it is not safe to start the PME computation without calling this. Note that unlike in the nbnxn module, the force buffer does not need clearing.

Todo:: Rename this function to clear – it clearly only does output resetting and we should be clear about what the function does..

Parameters

[in]	pme	The PME data structure.
[in]	gpuGraphWithSeparatePmeRank	Whether MD GPU Graph with separate PME rank is in use.
[in]	wcycle	The wallclock counter.

void pme_gpu_reset_timings ( const gmx_pme_t * pme )

Resets the PME GPU timings. To be called at the reset step.

Parameters

[in] pme The PME structure.

void pme_gpu_set_device_x	(	const gmx_pme_t *	pme,
		DeviceBuffer< gmx::RVec >	d_x
	)

Set pointer to device copy of coordinate data.

Parameters

[in]	pme	The PME data structure.
[in]	d_x	The pointer to the positions buffer to be set

bool pme_gpu_try_finish_task	(	gmx_pme_t *	pme,
		const gmx::StepWorkload &	stepWork,
		gmx_wallcycle *	wcycle,
		gmx::ForceWithVirial *	forceWithVirial,
		gmx_enerdata_t *	enerd,
		real	lambdaQ,
		GpuTaskCompletion	completionKind
	)

Attempts to complete PME GPU tasks.

The completionKind argument controls whether the function blocks until all PME GPU tasks enqueued completed (as pme_gpu_wait_finish_task() does) or only checks and returns immediately if they did not. When blocking or the tasks have completed it also gets the output forces by assigning the ArrayRef to the forces pointer passed in. Virial/energy are also outputs if they were to be computed.

Parameters

[in]	pme	The PME data structure.
[in]	stepWork	The required work for this simulation step
[in]	wcycle	The wallclock counter.
[out]	forceWithVirial	The output force and virial
[out]	enerd	The output energies
[in]	lambdaQ	The Coulomb lambda to use when calculating the results.
[in]	completionKind	Indicates whether PME task completion should only be checked rather than waited for

Returns: True if the PME GPU tasks have completed

void pme_gpu_use_nvshmem	(	PmeGpu *	pmeGpu,
		bool	useNvshmem
	)

Sets the nvshmem usage status, and allocates required structs if NVSHMEM should be used.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	useNvshmem	should use NVSHMEM.

void pme_gpu_wait_and_reduce	(	gmx_pme_t *	pme,
		const gmx::StepWorkload &	stepWork,
		gmx_wallcycle *	wcycle,
		gmx::ForceWithVirial *	forceWithVirial,
		gmx_enerdata_t *	enerd,
		real	lambdaQ
	)

Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed).

Parameters

[in]	pme	The PME data structure.
[in]	stepWork	The required work for this simulation step
[in]	wcycle	The wallclock counter.
[out]	forceWithVirial	The output force and virial
[out]	enerd	The output energies
[in]	lambdaQ	The Coulomb lambda to use when calculating the results.

PmeOutput pme_gpu_wait_finish_task	(	gmx_pme_t *	pme,
		bool	computeEnergyAndVirial,
		real	lambdaQ,
		gmx_wallcycle *	wcycle
	)

Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed).

Parameters

[in]	pme	The PME data structure.
[in]	computeEnergyAndVirial	Tells if the energy and virial computation should be performed.
[in]	lambdaQ	The Coulomb lambda to use when calculating the results.
[out]	wcycle	The wallclock counter.

Returns: The output forces, energy and virial

Description

Functions

Function Documentation