#include "gmxpre.h"
#include "pme_gpu_internal.h"
#include "config.h"
#include <list>
#include <memory>
#include <string>
#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/fft/gpu_3dfft.h"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/math/boxmatrix.h"
#include "gromacs/math/units.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
#include "pme_gpu_calculate_splines.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_grid.h"
#include "pme_gpu_program_impl.h"
#include "pme_gpu_timings.h"
#include "pme_gpu_types.h"
#include "pme_gpu_types_host.h"
#include "pme_gpu_types_host_impl.h"
#include "pme_grid.h"
#include "pme_internal.h"
#include "pme_solve.h"

Include dependency graph for pme_gpu_internal.cpp:

Description

This file contains internal function implementations for performing the PME calculations on GPU.

Note that this file is compiled as regular C++ source in OpenCL builds, but it is treated as CUDA source in CUDA-enabled GPU builds.

Author: Aleksei Iupinov a.yup.nosp@m.inov.nosp@m.@gmai.nosp@m.l.co.nosp@m.m

Functions
static PmeGpuKernelParamsBase *	pme_gpu_get_kernel_params_base_ptr (const PmeGpu *pmeGpu)
	Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure. More...

int	pme_gpu_get_atom_data_block_size ()
	Returns the size of the block size requirement. More...

int	pme_gpu_get_atoms_per_warp (const PmeGpu *pmeGpu)
	Return the number of atoms per warp.

void	pme_gpu_synchronize (const PmeGpu *pmeGpu)
	Synchronizes the current computation, waiting for the GPU kernels/transfers to finish. More...

void	pme_gpu_alloc_energy_virial (PmeGpu *pmeGpu)
	Allocates the fixed size energy and virial buffer both on GPU and CPU. More...

void	pme_gpu_free_energy_virial (PmeGpu *pmeGpu)
	Frees the energy and virial memory both on GPU and CPU. More...

void	pme_gpu_clear_energy_virial (const PmeGpu *pmeGpu, const bool gpuGraphWithSeparatePmeRank)
	Clears the energy and virial memory on GPU with 0. Should be called at the end of PME computation which returned energy/virial. More...

void	pme_gpu_realloc_and_copy_bspline_values (PmeGpu *pmeGpu, const int gridIndex)
	Reallocates and copies the pre-computed B-spline values to the GPU. More...

void	pme_gpu_free_bspline_values (const PmeGpu *pmeGpu)
	Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers). More...

void	pme_gpu_realloc_forces (PmeGpu *pmeGpu)
	Reallocates the GPU buffer for the PME forces. More...

void	pme_gpu_free_forces (const PmeGpu *pmeGpu)
	Frees the GPU buffer for the PME forces. More...

void	pme_gpu_copy_input_forces (PmeGpu *pmeGpu)
	Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces). To be called e.g. after the bonded calculations. More...

void	pme_gpu_copy_output_forces (PmeGpu *pmeGpu)
	Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage. More...

void	pme_gpu_realloc_and_copy_input_coefficients (const PmeGpu pmeGpu, const float h_coefficients, const int gridIndex)
	Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer. Clears the padded part if needed. More...

void	pme_gpu_free_coefficients (const PmeGpu *pmeGpu)
	Frees the charges/coefficients on the GPU. More...

void	pme_gpu_realloc_spline_data (PmeGpu *pmeGpu)
	Reallocates the buffers on the GPU and the host for the atoms spline data. More...

void	pme_gpu_free_spline_data (const PmeGpu *pmeGpu)
	Frees the buffers on the GPU for the atoms spline data. More...

void	pme_gpu_realloc_grid_indices (PmeGpu *pmeGpu)
	Reallocates the buffers on the GPU and the host for the particle gridline indices. More...

void	pme_gpu_free_grid_indices (const PmeGpu *pmeGpu)
	Frees the buffer on the GPU for the particle gridline indices. More...

void	pme_gpu_realloc_grids (PmeGpu *pmeGpu)
	Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU. More...

void	pme_gpu_free_grids (const PmeGpu *pmeGpu)
	Frees the real space grid and the complex reciprocal grid (if needed) on the GPU. More...

void	pme_gpu_reinit_haloexchange (PmeGpu *pmeGpu)
	Reinitialize PME halo exchange parameters and staging device buffers for MPI communication. More...

void	pme_gpu_free_haloexchange (const PmeGpu *pmeGpu)
	Frees device staging buffers used for PME halo exchange. More...

void	pme_gpu_clear_grids (const PmeGpu *pmeGpu)
	Clears the real space grid on the GPU. Should be called at the end of each computation. More...

void	pme_gpu_realloc_and_copy_fract_shifts (PmeGpu *pmeGpu)
	Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU. More...

void	pme_gpu_free_fract_shifts (const PmeGpu *pmeGpu)
	Frees the pre-computed fractional coordinates' shifts on the GPU. More...

bool	pme_gpu_stream_query (const PmeGpu *pmeGpu)
	Checks whether work in the PME GPU stream has completed. More...

void	pme_gpu_copy_input_gather_grid (const PmeGpu pmeGpu, const float h_grid, const int gridIndex)
	Copies the input real-space grid from the host to the GPU. More...

void	pme_gpu_copy_output_spread_grid (const PmeGpu pmeGpu, float h_grid, const int gridIndex)
	Copies the output real-space grid from the GPU to the host. More...

void	pme_gpu_copy_output_spread_atom_data (const PmeGpu *pmeGpu)
	Copies the spread output spline data and gridline indices from the GPU to the host. More...

void	pme_gpu_copy_input_gather_atom_data (const PmeGpu *pmeGpu)
	Copies the gather input spline data and gridline indices from the host to the GPU. More...

void	pme_gpu_sync_spread_grid (const PmeGpu *pmeGpu)
	Waits for the grid copying to the host-side buffer after spreading to finish. More...

static void	pme_gpu_init_internal (PmeGpu *pmeGpu, const DeviceContext &deviceContext, const DeviceStream &deviceStream)
	Internal GPU initialization for PME. More...

static gmx::FftBackend	getFftBackend (const PmeGpu *pmeGpu)
	Wrapper for getting FFT backend depending on Gromacs GPU backend compilation flags. More...

void	pme_gpu_reinit_3dfft (const PmeGpu *pmeGpu)
	Initializes the CUDA FFT structures. More...

void	pme_gpu_destroy_3dfft (const PmeGpu *pmeGpu)
	Destroys the CUDA FFT structures. More...

void	pme_gpu_getEnergyAndVirial (const gmx_pme_t &pme, const float lambda, PmeOutput *output)
	Returns the energy and virial GPU outputs, useful for testing. More...

static void	pme_gpu_getForceOutput (PmeGpu pmeGpu, PmeOutput output)
	Sets the force-related members in `output`. More...

PmeOutput	pme_gpu_getOutput (const gmx_pme_t &pme, const bool computeEnergyAndVirial, const real lambdaQ)
	Returns the GPU outputs (forces, energy and virial) More...

void	pme_gpu_update_input_box (PmeGpu gmx_unused *pmeGpu, const matrix gmx_unused box)

static void	pme_gpu_reinit_grids (PmeGpu *pmeGpu)
	(Re-)initializes all the PME GPU data related to the grid size and cut-off. More...

static void	pme_gpu_copy_common_data_from (const gmx_pme_t *pme)
	Copies everything useful from the PME CPU to the PME GPU structure. The goal is to minimize interaction with the PME CPU structure in the GPU code. More...

static void	pme_gpu_select_best_performing_pme_spreadgather_kernels (PmeGpu *pmeGpu)
	uses heuristics to select the best performing PME gather and scatter kernels More...

static void	pme_gpu_init (gmx_pme_t pme, const DeviceContext &deviceContext, const DeviceStream &deviceStream, const PmeGpuProgram pmeGpuProgram)
	Initializes the PME GPU data at the beginning of the run. TODO: this should become PmeGpu::PmeGpu() More...

void	pme_gpu_get_real_grid_sizes (const PmeGpu pmeGpu, gmx::IVec gridSize, gmx::IVec *paddedGridSize)
	Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests. More...

void	pme_gpu_reinit (gmx_pme_t pme, const DeviceContext deviceContext, const DeviceStream deviceStream, const PmeGpuProgram pmeGpuProgram, const bool useMdGpuGraph)
	(Re-)initializes the PME GPU data at the beginning of the run or on DLB. More...

void	pme_gpu_destroy (PmeGpu *pmeGpu)
	Destroys the PME GPU data at the end of the run. More...

void	pme_gpu_reinit_atoms (PmeGpu pmeGpu, const int nAtoms, const real chargesA, const real *chargesB)
	Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU. More...

static CommandEvent *	pme_gpu_fetch_timing_event (const PmeGpu *pmeGpu, PmeStage pmeStageId)
	Returns raw timing event from the corresponding GpuRegionTimer (if timings are enabled). In CUDA result can be nullptr stub, per GpuRegionTimer implementation. More...

void	pme_gpu_3dfft (const PmeGpu *pmeGpu, gmx_fft_direction dir, const int grid_index)
	3D FFT R2C/C2R routine. More...

std::pair< int, int >	pmeGpuCreateGrid (const PmeGpu *pmeGpu, int blockCount)
	Given possibly large `blockCount`, returns a compact 1D or 2D grid for kernel scheduling, to minimize number of unused blocks.

static auto	selectSplineAndSpreadKernelPtr (const PmeGpu *pmeGpu, ThreadsPerAtom threadsPerAtom, bool writeSplinesToGlobal, const int numGrids)
	Returns a pointer to appropriate spline and spread kernel based on the input bool values. More...

static auto	selectSplineKernelPtr (const PmeGpu *pmeGpu, ThreadsPerAtom threadsPerAtom, bool gmx_unused writeSplinesToGlobal, const int numGrids)
	Returns a pointer to appropriate spline kernel based on the input bool values. More...

static auto	selectSpreadKernelPtr (const PmeGpu *pmeGpu, ThreadsPerAtom threadsPerAtom, bool writeSplinesToGlobal, const int numGrids)
	Returns a pointer to appropriate spread kernel based on the input bool values. More...

static int	manageSyncWithPpCoordinateSenderGpu (const PmeGpu pmeGpu, gmx::PmeCoordinateReceiverGpu pmeCoordinateReceiverGpu, bool usePipeline=false, int pipelineStage=0)
	Manages synchronization with remote GPU's PP coordinate sender, for a stage of the communication operation. For thread-MPI, an event associated with a stage of the operation is enqueued to the GPU stream that will be used by the consumer. For lib-MPI, the CPU task executing this function will wait for a stage to be completed. In each case, the rank of the sender associated with the corresponding stage is returned. More...

void	pme_gpu_spread (const PmeGpu pmeGpu, GpuEventSynchronizer xReadyOnDevice, real *h_grids, gmx_parallel_3dfft_t fftSetup, bool computeSplines, bool spreadCharges, const real lambda, const bool useGpuDirectComm, gmx::PmeCoordinateReceiverGpu pmeCoordinateReceiverGpu, const bool useMdGpuGraph, gmx_wallcycle wcycle)
	A GPU spline computation and charge spreading function. More...

void	pme_gpu_solve (const PmeGpu pmeGpu, const int gridIndex, t_complex h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial)
	A GPU Fourier space solving function. More...

auto	selectGatherKernelPtr (const PmeGpu *pmeGpu, ThreadsPerAtom threadsPerAtom, bool readSplinesFromGlobal, const int numGrids)
	Returns a pointer to appropriate gather kernel based on the inputvalues. More...

void	pme_gpu_gather (PmeGpu pmeGpu, real h_grids, gmx_parallel_3dfft_t fftSetup, const float lambda, gmx_wallcycle *wcycle, bool computeVirial)
	A GPU force gathering function. More...

DeviceBuffer< gmx::RVec >	pme_gpu_get_kernelparam_forces (const PmeGpu *pmeGpu)
	Return pointer to device copy of force data. More...

void	pme_gpu_set_kernelparam_useNvshmem (const PmeGpu *pmeGpu, bool useNvshmem)

void	pme_gpu_set_kernelparam_coordinates (const PmeGpu *pmeGpu, DeviceBuffer< gmx::RVec > d_x)
	Sets the device pointer to coordinate data. More...

GpuEventSynchronizer *	pme_gpu_get_forces_ready_synchronizer (const PmeGpu *pmeGpu)
	Return pointer to the sync object triggered after the PME force calculation completion. More...

Variables
constexpr int	c_pmeAtomDataBlockSize = 64
	Atom data block size (in terms of number of atoms). This is the least common multiple of number of atoms processed by a single block/workgroup of the spread and gather kernels. The GPU atom data buffers must be padded, which means that the numbers of atoms used for determining the size of the memory allocation must be divisible by this.

Function Documentation

static gmx::FftBackend getFftBackend ( const PmeGpu * pmeGpu )

static

Wrapper for getting FFT backend depending on Gromacs GPU backend compilation flags.

Parameters

[in] pmeGpu The PME GPU structure.

Returns: FftBackend enum value.

static int manageSyncWithPpCoordinateSenderGpu	(	const PmeGpu *	pmeGpu,
		gmx::PmeCoordinateReceiverGpu *	pmeCoordinateReceiverGpu,
		bool	usePipeline = `false`,
		int	pipelineStage = `0`
	)

static

Manages synchronization with remote GPU's PP coordinate sender, for a stage of the communication operation. For thread-MPI, an event associated with a stage of the operation is enqueued to the GPU stream that will be used by the consumer. For lib-MPI, the CPU task executing this function will wait for a stage to be completed. In each case, the rank of the sender associated with the corresponding stage is returned.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	pmeCoordinateReceiverGpu	The PME coordinate reciever GPU object
[in]	usePipeline	Whether pipelining is in use for PME-PP communication
[in]	pipelineStage	Stage of the PME-PP communication pipeline

Returns: Rank of remote sender associated with this stage

void pme_gpu_3dfft	(	const PmeGpu *	pmeGpu,
		enum gmx_fft_direction	direction,
		int	gridIndex = `0`
	)

3D FFT R2C/C2R routine.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	direction	Transform direction (real-to-complex or complex-to-real)
[in]	gridIndex	The index of the grid to use. 0 is Coulomb in the normal state or FEP state A and 1 is Coulomb in FEP state B.

void pme_gpu_alloc_energy_virial ( PmeGpu * pmeGpu )

Allocates the fixed size energy and virial buffer both on GPU and CPU.

Parameters

[in,out] pmeGpu The PME GPU structure.

void pme_gpu_clear_energy_virial	(	const PmeGpu *	pmeGpu,
		bool	gpuGraphWithSeparatePmeRank
	)

Clears the energy and virial memory on GPU with 0. Should be called at the end of PME computation which returned energy/virial.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	gpuGraphWithSeparatePmeRank	Whether MD GPU Graph with separate PME rank is in use.

void pme_gpu_clear_grids ( const PmeGpu * pmeGpu )

Clears the real space grid on the GPU. Should be called at the end of each computation.

Parameters

[in] pmeGpu The PME GPU structure.

static void pme_gpu_copy_common_data_from ( const gmx_pme_t * pme )

static

Copies everything useful from the PME CPU to the PME GPU structure. The goal is to minimize interaction with the PME CPU structure in the GPU code.

Parameters

[in] pme The PME structure.

void pme_gpu_copy_input_forces ( PmeGpu * pmeGpu )

Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces). To be called e.g. after the bonded calculations.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_copy_input_gather_atom_data ( const PmeGpu * pmeGpu )

Copies the gather input spline data and gridline indices from the host to the GPU.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_copy_input_gather_grid	(	const PmeGpu *	pmeGpu,
		const float *	h_grid,
		int	gridIndex = `0`
	)

Copies the input real-space grid from the host to the GPU.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	h_grid	The host-side grid buffer.
[in]	gridIndex	The index of the grid to use. 0 is Coulomb in the normal state or FEP state A and 1 is Coulomb in FEP state B.

void pme_gpu_copy_output_forces ( PmeGpu * pmeGpu )

Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_copy_output_spread_atom_data ( const PmeGpu * pmeGpu )

Copies the spread output spline data and gridline indices from the GPU to the host.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_copy_output_spread_grid	(	const PmeGpu *	pmeGpu,
		float *	h_grid,
		int	gridIndex = `0`
	)

Copies the output real-space grid from the GPU to the host.

Parameters

[in]	pmeGpu	The PME GPU structure.
[out]	h_grid	The host-side grid buffer.
[in]	gridIndex	The index of the grid to use. 0 is Coulomb in the normal state or FEP state A and 1 is Coulomb in FEP state B.

void pme_gpu_destroy ( PmeGpu * pmeGpu )

Destroys the PME GPU data at the end of the run.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_destroy_3dfft ( const PmeGpu * pmeGpu )

Destroys the CUDA FFT structures.

Parameters

[in] pmeGpu The PME GPU structure.

static CommandEvent* pme_gpu_fetch_timing_event	(	const PmeGpu *	pmeGpu,
		PmeStage	pmeStageId
	)

static

Returns raw timing event from the corresponding GpuRegionTimer (if timings are enabled). In CUDA result can be nullptr stub, per GpuRegionTimer implementation.

Parameters

[in]	pmeGpu	The PME GPU data structure.
[in]	pmeStageId	The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h

void pme_gpu_free_bspline_values ( const PmeGpu * pmeGpu )

Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_free_coefficients ( const PmeGpu * pmeGpu )

Frees the charges/coefficients on the GPU.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_free_energy_virial ( PmeGpu * pmeGpu )

Frees the energy and virial memory both on GPU and CPU.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_free_forces ( const PmeGpu * pmeGpu )

Frees the GPU buffer for the PME forces.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_free_fract_shifts ( const PmeGpu * pmeGpu )

Frees the pre-computed fractional coordinates' shifts on the GPU.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_free_grid_indices ( const PmeGpu * pmeGpu )

Frees the buffer on the GPU for the particle gridline indices.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_free_grids ( const PmeGpu * pmeGpu )

Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_free_haloexchange ( const PmeGpu * pmeGpu )

Frees device staging buffers used for PME halo exchange.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_free_spline_data ( const PmeGpu * pmeGpu )

Frees the buffers on the GPU for the atoms spline data.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_gather	(	PmeGpu *	pmeGpu,
		float **	h_grids,
		gmx_parallel_3dfft_t *	fftSetup,
		float	lambda,
		gmx_wallcycle *	wcycle,
		bool	computeVirial
	)

A GPU force gathering function.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	h_grids	The host-side grid buffer (used only in testing mode).
[in]	fftSetup	Host-side FFT setup structure used in Mixed mode
[in]	lambda	The lambda value to use.
[in]	wcycle	The wallclock counter.
[in]	computeVirial	Whether this is a virial step.

int pme_gpu_get_atom_data_block_size ( )

Returns the size of the block size requirement.

The GPU version of PME requires that the coordinates array have a size divisible by the returned number.

Returns: Number of atoms in a single GPU atom data chunk, which determines a minimum divisor of the size of the memory allocated.

GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer ( const PmeGpu * pmeGpu )

Return pointer to the sync object triggered after the PME force calculation completion.

Parameters

[in] pmeGpu The PME GPU structure.

Returns: Pointer to sync object

static PmeGpuKernelParamsBase* pme_gpu_get_kernel_params_base_ptr ( const PmeGpu * pmeGpu )

static

Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.

Parameters

[in] pmeGpu The PME GPU structure.

Returns: The pointer to the kernel parameters.

DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces ( const PmeGpu * pmeGpu )

Return pointer to device copy of force data.

Parameters

[in] pmeGpu The PME GPU structure.

Returns: Pointer to force data

void pme_gpu_get_real_grid_sizes	(	const PmeGpu *	pmeGpu,
		gmx::IVec *	gridSize,
		gmx::IVec *	paddedGridSize
	)

Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.

Parameters

[in]	pmeGpu	The PME GPU structure.
[out]	gridSize	Pointer to the grid dimensions to fill in.
[out]	paddedGridSize	Pointer to the padded grid dimensions to fill in.

void pme_gpu_getEnergyAndVirial	(	const gmx_pme_t &	pme,
		float	lambda,
		PmeOutput *	output
	)

Returns the energy and virial GPU outputs, useful for testing.

It is the caller's responsibility to be aware of whether the GPU handled the solve stage.

Parameters

[in]	pme	The PME structure.
[in]	lambda	The lambda value to use when calculating the results.
[out]	output	Pointer to output where energy and virial should be stored.

static void pme_gpu_getForceOutput	(	PmeGpu *	pmeGpu,
		PmeOutput *	output
	)

static

Sets the force-related members in output.

Parameters

[in]	pmeGpu	PME GPU data structure
[out]	output	Pointer to PME output data structure

PmeOutput pme_gpu_getOutput	(	const gmx_pme_t &	pme,
		bool	computeEnergyAndVirial,
		real	lambdaQ
	)

Returns the GPU outputs (forces, energy and virial)

Parameters

[in]	pme	The PME structure.
[in]	computeEnergyAndVirial	Whether the energy and virial are being computed
[in]	lambdaQ	The Coulomb lambda to use when finalizing the output.

Returns: The output object.

static void pme_gpu_init	(	gmx_pme_t *	pme,
		const DeviceContext &	deviceContext,
		const DeviceStream &	deviceStream,
		const PmeGpuProgram *	pmeGpuProgram
	)

static

Initializes the PME GPU data at the beginning of the run. TODO: this should become PmeGpu::PmeGpu()

Parameters

[in,out]	pme	The PME structure.
[in]	deviceContext	The GPU context.
[in]	deviceStream	The GPU stream.
[in,out]	pmeGpuProgram	The handle to the program/kernel data created outside (e.g. in unit tests/runner)

static void pme_gpu_init_internal	(	PmeGpu *	pmeGpu,
		const DeviceContext &	deviceContext,
		const DeviceStream &	deviceStream
	)

static

Internal GPU initialization for PME.

Parameters

[in]	pmeGpu	GPU PME data.
[in]	deviceContext	GPU context.
[in]	deviceStream	GPU stream.

void pme_gpu_realloc_and_copy_bspline_values	(	PmeGpu *	pmeGpu,
		int	gridIndex = `0`
	)

Reallocates and copies the pre-computed B-spline values to the GPU.

Parameters

[in,out]	pmeGpu	The PME GPU structure.
[in]	gridIndex	The index of the grid to use. 0 is Coulomb in the normal state or FEP state A and 1 is Coulomb in FEP state B.

void pme_gpu_realloc_and_copy_fract_shifts ( PmeGpu * pmeGpu )

Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_realloc_and_copy_input_coefficients	(	const PmeGpu *	pmeGpu,
		const float *	h_coefficients,
		int	gridIndex = `0`
	)

Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer. Clears the padded part if needed.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	h_coefficients	The input atom charges/coefficients.
[in]	gridIndex	The index of the grid to use. 0 is Coulomb in the normal state or FEP state A and 1 is Coulomb in FEP state B.

Does not need to be done for every PME computation, only whenever the local charges change. (So, in the beginning of the run, or on DD step).

void pme_gpu_realloc_forces ( PmeGpu * pmeGpu )

Reallocates the GPU buffer for the PME forces.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_realloc_grid_indices ( PmeGpu * pmeGpu )

Reallocates the buffers on the GPU and the host for the particle gridline indices.

Parameters

[in,out] pmeGpu The PME GPU structure.

void pme_gpu_realloc_grids ( PmeGpu * pmeGpu )

Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_realloc_spline_data ( PmeGpu * pmeGpu )

Reallocates the buffers on the GPU and the host for the atoms spline data.

Parameters

[in,out] pmeGpu The PME GPU structure.

void pme_gpu_reinit	(	gmx_pme_t *	pme,
		const DeviceContext *	deviceContext,
		const DeviceStream *	deviceStream,
		const PmeGpuProgram *	pmeGpuProgram,
		bool	useMdGpuGraph
	)

(Re-)initializes the PME GPU data at the beginning of the run or on DLB.

Parameters

[in,out]	pme	The PME structure.
[in]	deviceContext	The GPU context.
[in]	deviceStream	The GPU stream.
[in,out]	pmeGpuProgram	The handle to the program/kernel data created outside (e.g. in unit tests/runner)
[in]	useMdGpuGraph	Whether MD GPU Graph is in use

Exceptions

gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.

void pme_gpu_reinit_3dfft ( const PmeGpu * pmeGpu )

Initializes the CUDA FFT structures.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_reinit_atoms	(	PmeGpu *	pmeGpu,
		int	nAtoms,
		const real *	chargesA,
		const real *	chargesB = `nullptr`
	)

Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	nAtoms	The number of particles.
[in]	chargesA	The pointer to the host-side array of particle charges in the unperturbed state or FEP state A.
[in]	chargesB	The pointer to the host-side array of particle charges in FEP state B.

This is a function that should only be called in the beginning of the run and on domain decomposition. Should be called before the pme_gpu_set_io_ranges.

static void pme_gpu_reinit_grids ( PmeGpu * pmeGpu )

static

(Re-)initializes all the PME GPU data related to the grid size and cut-off.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_reinit_haloexchange ( PmeGpu * pmeGpu )

Reinitialize PME halo exchange parameters and staging device buffers for MPI communication.

Parameters

[in] pmeGpu The PME GPU structure.

static void pme_gpu_select_best_performing_pme_spreadgather_kernels ( PmeGpu * pmeGpu )

static

uses heuristics to select the best performing PME gather and scatter kernels

Parameters

[in,out] pmeGpu The PME GPU structure.

void pme_gpu_set_kernelparam_coordinates	(	const PmeGpu *	pmeGpu,
		DeviceBuffer< gmx::RVec >	d_x
	)

Sets the device pointer to coordinate data.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	d_x	Pointer to coordinate data

void pme_gpu_solve	(	const PmeGpu *	pmeGpu,
		int	gridIndex,
		t_complex *	h_grid,
		GridOrdering	gridOrdering,
		bool	computeEnergyAndVirial
	)

A GPU Fourier space solving function.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	gridIndex	The index of the grid to use. 0 is Coulomb in the normal state or FEP state A and 1 is Coulomb in FEP state B.
[in,out]	h_grid	The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
[in]	gridOrdering	Specifies the dimenion ordering of the complex grid. TODO: store this information?
[in]	computeEnergyAndVirial	Tells if the energy and virial computation should be performed.

void pme_gpu_spread	(	const PmeGpu *	pmeGpu,
		GpuEventSynchronizer *	xReadyOnDevice,
		float **	h_grids,
		gmx_parallel_3dfft_t *	fftSetup,
		bool	computeSplines,
		bool	spreadCharges,
		real	lambda,
		bool	useGpuDirectComm,
		gmx::PmeCoordinateReceiverGpu *	pmeCoordinateReceiverGpu,
		bool	useMdGpuGraph,
		gmx_wallcycle *	wcycle
	)

A GPU spline computation and charge spreading function.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	xReadyOnDevice	Event synchronizer indicating that the coordinates are ready in the device memory; can be nullptr when invoked on a separate PME rank or from PME tests.
[out]	h_grids	The host-side grid buffers (used only if the result of the spread is expected on the host, e.g. testing or host-side FFT)
[in]	fftSetup	Host-side FFT setup structure used in Mixed mode
[in]	computeSplines	Should the computation of spline parameters and gridline indices be performed.
[in]	spreadCharges	Should the charges/coefficients be spread on the grid.
[in]	lambda	The lambda value of the current system state.
[in]	useGpuDirectComm	Whether direct GPU PME-PP communication is active
[in]	pmeCoordinateReceiverGpu	Coordinate receiver object, which must be valid when direct GPU PME-PP communication is active
[in]	useMdGpuGraph	Whether MD GPU Graph is in use.
[in]	wcycle	The wallclock counter.

bool pme_gpu_stream_query ( const PmeGpu * pmeGpu )

Checks whether work in the PME GPU stream has completed.

Parameters

[in] pmeGpu The PME GPU structure.

Returns: True if work in the PME stream has completed.

void pme_gpu_sync_spread_grid ( const PmeGpu * pmeGpu )

Waits for the grid copying to the host-side buffer after spreading to finish.

Parameters

[in] pmeGpu The PME GPU structure.

void pme_gpu_synchronize ( const PmeGpu * pmeGpu )

Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.

Parameters

[in] pmeGpu The PME GPU structure.

auto selectGatherKernelPtr	(	const PmeGpu *	pmeGpu,
		ThreadsPerAtom	threadsPerAtom,
		bool	readSplinesFromGlobal,
		const int	numGrids
	)

inline

Returns a pointer to appropriate gather kernel based on the inputvalues.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	threadsPerAtom	Controls whether we should use order or order*order threads per atom
[in]	readSplinesFromGlobal	bool controlling if we should write spline data to global memory
[in]	numGrids	Number of grids to use. numGrids == 2 if Coulomb is perturbed.

Returns: Pointer to CUDA kernel

static auto selectSplineAndSpreadKernelPtr	(	const PmeGpu *	pmeGpu,
		ThreadsPerAtom	threadsPerAtom,
		bool	writeSplinesToGlobal,
		const int	numGrids
	)

static

Returns a pointer to appropriate spline and spread kernel based on the input bool values.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	threadsPerAtom	Controls whether we should use order or order*order threads per atom
[in]	writeSplinesToGlobal	bool controlling if we should write spline data to global memory
[in]	numGrids	Number of grids to use. numGrids == 2 if Coulomb is perturbed.

Returns: Pointer to CUDA kernel

static auto selectSplineKernelPtr	(	const PmeGpu *	pmeGpu,
		ThreadsPerAtom	threadsPerAtom,
		bool gmx_unused	writeSplinesToGlobal,
		const int	numGrids
	)

static

Returns a pointer to appropriate spline kernel based on the input bool values.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	threadsPerAtom	Controls whether we should use order or order*order threads per atom
[in]	writeSplinesToGlobal	bool controlling if we should write spline data to global memory
[in]	numGrids	Number of grids to use. numGrids == 2 if Coulomb is perturbed.

Returns: Pointer to CUDA kernel

static auto selectSpreadKernelPtr	(	const PmeGpu *	pmeGpu,
		ThreadsPerAtom	threadsPerAtom,
		bool	writeSplinesToGlobal,
		const int	numGrids
	)

static

Returns a pointer to appropriate spread kernel based on the input bool values.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	threadsPerAtom	Controls whether we should use order or order*order threads per atom
[in]	writeSplinesToGlobal	bool controlling if we should write spline data to global memory
[in]	numGrids	Number of grids to use. numGrids == 2 if Coulomb is perturbed.

Returns: Pointer to CUDA kernel

Description

Functions

Variables

Function Documentation