Gromacs
2018.2
|
#include "gromacs/fft/fft.h"
#include "gromacs/gpu_utils/gpu_macros.h"
#include "gromacs/utility/arrayref.h"
#include "pme-gpu-types.h"
This file contains internal function definitions for performing the PME calculations on GPU. These are not meant to be exposed outside of the PME GPU code. As of now, their bodies are still in the common pme-gpu.cpp files.
Enumerations | |
enum | PmeSplineDataType { Values, Derivatives } |
Type of spline data. | |
enum | GridOrdering { YZX, XYZ } |
PME grid dimension ordering (from major to minor) | |
enum | PmeLayoutTransform { GpuToHost, HostToGpu } |
A binary enum for spline data layout transformation. | |
Functions | |
int | pme_gpu_get_atom_data_alignment (const PmeGpu *pmeGpu) |
Returns the number of atoms per chunk in the atom charges/coordinates data layout. Depends on CUDA-specific block sizes, needed for the atom data padding. More... | |
int | pme_gpu_get_atoms_per_warp (const PmeGpu *pmeGpu) |
Returns the number of atoms per chunk in the atom spline theta/dtheta data layout. More... | |
void | pme_gpu_synchronize (const PmeGpu *pmeGpu) |
Synchronizes the current computation, waiting for the GPU kernels/transfers to finish. More... | |
void | pme_gpu_alloc_energy_virial (const PmeGpu *pmeGpu) |
Allocates the fixed size energy and virial buffer both on GPU and CPU. More... | |
void | pme_gpu_free_energy_virial (PmeGpu *pmeGpu) |
Frees the energy and virial memory both on GPU and CPU. More... | |
void | pme_gpu_clear_energy_virial (const PmeGpu *pmeGpu) |
Clears the energy and virial memory on GPU with 0. Should be called at the end of PME computation which returned energy/virial. More... | |
void | pme_gpu_realloc_and_copy_bspline_values (const PmeGpu *pmeGpu) |
Reallocates and copies the pre-computed B-spline values to the GPU. More... | |
void | pme_gpu_free_bspline_values (const PmeGpu *pmeGpu) |
Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers). More... | |
void | pme_gpu_realloc_forces (PmeGpu *pmeGpu) |
Reallocates the GPU buffer for the PME forces. More... | |
void | pme_gpu_free_forces (const PmeGpu *pmeGpu) |
Frees the GPU buffer for the PME forces. More... | |
void | pme_gpu_copy_input_forces (PmeGpu *pmeGpu) |
Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces). To be called e.g. after the bonded calculations. More... | |
void | pme_gpu_copy_output_forces (PmeGpu *pmeGpu) |
Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage. More... | |
bool | pme_gpu_stream_query (const PmeGpu *pmeGpu) |
Checks whether work in the PME GPU stream has completed. More... | |
void | pme_gpu_realloc_coordinates (const PmeGpu *pmeGpu) |
Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed). More... | |
void | pme_gpu_copy_input_coordinates (const PmeGpu *pmeGpu, const rvec *h_coordinates) |
Copies the input coordinates from the CPU buffer onto the GPU. More... | |
void | pme_gpu_free_coordinates (const PmeGpu *pmeGpu) |
Frees the coordinates on the GPU. More... | |
void | pme_gpu_realloc_and_copy_input_coefficients (const PmeGpu *pmeGpu, const float *h_coefficients) |
Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer. Clears the padded part if needed. More... | |
void | pme_gpu_free_coefficients (const PmeGpu *pmeGpu) |
Frees the charges/coefficients on the GPU. More... | |
void | pme_gpu_realloc_spline_data (const PmeGpu *pmeGpu) |
Reallocates the buffers on the GPU and the host for the atoms spline data. More... | |
void | pme_gpu_free_spline_data (const PmeGpu *pmeGpu) |
Frees the buffers on the GPU for the atoms spline data. More... | |
void | pme_gpu_realloc_grid_indices (const PmeGpu *pmeGpu) |
Reallocates the buffers on the GPU and the host for the particle gridline indices. More... | |
void | pme_gpu_free_grid_indices (const PmeGpu *pmeGpu) |
Frees the buffer on the GPU for the particle gridline indices. More... | |
void | pme_gpu_realloc_grids (PmeGpu *pmeGpu) |
Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU. More... | |
void | pme_gpu_free_grids (const PmeGpu *pmeGpu) |
Frees the real space grid and the complex reciprocal grid (if needed) on the GPU. More... | |
void | pme_gpu_clear_grids (const PmeGpu *pmeGpu) |
Clears the real space grid on the GPU. Should be called at the end of each computation. More... | |
void | pme_gpu_realloc_and_copy_fract_shifts (PmeGpu *pmeGpu) |
Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU. More... | |
void | pme_gpu_free_fract_shifts (const PmeGpu *pmeGpu) |
Frees the pre-computed fractional coordinates' shifts on the GPU. More... | |
void | pme_gpu_copy_input_gather_grid (const PmeGpu *pmeGpu, float *h_grid) |
Copies the input real-space grid from the host to the GPU. More... | |
void | pme_gpu_copy_output_spread_grid (const PmeGpu *pmeGpu, float *h_grid) |
Copies the output real-space grid from the GPU to the host. More... | |
void | pme_gpu_copy_output_spread_atom_data (const PmeGpu *pmeGpu) |
Copies the spread output spline data and gridline indices from the GPU to the host. More... | |
void | pme_gpu_copy_input_gather_atom_data (const PmeGpu *pmeGpu) |
Copies the gather input spline data and gridline indices from the host to the GPU. More... | |
void | pme_gpu_sync_spread_grid (const PmeGpu *pmeGpu) |
Waits for the grid copying to the host-side buffer after spreading to finish. More... | |
void | pme_gpu_init_internal (PmeGpu *pmeGpu) |
Does the one-time GPU-framework specific PME initialization. For CUDA, the PME stream is created with the highest priority. More... | |
void | pme_gpu_destroy_specific (const PmeGpu *pmeGpu) |
Destroys the PME GPU-framework specific data. Should be called last in the PME GPU destructor. More... | |
void | pme_gpu_init_sync_events (const PmeGpu *pmeGpu) |
Initializes the PME GPU synchronization events. More... | |
void | pme_gpu_destroy_sync_events (const PmeGpu *pmeGpu) |
Destroys the PME GPU synchronization events. More... | |
void | pme_gpu_reinit_3dfft (const PmeGpu *pmeGpu) |
Initializes the CUDA FFT structures. More... | |
void | pme_gpu_destroy_3dfft (const PmeGpu *pmeGpu) |
Destroys the CUDA FFT structures. More... | |
void | pme_gpu_update_timings (const PmeGpu *pmeGpu) |
Finalizes all the active PME GPU stage timings for the current computation. Should be called at the end of every computation. More... | |
void | pme_gpu_reinit_timings (const PmeGpu *pmeGpu) |
Updates the internal list of active PME GPU stages (if timings are enabled). More... | |
void | pme_gpu_reset_timings (const PmeGpu *pmeGpu) |
Resets the PME GPU timings. To be called at the reset MD step. More... | |
void | pme_gpu_get_timings (const PmeGpu *pmeGpu, gmx_wallclock_gpu_pme_t *timings) |
Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end. More... | |
void | pme_gpu_spread (const PmeGpu *pmeGpu, int gridIndex, real *h_grid, bool computeSplines, bool spreadCharges) |
A GPU spline computation and charge spreading function. More... | |
void | pme_gpu_3dfft (const PmeGpu *pmeGpu, enum gmx_fft_direction direction, const int gridIndex) |
3D FFT R2C/C2R routine. More... | |
void | pme_gpu_solve (const PmeGpu *pmeGpu, t_complex *h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial) |
A GPU Fourier space solving function. More... | |
void | pme_gpu_gather (PmeGpu *pmeGpu, PmeForceOutputHandling forceTreatment, const float *h_grid) |
A GPU force gathering function. More... | |
bool | pme_gpu_uses_dd (const PmeGpu *pmeGpu) |
Tells if PME runs on multiple GPUs with the decomposition. More... | |
bool | pme_gpu_performs_gather (const PmeGpu *pmeGpu) |
Tells if PME performs the gathering stage on GPU. More... | |
bool | pme_gpu_performs_FFT (const PmeGpu *pmeGpu) |
Tells if PME performs the FFT stages on GPU. More... | |
bool | pme_gpu_performs_wrapping (const PmeGpu *pmeGpu) |
Tells if PME performs the grid (un-)wrapping on GPU. More... | |
bool | pme_gpu_performs_solve (const PmeGpu *pmeGpu) |
Tells if PME performs the grid solving on GPU. More... | |
void | pme_gpu_set_testing (PmeGpu *pmeGpu, bool testing) |
Enables or disables the testing mode. Testing mode only implies copying all the outputs, even the intermediate ones, to the host, and also makes the copies synchronous. More... | |
bool | pme_gpu_is_testing (const PmeGpu *pmeGpu) |
Tells if PME is in the testing mode. More... | |
gmx::ArrayRef< gmx::RVec > | pme_gpu_get_forces (PmeGpu *pmeGpu) |
Returns the GPU gathering staging forces buffer. More... | |
void | pme_gpu_get_energy_virial (const PmeGpu *pmeGpu, real *energy, matrix virial) |
Returns the output virial and energy of the PME solving. More... | |
void | pme_gpu_update_input_box (PmeGpu *pmeGpu, const matrix box) |
Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation(). More... | |
void | pme_gpu_finish_computation (const PmeGpu *pmeGpu) |
Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host. If forces were computed, they will have arrived at the external host buffer provided to gather. If virial/energy were computed, they will have arrived into the internal staging buffer (even though that should have already happened before even launching the gather). Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness. Additionally, device-side buffers are cleared asynchronously for the next computation. More... | |
void | pme_gpu_transform_spline_atom_data (const PmeGpu *pmeGpu, const pme_atomcomm_t *atc, PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform) |
Rearranges the atom spline data between the GPU and host layouts. Only used for test purposes so far, likely to be horribly slow. More... | |
void | pme_gpu_get_real_grid_sizes (const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx::IVec *paddedGridSize) |
Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests. More... | |
void | pme_gpu_reinit (gmx_pme_t *pme, gmx_device_info_t *gpuInfo) |
(Re-)initializes the PME GPU data at the beginning of the run or on DLB. More... | |
void | pme_gpu_destroy (PmeGpu *pmeGpu) |
Destroys the PME GPU data at the end of the run. More... | |
void | pme_gpu_reinit_atoms (PmeGpu *pmeGpu, const int nAtoms, const real *charges) |
Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU. More... | |
void | pme_gpu_reinit_computation (const PmeGpu *pmeGpu) |
The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. More... | |
Variables | |
const bool | c_usePadding = true |
false: The atom data GPU buffers are sized precisely according to the number of atoms. (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp). The atom index checks in the spread/gather code potentially hinder the performance. true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms fitting in is divisible by PME_ATOM_DATA_ALIGNMENT. The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen. Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now). More... | |
const bool | c_skipNeutralAtoms = false |
false: Atoms with zero charges are processed by PME. Could introduce some overhead. true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather. Could be good for performance in specific systems with lots of neutral atoms. More... | |
const int | c_virialAndEnergyCount = 7 |
Number of PME solve output floating point numbers. 6 for symmetric virial matrix + 1 for reciprocal energy. | |
void pme_gpu_3dfft | ( | const PmeGpu * | pmeGpu, |
enum gmx_fft_direction | direction, | ||
const int | gridIndex | ||
) |
3D FFT R2C/C2R routine.
[in] | pmeGpu | The PME GPU structure. |
[in] | direction | Transform direction (real-to-complex or complex-to-real) |
[in] | gridIndex | Index of the PME grid - unused, assumed to be 0. |
void pme_gpu_alloc_energy_virial | ( | const PmeGpu * | pmeGpu | ) |
Allocates the fixed size energy and virial buffer both on GPU and CPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_clear_energy_virial | ( | const PmeGpu * | pmeGpu | ) |
Clears the energy and virial memory on GPU with 0. Should be called at the end of PME computation which returned energy/virial.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_clear_grids | ( | const PmeGpu * | pmeGpu | ) |
Clears the real space grid on the GPU. Should be called at the end of each computation.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_copy_input_coordinates | ( | const PmeGpu * | pmeGpu, |
const rvec * | h_coordinates | ||
) |
Copies the input coordinates from the CPU buffer onto the GPU.
[in] | pmeGpu | The PME GPU structure. |
[in] | h_coordinates | Input coordinates (XYZ rvec array). |
Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
void pme_gpu_copy_input_forces | ( | PmeGpu * | pmeGpu | ) |
Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces). To be called e.g. after the bonded calculations.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_copy_input_gather_atom_data | ( | const PmeGpu * | pmeGpu | ) |
Copies the gather input spline data and gridline indices from the host to the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_copy_input_gather_grid | ( | const PmeGpu * | pmeGpu, |
float * | h_grid | ||
) |
Copies the input real-space grid from the host to the GPU.
[in] | pmeGpu | The PME GPU structure. |
[in] | h_grid | The host-side grid buffer. |
void pme_gpu_copy_output_forces | ( | PmeGpu * | pmeGpu | ) |
Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_copy_output_spread_atom_data | ( | const PmeGpu * | pmeGpu | ) |
Copies the spread output spline data and gridline indices from the GPU to the host.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_copy_output_spread_grid | ( | const PmeGpu * | pmeGpu, |
float * | h_grid | ||
) |
Copies the output real-space grid from the GPU to the host.
[in] | pmeGpu | The PME GPU structure. |
[out] | h_grid | The host-side grid buffer. |
void pme_gpu_destroy | ( | PmeGpu * | pmeGpu | ) |
Destroys the PME GPU data at the end of the run.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_destroy_3dfft | ( | const PmeGpu * | pmeGpu | ) |
Destroys the CUDA FFT structures.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_destroy_specific | ( | const PmeGpu * | pmeGpu | ) |
Destroys the PME GPU-framework specific data. Should be called last in the PME GPU destructor.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_destroy_sync_events | ( | const PmeGpu * | pmeGpu | ) |
Destroys the PME GPU synchronization events.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_finish_computation | ( | const PmeGpu * | pmeGpu | ) |
Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host. If forces were computed, they will have arrived at the external host buffer provided to gather. If virial/energy were computed, they will have arrived into the internal staging buffer (even though that should have already happened before even launching the gather). Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness. Additionally, device-side buffers are cleared asynchronously for the next computation.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_bspline_values | ( | const PmeGpu * | pmeGpu | ) |
Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_coefficients | ( | const PmeGpu * | pmeGpu | ) |
Frees the charges/coefficients on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_coordinates | ( | const PmeGpu * | pmeGpu | ) |
Frees the coordinates on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_energy_virial | ( | PmeGpu * | pmeGpu | ) |
Frees the energy and virial memory both on GPU and CPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_forces | ( | const PmeGpu * | pmeGpu | ) |
Frees the GPU buffer for the PME forces.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_fract_shifts | ( | const PmeGpu * | pmeGpu | ) |
Frees the pre-computed fractional coordinates' shifts on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_grid_indices | ( | const PmeGpu * | pmeGpu | ) |
Frees the buffer on the GPU for the particle gridline indices.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_grids | ( | const PmeGpu * | pmeGpu | ) |
Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_spline_data | ( | const PmeGpu * | pmeGpu | ) |
Frees the buffers on the GPU for the atoms spline data.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_gather | ( | PmeGpu * | pmeGpu, |
PmeForceOutputHandling | forceTreatment, | ||
const float * | h_grid | ||
) |
A GPU force gathering function.
[in] | pmeGpu | The PME GPU structure. |
[in] | forceTreatment | Tells how data in h_forces should be treated. TODO: determine efficiency/balance of host/device-side reductions. |
[in] | h_grid | The host-side grid buffer (used only in testing mode) |
int pme_gpu_get_atom_data_alignment | ( | const PmeGpu * | pmeGpu | ) |
Returns the number of atoms per chunk in the atom charges/coordinates data layout. Depends on CUDA-specific block sizes, needed for the atom data padding.
[in] | pmeGpu | The PME GPU structure. |
int pme_gpu_get_atoms_per_warp | ( | const PmeGpu * | pmeGpu | ) |
Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
[in] | pmeGpu | The PME GPU structure. |
Returns the output virial and energy of the PME solving.
[in] | pmeGpu | The PME GPU structure. |
[out] | energy | The output energy. |
[out] | virial | The output virial matrix. |
gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces | ( | PmeGpu * | pmeGpu | ) |
Returns the GPU gathering staging forces buffer.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_get_real_grid_sizes | ( | const PmeGpu * | pmeGpu, |
gmx::IVec * | gridSize, | ||
gmx::IVec * | paddedGridSize | ||
) |
Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
[in] | pmeGpu | The PME GPU structure. |
[out] | gridSize | Pointer to the grid dimensions to fill in. |
[out] | paddedGridSize | Pointer to the padded grid dimensions to fill in. |
void pme_gpu_get_timings | ( | const PmeGpu * | pmeGpu, |
gmx_wallclock_gpu_pme_t * | timings | ||
) |
Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end.
[in] | pmeGpu | The PME GPU structure. |
[out] | timings | The gmx_wallclock_gpu_pme_t structure to fill with the timings. |
void pme_gpu_init_internal | ( | PmeGpu * | pmeGpu | ) |
Does the one-time GPU-framework specific PME initialization. For CUDA, the PME stream is created with the highest priority.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_init_sync_events | ( | const PmeGpu * | pmeGpu | ) |
Initializes the PME GPU synchronization events.
[in] | pmeGpu | The PME GPU structure. |
|
inline |
Tells if PME is in the testing mode.
[in] | pmeGpu | The PME GPU structure. |
|
inline |
Tells if PME performs the FFT stages on GPU.
[in] | pmeGpu | The PME GPU structure. |
|
inline |
Tells if PME performs the gathering stage on GPU.
[in] | pmeGpu | The PME GPU structure. |
|
inline |
Tells if PME performs the grid solving on GPU.
[in] | pmeGpu | The PME GPU structure. |
|
inline |
Tells if PME performs the grid (un-)wrapping on GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_and_copy_bspline_values | ( | const PmeGpu * | pmeGpu | ) |
Reallocates and copies the pre-computed B-spline values to the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_and_copy_fract_shifts | ( | PmeGpu * | pmeGpu | ) |
Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_and_copy_input_coefficients | ( | const PmeGpu * | pmeGpu, |
const float * | h_coefficients | ||
) |
Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer. Clears the padded part if needed.
[in] | pmeGpu | The PME GPU structure. |
[in] | h_coefficients | The input atom charges/coefficients. |
Does not need to be done for every PME computation, only whenever the local charges change. (So, in the beginning of the run, or on DD step).
void pme_gpu_realloc_coordinates | ( | const PmeGpu * | pmeGpu | ) |
Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
[in] | pmeGpu | The PME GPU structure. |
Needs to be called on every DD step/in the beginning.
void pme_gpu_realloc_forces | ( | PmeGpu * | pmeGpu | ) |
Reallocates the GPU buffer for the PME forces.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_grid_indices | ( | const PmeGpu * | pmeGpu | ) |
Reallocates the buffers on the GPU and the host for the particle gridline indices.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_grids | ( | PmeGpu * | pmeGpu | ) |
Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_spline_data | ( | const PmeGpu * | pmeGpu | ) |
Reallocates the buffers on the GPU and the host for the atoms spline data.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_reinit | ( | gmx_pme_t * | pme, |
gmx_device_info_t * | gpuInfo | ||
) |
(Re-)initializes the PME GPU data at the beginning of the run or on DLB.
[in,out] | pme | The PME structure. |
[in,out] | gpuInfo | The GPU information structure. |
gmx::NotImplementedError | if this generally valid PME structure is not valid for GPU runs. |
void pme_gpu_reinit_3dfft | ( | const PmeGpu * | pmeGpu | ) |
Initializes the CUDA FFT structures.
[in] | pmeGpu | The PME GPU structure. |
Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
[in] | pmeGpu | The PME GPU structure. |
[in] | nAtoms | The number of particles. |
[in] | charges | The pointer to the host-side array of particle charges. |
This is a function that should only be called in the beginning of the run and on domain decomposition. Should be called before the pme_gpu_set_io_ranges.
void pme_gpu_reinit_computation | ( | const PmeGpu * | pmeGpu | ) |
The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
This clears the device-side working buffers in preparation for new computation.
[in] | pmeGpu | The PME GPU structure. |
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_reinit_timings | ( | const PmeGpu * | pmeGpu | ) |
Updates the internal list of active PME GPU stages (if timings are enabled).
[in] | pmeGpu | The PME GPU data structure. |
void pme_gpu_reset_timings | ( | const PmeGpu * | pmeGpu | ) |
Resets the PME GPU timings. To be called at the reset MD step.
[in] | pmeGpu | The PME GPU structure. |
|
inline |
Enables or disables the testing mode. Testing mode only implies copying all the outputs, even the intermediate ones, to the host, and also makes the copies synchronous.
[in] | pmeGpu | The PME GPU structure. |
[in] | testing | Should the testing mode be enabled, or disabled. |
void pme_gpu_solve | ( | const PmeGpu * | pmeGpu, |
t_complex * | h_grid, | ||
GridOrdering | gridOrdering, | ||
bool | computeEnergyAndVirial | ||
) |
A GPU Fourier space solving function.
[in] | pmeGpu | The PME GPU structure. |
[in,out] | h_grid | The host-side input and output Fourier grid buffer (used only with testing or host-side FFT) |
[in] | gridOrdering | Specifies the dimension ordering of the complex grid. TODO: store this information? |
[in] | computeEnergyAndVirial | Tells if the energy and virial computation should also be performed. |
void pme_gpu_spread | ( | const PmeGpu * | pmeGpu, |
int | gridIndex, | ||
real * | h_grid, | ||
bool | computeSplines, | ||
bool | spreadCharges | ||
) |
A GPU spline computation and charge spreading function.
[in] | pmeGpu | The PME GPU structure. |
[in] | gridIndex | Index of the PME grid - unused, assumed to be 0. |
[out] | h_grid | The host-side grid buffer (used only if the result of the spread is expected on the host, e.g. testing or host-side FFT) |
[in] | computeSplines | Should the computation of spline parameters and gridline indices be performed. |
[in] | spreadCharges | Should the charges/coefficients be spread on the grid. |
bool pme_gpu_stream_query | ( | const PmeGpu * | pmeGpu | ) |
Checks whether work in the PME GPU stream has completed.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_sync_spread_grid | ( | const PmeGpu * | pmeGpu | ) |
Waits for the grid copying to the host-side buffer after spreading to finish.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_synchronize | ( | const PmeGpu * | pmeGpu | ) |
Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_transform_spline_atom_data | ( | const PmeGpu * | pmeGpu, |
const pme_atomcomm_t * | atc, | ||
PmeSplineDataType | type, | ||
int | dimIndex, | ||
PmeLayoutTransform | transform | ||
) |
Rearranges the atom spline data between the GPU and host layouts. Only used for test purposes so far, likely to be horribly slow.
[in] | pmeGpu | The PME GPU structure. |
[out] | atc | The PME CPU atom data structure (with a single-threaded layout). |
[in] | type | The spline data type (values or derivatives). |
[in] | dimIndex | Dimension index. |
[in] | transform | Layout transform type |
void pme_gpu_update_input_box | ( | PmeGpu * | pmeGpu, |
const matrix | box | ||
) |
Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
[in] | pmeGpu | The PME GPU structure. |
[in] | box | The unit cell box. |
void pme_gpu_update_timings | ( | const PmeGpu * | pmeGpu | ) |
Finalizes all the active PME GPU stage timings for the current computation. Should be called at the end of every computation.
[in] | pmeGpu | The PME GPU structure. |
|
inline |
Tells if PME runs on multiple GPUs with the decomposition.
[in] | pmeGpu | The PME GPU structure. |
const bool c_skipNeutralAtoms = false |
false: Atoms with zero charges are processed by PME. Could introduce some overhead. true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather. Could be good for performance in specific systems with lots of neutral atoms.
const bool c_usePadding = true |
false: The atom data GPU buffers are sized precisely according to the number of atoms. (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp). The atom index checks in the spread/gather code potentially hinder the performance. true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms fitting in is divisible by PME_ATOM_DATA_ALIGNMENT. The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen. Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).