Gromacs
2024.4
|
#include "gmxpre.h"
#include "config.h"
#include <list>
#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/math/boxmatrix.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/stringutil.h"
#include "pme_gpu_internal.h"
#include "pme_gpu_settings.h"
#include "pme_gpu_timings.h"
#include "pme_gpu_types_host.h"
#include "pme_grid.h"
#include "pme_internal.h"
#include "pme_solve.h"
Implements high-level PME GPU functions which do not require GPU framework-specific code.
Functions | |
static bool | pme_gpu_active (const gmx_pme_t *pme) |
Finds out if PME is currently running on GPU. More... | |
void | pme_gpu_reset_timings (const gmx_pme_t *pme) |
Resets the PME GPU timings. To be called at the reset step. More... | |
void | pme_gpu_get_timings (const gmx_pme_t *pme, gmx_wallclock_gpu_pme_t *timings) |
Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end. More... | |
int | pme_gpu_get_block_size (const gmx_pme_t *pme) |
Returns the block size requirement. More... | |
void | parallel_3dfft_execute_gpu_wrapper (gmx_pme_t *pme, const int gridIndex, enum gmx_fft_direction dir, gmx_wallcycle *wcycle) |
A convenience wrapper for launching either the GPU or CPU FFT. More... | |
void | pme_gpu_prepare_computation (gmx_pme_t *pme, const matrix box, gmx_wallcycle *wcycle, const gmx::StepWorkload &stepWork) |
Prepares the PME computation on the GPU (updating the box if needed). More... | |
void | pme_gpu_launch_spread (gmx_pme_t *pme, GpuEventSynchronizer *xReadyOnDevice, gmx_wallcycle *wcycle, const real lambdaQ, const bool useGpuDirectComm, gmx::PmeCoordinateReceiverGpu *pmeCoordinateReceiverGpu, const bool useMdGpuGraph) |
Launches first stage of PME on GPU - spreading kernel. More... | |
void | pme_gpu_launch_complex_transforms (gmx_pme_t *pme, gmx_wallcycle *wcycle, const gmx::StepWorkload &stepWork) |
Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode. More... | |
void | pme_gpu_launch_gather (const gmx_pme_t *pme, gmx_wallcycle gmx_unused *wcycle, const real lambdaQ, const bool computeVirial) |
static void | sum_forces (gmx::ArrayRef< gmx::RVec > f, gmx::ArrayRef< const gmx::RVec > forceToAdd) |
Accumulate forceToAdd into f, using the available threads. | |
static void | pme_gpu_reduce_outputs (const bool computeEnergyAndVirial, const PmeOutput &output, gmx_wallcycle *wcycle, gmx::ForceWithVirial *forceWithVirial, gmx_enerdata_t *enerd) |
Reduce quantities from output to forceWithVirial and enerd. | |
bool | pme_gpu_try_finish_task (gmx_pme_t *pme, const gmx::StepWorkload &stepWork, gmx_wallcycle *wcycle, gmx::ForceWithVirial *forceWithVirial, gmx_enerdata_t *enerd, const real lambdaQ, GpuTaskCompletion completionKind) |
Attempts to complete PME GPU tasks. More... | |
PmeOutput | pme_gpu_wait_finish_task (gmx_pme_t *pme, const bool computeEnergyAndVirial, const real lambdaQ, gmx_wallcycle *wcycle) |
Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed). More... | |
void | pme_gpu_wait_and_reduce (gmx_pme_t *pme, const gmx::StepWorkload &stepWork, gmx_wallcycle *wcycle, gmx::ForceWithVirial *forceWithVirial, gmx_enerdata_t *enerd, const real lambdaQ) |
Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed). More... | |
void | pme_gpu_reinit_computation (const gmx_pme_t *pme, const bool gpuGraphWithSeparatePmeRank, gmx_wallcycle *wcycle) |
The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. More... | |
DeviceBuffer< gmx::RVec > | pme_gpu_get_device_f (const gmx_pme_t *pme) |
Get pointer to device copy of force data. More... | |
void | pme_gpu_set_device_x (const gmx_pme_t *pme, DeviceBuffer< gmx::RVec > d_x) |
Set pointer to device copy of coordinate data. More... | |
GpuEventSynchronizer * | pme_gpu_get_f_ready_synchronizer (const gmx_pme_t *pme) |
Get pointer to the device synchronizer object that allows syncing on PME force calculation completion. More... | |
void | pme_gpu_use_nvshmem (PmeGpu *pmeGpu, bool useNvshmem) |
Sets the nvshmem usage status, and allocates required structs if NVSHMEM should be used. More... | |
|
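void parallel_3dfft_execute_gpu_wrapper | ( | gmx_pme_t * | pme, |
const int | gridIndex, | ||
enum gmx_fft_direction | dir, | ||
gmx_wallcycle * | wcycle | ||
) |
inline |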
A convenience wrapper for launching either the GPU or CPU FFT.
[in] | pme | The PME structure. |
[in] | gridIndex | The grid index - should currently always be 0. |
[in] | dir | The FFT direction enum. |
[in] | wcycle | The wallclock counter. |
|
static bool pme_gpu_active | ( | const gmx_pme_t * | pme | )
inline static |
Finds out if PME is currently running on GPU.
[in] | pme | The PME structure. |
int pme_gpu_get_block_size | ( | const gmx_pme_t * | pme | ) |
Returns the block size requirement.
The GPU version of PME requires that the coordinates array have a size divisible by the returned number.
[in] | pme | The PME data structure. |
DeviceBuffer<gmx::RVec> pme_gpu_get_device_f | ( | const gmx_pme_t * | pme | ) |
Get pointer to device copy of force data.
[in] | pme | The PME data structure. |
GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer | ( | const gmx_pme_t * | pme | ) |
Get pointer to the device synchronizer object that allows syncing on PME force calculation completion.
[in] | pme | The PME data structure. |
void pme_gpu_get_timings | ( | const gmx_pme_t * | pme, |
gmx_wallclock_gpu_pme_t * | timings | ||
) |
Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end.
[in] | pme | The PME structure. |
[in] | timings | The gmx_wallclock_gpu_pme_t structure. |
void pme_gpu_launch_complex_transforms | ( | gmx_pme_t * | pme, |
gmx_wallcycle * | wcycle, | ||
const gmx::StepWorkload & | stepWork | ||
) |
Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
[in] | pme | The PME data structure. |
[in] | wcycle | The wallclock counter. |
[in] | stepWork | The required work for this simulation step |
void pme_gpu_launch_spread | ( | gmx_pme_t * | pme, |
GpuEventSynchronizer * | xReadyOnDevice, | ||
gmx_wallcycle * | wcycle, | ||
real | lambdaQ, | ||
bool | useGpuDirectComm, | ||
gmx::PmeCoordinateReceiverGpu * | pmeCoordinateReceiverGpu, | ||
bool | useMdGpuGraph | ||
) |
Launches first stage of PME on GPU - spreading kernel.
[in] | pme | The PME data structure. |
[in] | xReadyOnDevice | Event synchronizer indicating that the coordinates are ready in the device memory; nullptr allowed only on separate PME ranks. |
[in] | wcycle | The wallclock counter. |
[in] | lambdaQ | The Coulomb lambda of the current state of the system. Only used if FEP of Coulomb is active. |
[in] | useGpuDirectComm | Whether direct GPU PME-PP communication is active |
[in] | pmeCoordinateReceiverGpu | Coordinate receiver object, which must be valid when direct GPU PME-PP communication is active |
[in] | useMdGpuGraph | Whether MD GPU Graph is in use. |
void pme_gpu_prepare_computation | ( | gmx_pme_t * | pme, |
const matrix | box, | ||
gmx_wallcycle * | wcycle, | ||
const gmx::StepWorkload & | stepWork | ||
) |
Prepares the PME computation on the GPU (updating the box if needed).
[in] | pme | The PME data structure. |
[in] | box | The unit cell box. |
[in] | wcycle | The wallclock counter. |
[in] | stepWork | The required work for this simulation step |
void pme_gpu_reinit_computation | ( | const gmx_pme_t * | pme, |
bool | gpuGraphWithSeparatePmeRank, | ||
gmx_wallcycle * | wcycle | ||
) |
The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
Clears the internal grid and energy/virial buffers; it is not safe to start the PME computation without calling this. Note that unlike in the nbnxn module, the force buffer does not need clearing.
[in] | pme | The PME data structure. |
[in] | gpuGraphWithSeparatePmeRank | Whether MD GPU Graph with separate PME rank is in use. |
[in] | wcycle | The wallclock counter. |
void pme_gpu_reset_timings | ( | const gmx_pme_t * | pme | ) |
Resets the PME GPU timings. To be called at the reset step.
[in] | pme | The PME structure. |
void pme_gpu_set_device_x | ( | const gmx_pme_t * | pme, |
DeviceBuffer< gmx::RVec > | d_x | ||
) |
Set pointer to device copy of coordinate data.
[in] | pme | The PME data structure. |
[in] | d_x | The pointer to the positions buffer to be set |
bool pme_gpu_try_finish_task | ( | gmx_pme_t * | pme, |
const gmx::StepWorkload & | stepWork, | ||
gmx_wallcycle * | wcycle, | ||
gmx::ForceWithVirial * | forceWithVirial, | ||
gmx_enerdata_t * | enerd, | ||
real | lambdaQ, | ||
GpuTaskCompletion | completionKind | ||
) |
Attempts to complete PME GPU tasks.
The completionKind argument controls whether the function blocks until all enqueued PME GPU tasks have completed (as pme_gpu_wait_finish_task() does) or only checks and returns immediately if they have not. When blocking, or when the tasks have already completed, it also gets the output forces by assigning the ArrayRef to the forces pointer passed in. Virial/energy are also outputs if they were to be computed.
[in] | pme | The PME data structure. |
[in] | stepWork | The required work for this simulation step |
[in] | wcycle | The wallclock counter. |
[out] | forceWithVirial | The output force and virial |
[out] | enerd | The output energies |
[in] | lambdaQ | The Coulomb lambda to use when calculating the results. |
[in] | completionKind | Indicates whether PME task completion should only be checked rather than waited for |
void pme_gpu_use_nvshmem | ( | PmeGpu * | pmeGpu, |
bool | useNvshmem | ||
) |
Sets the nvshmem usage status, and allocates required structs if NVSHMEM should be used.
[in] | pmeGpu | The PME GPU structure. |
[in] | useNvshmem | Whether NVSHMEM should be used. |
void pme_gpu_wait_and_reduce | ( | gmx_pme_t * | pme, |
const gmx::StepWorkload & | stepWork, | ||
gmx_wallcycle * | wcycle, | ||
gmx::ForceWithVirial * | forceWithVirial, | ||
gmx_enerdata_t * | enerd, | ||
real | lambdaQ | ||
) |
Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed).
[in] | pme | The PME data structure. |
[in] | stepWork | The required work for this simulation step |
[in] | wcycle | The wallclock counter. |
[out] | forceWithVirial | The output force and virial |
[out] | enerd | The output energies |
[in] | lambdaQ | The Coulomb lambda to use when calculating the results. |
PmeOutput pme_gpu_wait_finish_task | ( | gmx_pme_t * | pme, |
bool | computeEnergyAndVirial, | ||
real | lambdaQ, | ||
gmx_wallcycle * | wcycle | ||
) |
Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy (if they were to be computed).
[in] | pme | The PME data structure. |
[in] | computeEnergyAndVirial | Tells if the energy and virial computation should be performed. |
[in] | lambdaQ | The Coulomb lambda to use when calculating the results. |
[out] | wcycle | The wallclock counter. |