Gromacs
2020.4
|
#include "gmxpre.h"
#include "pme_gpu_internal.h"
#include "config.h"
#include <list>
#include <memory>
#include <string>
#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/math/units.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "pme.cuh"
#include "gromacs/ewald/pme.h"
#include "pme_gpu_3dfft.h"
#include "pme_gpu_program_impl.h"
#include "pme_gpu_timings.h"
#include "pme_gpu_types_host.h"
#include "pme_gpu_types_host_impl.h"
#include "pme_gpu_utils.h"
#include "pme_grid.h"
#include "pme_internal.h"
#include "pme_solve.h"
This file contains internal function implementations for performing the PME calculations on GPU.
Note that this file is compiled as regular C++ source in OpenCL builds, but it is treated as CUDA source in CUDA-enabled GPU builds.
Functions | |
static PmeGpuKernelParamsBase * | pme_gpu_get_kernel_params_base_ptr (const PmeGpu *pmeGpu) |
Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure. More... | |
int | pme_gpu_get_atom_data_alignment (const PmeGpu *) |
Returns the number of atoms per chunk in the atom charges/coordinates data layout. Depends on CUDA-specific block sizes, needed for the atom data padding. More... | |
int | pme_gpu_get_atoms_per_warp (const PmeGpu *pmeGpu) |
Returns the number of atoms per chunk in the atom spline theta/dtheta data layout. More... | |
void | pme_gpu_synchronize (const PmeGpu *pmeGpu) |
Synchronizes the current computation, waiting for the GPU kernels/transfers to finish. More... | |
void | pme_gpu_alloc_energy_virial (PmeGpu *pmeGpu) |
Allocates the fixed size energy and virial buffer both on GPU and CPU. More... | |
void | pme_gpu_free_energy_virial (PmeGpu *pmeGpu) |
Frees the energy and virial memory both on GPU and CPU. More... | |
void | pme_gpu_clear_energy_virial (const PmeGpu *pmeGpu) |
Clears the energy and virial memory on GPU with 0. Should be called at the end of PME computation which returned energy/virial. More... | |
void | pme_gpu_realloc_and_copy_bspline_values (PmeGpu *pmeGpu) |
Reallocates and copies the pre-computed B-spline values to the GPU. More... | |
void | pme_gpu_free_bspline_values (const PmeGpu *pmeGpu) |
Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers). More... | |
void | pme_gpu_realloc_forces (PmeGpu *pmeGpu) |
Reallocates the GPU buffer for the PME forces. More... | |
void | pme_gpu_free_forces (const PmeGpu *pmeGpu) |
Frees the GPU buffer for the PME forces. More... | |
void | pme_gpu_copy_input_forces (PmeGpu *pmeGpu) |
Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces). To be called e.g. after the bonded calculations. More... | |
void | pme_gpu_copy_output_forces (PmeGpu *pmeGpu) |
Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage. More... | |
void | pme_gpu_realloc_coordinates (const PmeGpu *pmeGpu) |
Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed). More... | |
void | pme_gpu_free_coordinates (const PmeGpu *pmeGpu) |
Frees the coordinates on the GPU. More... | |
void | pme_gpu_realloc_and_copy_input_coefficients (const PmeGpu *pmeGpu, const float *h_coefficients) |
Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer. Clears the padded part if needed. More... | |
void | pme_gpu_free_coefficients (const PmeGpu *pmeGpu) |
Frees the charges/coefficients on the GPU. More... | |
void | pme_gpu_realloc_spline_data (PmeGpu *pmeGpu) |
Reallocates the buffers on the GPU and the host for the atoms spline data. More... | |
void | pme_gpu_free_spline_data (const PmeGpu *pmeGpu) |
Frees the buffers on the GPU for the atoms spline data. More... | |
void | pme_gpu_realloc_grid_indices (PmeGpu *pmeGpu) |
Reallocates the buffers on the GPU and the host for the particle gridline indices. More... | |
void | pme_gpu_free_grid_indices (const PmeGpu *pmeGpu) |
Frees the buffer on the GPU for the particle gridline indices. More... | |
void | pme_gpu_realloc_grids (PmeGpu *pmeGpu) |
Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU. More... | |
void | pme_gpu_free_grids (const PmeGpu *pmeGpu) |
Frees the real space grid and the complex reciprocal grid (if needed) on the GPU. More... | |
void | pme_gpu_clear_grids (const PmeGpu *pmeGpu) |
Clears the real space grid on the GPU. Should be called at the end of each computation. More... | |
void | pme_gpu_realloc_and_copy_fract_shifts (PmeGpu *pmeGpu) |
Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU. More... | |
void | pme_gpu_free_fract_shifts (const PmeGpu *pmeGpu) |
Frees the pre-computed fractional coordinates' shifts on the GPU. More... | |
bool | pme_gpu_stream_query (const PmeGpu *pmeGpu) |
Checks whether work in the PME GPU stream has completed. More... | |
void | pme_gpu_copy_input_gather_grid (const PmeGpu *pmeGpu, float *h_grid) |
Copies the input real-space grid from the host to the GPU. More... | |
void | pme_gpu_copy_output_spread_grid (const PmeGpu *pmeGpu, float *h_grid) |
Copies the output real-space grid from the GPU to the host. More... | |
void | pme_gpu_copy_output_spread_atom_data (const PmeGpu *pmeGpu) |
Copies the spread output spline data and gridline indices from the GPU to the host. More... | |
void | pme_gpu_copy_input_gather_atom_data (const PmeGpu *pmeGpu) |
Copies the gather input spline data and gridline indices from the host to the GPU. More... | |
void | pme_gpu_sync_spread_grid (const PmeGpu *pmeGpu) |
Waits for the grid copying to the host-side buffer after spreading to finish. More... | |
void | pme_gpu_init_internal (PmeGpu *pmeGpu) |
Does the one-time GPU-framework specific PME initialization. For CUDA, the PME stream is created with the highest priority. More... | |
void | pme_gpu_destroy_specific (const PmeGpu *pmeGpu) |
Destroys the PME GPU-framework specific data. Should be called last in the PME GPU destructor. More... | |
void | pme_gpu_reinit_3dfft (const PmeGpu *pmeGpu) |
Initializes the CUDA FFT structures. More... | |
void | pme_gpu_destroy_3dfft (const PmeGpu *pmeGpu) |
Destroys the CUDA FFT structures. More... | |
int | getSplineParamFullIndex (int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp) |
Gets a unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The index is wrt the execution block, in range(0, atomsPerBlock * order * DIM). This is a wrapper, only used in unit tests. More... | |
void | pme_gpu_getEnergyAndVirial (const gmx_pme_t &pme, PmeOutput *output) |
Returns the energy and virial GPU outputs, useful for testing. More... | |
static void | pme_gpu_getForceOutput (PmeGpu *pmeGpu, PmeOutput *output) |
Sets the force-related members in output . More... | |
PmeOutput | pme_gpu_getOutput (const gmx_pme_t &pme, const int flags) |
Returns the GPU outputs (forces, energy and virial) More... | |
void | pme_gpu_update_input_box (PmeGpu *pmeGpu, const matrix box) |
Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation(). More... | |
static void | pme_gpu_reinit_grids (PmeGpu *pmeGpu) |
(Re-)initializes all the PME GPU data related to the grid size and cut-off. More... | |
static void | pme_gpu_copy_common_data_from (const gmx_pme_t *pme) |
Copies everything useful from the PME CPU to the PME GPU structure. The goal is to minimize interaction with the PME CPU structure in the GPU code. More... | |
static void | pme_gpu_select_best_performing_pme_spreadgather_kernels (PmeGpu *pmeGpu) |
uses heuristics to select the best performing PME gather and scatter kernels More... | |
static void | pme_gpu_init (gmx_pme_t *pme, const gmx_device_info_t *gpuInfo, PmeGpuProgramHandle pmeGpuProgram) |
Initializes the PME GPU data at the beginning of the run. TODO: this should become PmeGpu::PmeGpu() More... | |
void | pme_gpu_transform_spline_atom_data (const PmeGpu *pmeGpu, const PmeAtomComm *atc, PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform) |
Rearranges the atom spline data between the GPU and host layouts. Only used for test purposes so far, likely to be horribly slow. More... | |
void | pme_gpu_get_real_grid_sizes (const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx::IVec *paddedGridSize) |
Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests. More... | |
void | pme_gpu_reinit (gmx_pme_t *pme, const gmx_device_info_t *gpuInfo, PmeGpuProgramHandle pmeGpuProgram) |
(Re-)initializes the PME GPU data at the beginning of the run or on DLB. More... | |
void | pme_gpu_destroy (PmeGpu *pmeGpu) |
Destroys the PME GPU data at the end of the run. More... | |
void | pme_gpu_reinit_atoms (PmeGpu *pmeGpu, const int nAtoms, const real *charges) |
Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU. More... | |
void | pme_gpu_3dfft (const PmeGpu *pmeGpu, gmx_fft_direction dir, int grid_index) |
3D FFT R2C/C2R routine. More... | |
std::pair< int, int > | pmeGpuCreateGrid (const PmeGpu *pmeGpu, int blockCount) |
Given possibly large blockCount , returns a compact 1D or 2D grid for kernel scheduling, to minimize number of unused blocks. | |
static auto | selectSplineAndSpreadKernelPtr (const PmeGpu *pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal) |
Returns a pointer to appropriate spline and spread kernel based on the input bool values. More... | |
static auto | selectSplineKernelPtr (const PmeGpu *pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal) |
Returns a pointer to appropriate spline kernel based on the input bool values. More... | |
static auto | selectSpreadKernelPtr (const PmeGpu *pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal) |
Returns a pointer to appropriate spread kernel based on the input bool values. More... | |
void | pme_gpu_spread (const PmeGpu *pmeGpu, GpuEventSynchronizer *xReadyOnDevice, int gridIndex, real *h_grid, bool computeSplines, bool spreadCharges) |
A GPU spline computation and charge spreading function. More... | |
void | pme_gpu_solve (const PmeGpu *pmeGpu, t_complex *h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial) |
A GPU Fourier space solving function. More... | |
auto | selectGatherKernelPtr (const PmeGpu *pmeGpu, bool useOrderThreadsPerAtom, bool readSplinesFromGlobal, PmeForceOutputHandling forceTreatment) |
Returns a pointer to appropriate gather kernel based on the inputvalues. More... | |
void | pme_gpu_gather (PmeGpu *pmeGpu, PmeForceOutputHandling forceTreatment, const float *h_grid) |
A GPU force gathering function. More... | |
DeviceBuffer< float > | pme_gpu_get_kernelparam_coordinates (const PmeGpu *pmeGpu) |
Return pointer to device copy of coordinate data. More... | |
void * | pme_gpu_get_kernelparam_forces (const PmeGpu *pmeGpu) |
Return pointer to device copy of force data. More... | |
template<typename T > | |
static bool | checkDeviceBuffer (DeviceBuffer< T > buffer, int requiredSize) |
Check the validity of the device buffer. More... | |
void | pme_gpu_set_kernelparam_coordinates (const PmeGpu *pmeGpu, DeviceBuffer< float > d_x) |
Sets the device pointer to coordinate data. More... | |
void * | pme_gpu_get_stream (const PmeGpu *pmeGpu) |
Return pointer to GPU stream. More... | |
void * | pme_gpu_get_context (const PmeGpu *pmeGpu) |
Return pointer to GPU context (for OpenCL builds). More... | |
GpuEventSynchronizer * | pme_gpu_get_forces_ready_synchronizer (const PmeGpu *pmeGpu) |
Return pointer to the sync object triggered after the PME force calculation completion. More... | |
Variables | |
constexpr int | c_pmeGpuPerformanceAtomLimit = 23000 |
CUDA only Atom limit above which it is advantageous to turn on the recalcuating of the splines in the gather and using less threads per atom in the spline and spread. | |
|
static |
Check the validity of the device buffer.
Checks if the buffer is not nullptr and, when possible, if it is big enough.
[in] | buffer | Device buffer to be checked. |
[in] | requiredSize | Number of elements that the buffer will have to accommodate. |
int getSplineParamFullIndex | ( | int | order, |
int | splineIndex, | ||
int | dimIndex, | ||
int | atomIndex, | ||
int | atomsPerWarp | ||
) |
Gets a unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The index is wrt the execution block, in range(0, atomsPerBlock * order * DIM). This is a wrapper, only used in unit tests.
[in] | order | PME order |
[in] | splineIndex | Spline contribution index (from 0 to order - 1) |
[in] | dimIndex | Dimension index (from 0 to 2) |
[in] | atomIndex | Atom index wrt the block. |
[in] | atomsPerWarp | Number of atoms processed by a warp. |
void pme_gpu_3dfft | ( | const PmeGpu * | pmeGpu, |
enum gmx_fft_direction | direction, | ||
int | gridIndex | ||
) |
3D FFT R2C/C2R routine.
[in] | pmeGpu | The PME GPU structure. |
[in] | direction | Transform direction (real-to-complex or complex-to-real) |
[in] | gridIndex | Index of the PME grid - unused, assumed to be 0. |
void pme_gpu_alloc_energy_virial | ( | PmeGpu * | pmeGpu | ) |
Allocates the fixed size energy and virial buffer both on GPU and CPU.
[in,out] | pmeGpu | The PME GPU structure. |
void pme_gpu_clear_energy_virial | ( | const PmeGpu * | pmeGpu | ) |
Clears the energy and virial memory on GPU with 0. Should be called at the end of PME computation which returned energy/virial.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_clear_grids | ( | const PmeGpu * | pmeGpu | ) |
Clears the real space grid on the GPU. Should be called at the end of each computation.
[in] | pmeGpu | The PME GPU structure. |
|
static |
Copies everything useful from the PME CPU to the PME GPU structure. The goal is to minimize interaction with the PME CPU structure in the GPU code.
[in] | pme | The PME structure. |
void pme_gpu_copy_input_forces | ( | PmeGpu * | pmeGpu | ) |
Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces). To be called e.g. after the bonded calculations.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_copy_input_gather_atom_data | ( | const PmeGpu * | pmeGpu | ) |
Copies the gather input spline data and gridline indices from the host to the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_copy_input_gather_grid | ( | const PmeGpu * | pmeGpu, |
float * | h_grid | ||
) |
Copies the input real-space grid from the host to the GPU.
[in] | pmeGpu | The PME GPU structure. |
[in] | h_grid | The host-side grid buffer. |
void pme_gpu_copy_output_forces | ( | PmeGpu * | pmeGpu | ) |
Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_copy_output_spread_atom_data | ( | const PmeGpu * | pmeGpu | ) |
Copies the spread output spline data and gridline indices from the GPU to the host.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_copy_output_spread_grid | ( | const PmeGpu * | pmeGpu, |
float * | h_grid | ||
) |
Copies the output real-space grid from the GPU to the host.
[in] | pmeGpu | The PME GPU structure. |
[out] | h_grid | The host-side grid buffer. |
void pme_gpu_destroy | ( | PmeGpu * | pmeGpu | ) |
Destroys the PME GPU data at the end of the run.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_destroy_3dfft | ( | const PmeGpu * | pmeGpu | ) |
Destroys the CUDA FFT structures.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_destroy_specific | ( | const PmeGpu * | pmeGpu | ) |
Destroys the PME GPU-framework specific data. Should be called last in the PME GPU destructor.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_bspline_values | ( | const PmeGpu * | pmeGpu | ) |
Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_coefficients | ( | const PmeGpu * | pmeGpu | ) |
Frees the charges/coefficients on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_coordinates | ( | const PmeGpu * | pmeGpu | ) |
Frees the coordinates on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_energy_virial | ( | PmeGpu * | pmeGpu | ) |
Frees the energy and virial memory both on GPU and CPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_forces | ( | const PmeGpu * | pmeGpu | ) |
Frees the GPU buffer for the PME forces.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_fract_shifts | ( | const PmeGpu * | pmeGpu | ) |
Frees the pre-computed fractional coordinates' shifts on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_grid_indices | ( | const PmeGpu * | pmeGpu | ) |
Frees the buffer on the GPU for the particle gridline indices.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_grids | ( | const PmeGpu * | pmeGpu | ) |
Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_free_spline_data | ( | const PmeGpu * | pmeGpu | ) |
Frees the buffers on the GPU for the atoms spline data.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_gather | ( | PmeGpu * | pmeGpu, |
PmeForceOutputHandling | forceTreatment, | ||
const float * | h_grid | ||
) |
A GPU force gathering function.
[in] | pmeGpu | The PME GPU structure. |
[in] | forceTreatment | Tells how data in h_forces should be treated. TODO: determine efficiency/balance of host/device-side reductions. |
[in] | h_grid | The host-side grid buffer (used only in testing mode) |
int pme_gpu_get_atom_data_alignment | ( | const PmeGpu * | pmeGpu | ) |
Returns the number of atoms per chunk in the atom charges/coordinates data layout. Depends on CUDA-specific block sizes, needed for the atom data padding.
[in] | pmeGpu | The PME GPU structure. |
int pme_gpu_get_atoms_per_warp | ( | const PmeGpu * | pmeGpu | ) |
Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
[in] | pmeGpu | The PME GPU structure. |
void* pme_gpu_get_context | ( | const PmeGpu * | pmeGpu | ) |
Return pointer to GPU context (for OpenCL builds).
[in] | pmeGpu | The PME GPU structure. |
GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer | ( | const PmeGpu * | pmeGpu | ) |
Return pointer to the sync object triggered after the PME force calculation completion.
[in] | pmeGpu | The PME GPU structure. |
|
static |
Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
[in] | pmeGpu | The PME GPU structure. |
DeviceBuffer<float> pme_gpu_get_kernelparam_coordinates | ( | const PmeGpu * | pmeGpu | ) |
Return pointer to device copy of coordinate data.
[in] | pmeGpu | The PME GPU structure. |
void* pme_gpu_get_kernelparam_forces | ( | const PmeGpu * | pmeGpu | ) |
Return pointer to device copy of force data.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_get_real_grid_sizes | ( | const PmeGpu * | pmeGpu, |
gmx::IVec * | gridSize, | ||
gmx::IVec * | paddedGridSize | ||
) |
Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
[in] | pmeGpu | The PME GPU structure. |
[out] | gridSize | Pointer to the grid dimensions to fill in. |
[out] | paddedGridSize | Pointer to the padded grid dimensions to fill in. |
void* pme_gpu_get_stream | ( | const PmeGpu * | pmeGpu | ) |
Return pointer to GPU stream.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_getEnergyAndVirial | ( | const gmx_pme_t & | pme, |
PmeOutput * | output | ||
) |
Returns the energy and virial GPU outputs, useful for testing.
It is the caller's responsibility to be aware of whether the GPU handled the solve stage.
[in] | pme | The PME structure. |
[out] | output | Pointer to output where energy and virial should be stored. |
|
static |
Sets the force-related members in output
.
[in] | pmeGpu | PME GPU data structure |
[out] | output | Pointer to PME output data structure |
PmeOutput pme_gpu_getOutput | ( | const gmx_pme_t & | pme, |
int | flags | ||
) |
Returns the GPU outputs (forces, energy and virial)
[in] | pme | The PME structure. |
[in] | flags | The combination of flags that affected this PME computation. The flags are the GMX_PME_ flags from pme.h. |
|
static |
Initializes the PME GPU data at the beginning of the run. TODO: this should become PmeGpu::PmeGpu()
[in,out] | pme | The PME structure. |
[in,out] | gpuInfo | The GPU information structure. |
[in] | pmeGpuProgram | The handle to the program/kernel data created outside (e.g. in unit tests/runner) |
void pme_gpu_init_internal | ( | PmeGpu * | pmeGpu | ) |
Does the one-time GPU-framework specific PME initialization. For CUDA, the PME stream is created with the highest priority.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_and_copy_bspline_values | ( | PmeGpu * | pmeGpu | ) |
Reallocates and copies the pre-computed B-spline values to the GPU.
[in,out] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_and_copy_fract_shifts | ( | PmeGpu * | pmeGpu | ) |
Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_and_copy_input_coefficients | ( | const PmeGpu * | pmeGpu, |
const float * | h_coefficients | ||
) |
Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer. Clears the padded part if needed.
[in] | pmeGpu | The PME GPU structure. |
[in] | h_coefficients | The input atom charges/coefficients. |
Does not need to be done for every PME computation, only whenever the local charges change. (So, in the beginning of the run, or on DD step).
void pme_gpu_realloc_coordinates | ( | const PmeGpu * | pmeGpu | ) |
Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
[in] | pmeGpu | The PME GPU structure. |
Needs to be called on every DD step/in the beginning.
void pme_gpu_realloc_forces | ( | PmeGpu * | pmeGpu | ) |
Reallocates the GPU buffer for the PME forces.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_grid_indices | ( | PmeGpu * | pmeGpu | ) |
Reallocates the buffers on the GPU and the host for the particle gridline indices.
[in,out] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_grids | ( | PmeGpu * | pmeGpu | ) |
Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_realloc_spline_data | ( | PmeGpu * | pmeGpu | ) |
Reallocates the buffers on the GPU and the host for the atoms spline data.
[in,out] | pmeGpu | The PME GPU structure. |
void pme_gpu_reinit | ( | gmx_pme_t * | pme, |
const gmx_device_info_t * | gpuInfo, | ||
PmeGpuProgramHandle | pmeGpuProgram | ||
) |
(Re-)initializes the PME GPU data at the beginning of the run or on DLB.
[in,out] | pme | The PME structure. |
[in] | gpuInfo | The GPU information structure. |
[in] | pmeGpuProgram | The PME GPU program data |
gmx::NotImplementedError | if this generally valid PME structure is not valid for GPU runs. |
void pme_gpu_reinit_3dfft | ( | const PmeGpu * | pmeGpu | ) |
Initializes the CUDA FFT structures.
[in] | pmeGpu | The PME GPU structure. |
Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
[in] | pmeGpu | The PME GPU structure. |
[in] | nAtoms | The number of particles. |
[in] | charges | The pointer to the host-side array of particle charges. |
This is a function that should only be called in the beginning of the run and on domain decomposition. Should be called before the pme_gpu_set_io_ranges.
|
static |
(Re-)initializes all the PME GPU data related to the grid size and cut-off.
[in] | pmeGpu | The PME GPU structure. |
|
static |
uses heuristics to select the best performing PME gather and scatter kernels
[in,out] | pmeGpu | The PME GPU structure. |
void pme_gpu_set_kernelparam_coordinates | ( | const PmeGpu * | pmeGpu, |
DeviceBuffer< float > | d_x | ||
) |
Sets the device pointer to coordinate data.
[in] | pmeGpu | The PME GPU structure. |
[in] | d_x | Pointer to coordinate data |
void pme_gpu_solve | ( | const PmeGpu * | pmeGpu, |
t_complex * | h_grid, | ||
GridOrdering | gridOrdering, | ||
bool | computeEnergyAndVirial | ||
) |
A GPU Fourier space solving function.
[in] | pmeGpu | The PME GPU structure. |
[in,out] | h_grid | The host-side input and output Fourier grid buffer (used only with testing or host-side FFT) |
[in] | gridOrdering | Specifies the dimenion ordering of the complex grid. TODO: store this information? |
[in] | computeEnergyAndVirial | Tells if the energy and virial computation should also be performed. |
void pme_gpu_spread | ( | const PmeGpu * | pmeGpu, |
GpuEventSynchronizer * | xReadyOnDevice, | ||
int | gridIndex, | ||
real * | h_grid, | ||
bool | computeSplines, | ||
bool | spreadCharges | ||
) |
A GPU spline computation and charge spreading function.
[in] | pmeGpu | The PME GPU structure. |
[in] | xReadyOnDevice | Event synchronizer indicating that the coordinates are ready in the device memory; can be nullptr when invoked on a separate PME rank or from PME tests. |
[in] | gridIndex | Index of the PME grid - unused, assumed to be 0. |
[out] | h_grid | The host-side grid buffer (used only if the result of the spread is expected on the host, e.g. testing or host-side FFT) |
[in] | computeSplines | Should the computation of spline parameters and gridline indices be performed. |
[in] | spreadCharges | Should the charges/coefficients be spread on the grid. |
bool pme_gpu_stream_query | ( | const PmeGpu * | pmeGpu | ) |
Checks whether work in the PME GPU stream has completed.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_sync_spread_grid | ( | const PmeGpu * | pmeGpu | ) |
Waits for the grid copying to the host-side buffer after spreading to finish.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_synchronize | ( | const PmeGpu * | pmeGpu | ) |
Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
[in] | pmeGpu | The PME GPU structure. |
void pme_gpu_transform_spline_atom_data | ( | const PmeGpu * | pmeGpu, |
const PmeAtomComm * | atc, | ||
PmeSplineDataType | type, | ||
int | dimIndex, | ||
PmeLayoutTransform | transform | ||
) |
Rearranges the atom spline data between the GPU and host layouts. Only used for test purposes so far, likely to be horribly slow.
[in] | pmeGpu | The PME GPU structure. |
[out] | atc | The PME CPU atom data structure (with a single-threaded layout). |
[in] | type | The spline data type (values or derivatives). |
[in] | dimIndex | Dimension index. |
[in] | transform | Layout transform type |
void pme_gpu_update_input_box | ( | PmeGpu * | pmeGpu, |
const matrix | box | ||
) |
Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
[in] | pmeGpu | The PME GPU structure. |
[in] | box | The unit cell box. |
|
inline |
Returns a pointer to appropriate gather kernel based on the inputvalues.
[in] | pmeGpu | The PME GPU structure. |
[in] | useOrderThreadsPerAtom | bool controlling if we should use order or order*order threads per atom |
[in] | readSplinesFromGlobal | bool controlling if we should write spline data to global memory |
[in] | forceTreatment | Controls if the forces from the gather should increment or replace the input forces. |
|
static |
Returns a pointer to appropriate spline and spread kernel based on the input bool values.
[in] | pmeGpu | The PME GPU structure. |
[in] | useOrderThreadsPerAtom | bool controlling if we should use order or order*order threads per atom |
[in] | writeSplinesToGlobal | bool controlling if we should write spline data to global memory |
|
static |
Returns a pointer to appropriate spline kernel based on the input bool values.
[in] | pmeGpu | The PME GPU structure. |
[in] | useOrderThreadsPerAtom | bool controlling if we should use order or order*order threads per atom |
[in] | writeSplinesToGlobal | bool controlling if we should write spline data to global memory |
|
static |
Returns a pointer to appropriate spread kernel based on the input bool values.
[in] | pmeGpu | The PME GPU structure. |
[in] | useOrderThreadsPerAtom | bool controlling if we should use order or order*order threads per atom |
[in] | writeSplinesToGlobal | bool controlling if we should write spline data to global memory |