Gromacs
2022
|
#include "gmxpre.h"
#include "pme_gather_sycl.h"
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/gpu_utils/gputraits_sycl.h"
#include "gromacs/gpu_utils/sycl_kernel_utils.h"
#include "gromacs/gpu_utils/syclutils.h"
#include "gromacs/math/functions.h"
#include "pme_gpu_calculate_splines_sycl.h"
#include "pme_grid.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_types_host.h"
Implements PME force gathering in SYCL.
Macros | |
#define | INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, subGroupSize) template class PmeGatherKernel<order, true, true, numGrids, readGlobal, threadsPerAtom, subGroupSize>; |
Kernel instantiations. | |
#define | INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize) |
#define | INSTANTIATE(order, subGroupSize) |
Functions | |
float | readGridSize (const float *realGridSizeFP, const int dimIndex) |
Use loads from constant address space indexed by constant offsets rather than dynamic index-based accesses to the grid size data to avoid local memory operations and related large overhead. More... | |
template<int order, int atomDataSize, int workGroupSize, int subGroupSize> | |
void | reduceAtomForces (sycl::nd_item< 3 > itemIdx, sycl::local_ptr< Float3 > sm_forces, const int atomIndexLocal, const int splineIndex, const int gmx_unused lineIndex, const float realGridSizeFP[3], float &fx, float &fy, float &fz) |
Reduce the partial force contributions. More... | |
template<int order, int atomsPerWarp, bool wrapX, bool wrapY> | |
void | sumForceComponents (sycl::private_ptr< float > fx, sycl::private_ptr< float > fy, sycl::private_ptr< float > fz, const int ithyMin, const int ithyMax, const int ixBase, const int iz, const int nx, const int ny, const int pny, const int pnz, const int atomIndexLocal, const int splineIndexBase, const sycl::float2 tdz, const sycl::local_ptr< int > sm_gridlineIndices, const sycl::local_ptr< float > sm_theta, const sycl::local_ptr< float > sm_dtheta, const sycl::global_ptr< const float > gm_grid) |
Calculate the sum of the force partial components (in X, Y and Z) More... | |
void | calculateAndStoreGridForces (sycl::local_ptr< Float3 > sm_forces, const int forceIndexLocal, const int forceIndexGlobal, const Float3 &recipBox0, const Float3 &recipBox1, const Float3 &recipBox2, const float scale, const sycl::global_ptr< const float > gm_coefficients) |
Calculate the grid forces and store them in shared memory. More... | |
template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize> | |
auto | pmeGatherKernel (sycl::handler &cgh, const int nAtoms, DeviceAccessor< float, mode::read > a_gridA, OptionalAccessor< float, mode::read, numGrids==2 > a_gridB, DeviceAccessor< float, mode::read > a_coefficientsA, OptionalAccessor< float, mode::read, numGrids==2 > a_coefficientsB, OptionalAccessor< Float3, mode::read,!readGlobal > a_coordinates, DeviceAccessor< Float3, mode::read_write > a_forces, DeviceAccessor< float, mode::read > a_theta, DeviceAccessor< float, mode::read > a_dtheta, DeviceAccessor< int, mode::read > a_gridlineIndices, OptionalAccessor< float, mode::read,!readGlobal > a_fractShiftsTable, OptionalAccessor< int, mode::read,!readGlobal > a_gridlineIndicesTable, const gmx::IVec tablesOffsets, const gmx::IVec realGridSize, const gmx::RVec realGridSizeFP, const gmx::IVec realGridSizePadded, const gmx::RVec currentRecipBox0, const gmx::RVec currentRecipBox1, const gmx::RVec currentRecipBox2, const float scale) |
A SYCL kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z. More... | |
#define INSTANTIATE | ( | order, | |
subGroupSize | |||
) |
#define INSTANTIATE_2 | ( | order, | |
numGrids, | |||
threadsPerAtom, | |||
subGroupSize | |||
) |
|
inline |
Calculate the grid forces and store them in shared memory.
[in,out] | sm_forces | Shared memory array with the output forces. |
[in] | forceIndexLocal | The local (per thread) index in the sm_forces array. |
[in] | forceIndexGlobal | The index of the thread in the gm_coefficients array. |
[in] | recipBox0 | The reciprocal box (first vector). |
[in] | recipBox1 | The reciprocal box (second vector). |
[in] | recipBox2 | The reciprocal box (third vector). |
[in] | scale | The scale to use when calculating the forces. For gm_coefficientsB (when using multiple coefficients on a single grid) the scale will be (1.0 - scale). |
[in] | gm_coefficients | Global memory array of the coefficients to use for an unperturbed or FEP in state A if a single grid is used (multiCoefficientsSingleGrid == true).If two separate grids are used this should be the coefficients of the grid in question. |
auto pmeGatherKernel | ( | sycl::handler & | cgh, |
const int | nAtoms, | ||
DeviceAccessor< float, mode::read > | a_gridA, | ||
OptionalAccessor< float, mode::read, numGrids==2 > | a_gridB, | ||
DeviceAccessor< float, mode::read > | a_coefficientsA, | ||
OptionalAccessor< float, mode::read, numGrids==2 > | a_coefficientsB, | ||
OptionalAccessor< Float3, mode::read,!readGlobal > | a_coordinates, | ||
DeviceAccessor< Float3, mode::read_write > | a_forces, | ||
DeviceAccessor< float, mode::read > | a_theta, | ||
DeviceAccessor< float, mode::read > | a_dtheta, | ||
DeviceAccessor< int, mode::read > | a_gridlineIndices, | ||
OptionalAccessor< float, mode::read,!readGlobal > | a_fractShiftsTable, | ||
OptionalAccessor< int, mode::read,!readGlobal > | a_gridlineIndicesTable, | ||
const gmx::IVec | tablesOffsets, | ||
const gmx::IVec | realGridSize, | ||
const gmx::RVec | realGridSizeFP, | ||
const gmx::IVec | realGridSizePadded, | ||
const gmx::RVec | currentRecipBox0, | ||
const gmx::RVec | currentRecipBox1, | ||
const gmx::RVec | currentRecipBox2, | ||
const float | scale | ||
) |
A SYCL kernel which gathers the atom forces from the grid. The grid is assumed to be wrapped in dimension Z.
order | PME interpolation order. |
wrapX | A boolean which tells if the grid overlap in dimension X should be wrapped. |
wrapY | A boolean which tells if the grid overlap in dimension Y should be wrapped. |
numGrids | The number of grids to use in the kernel. Can be 1 or 2. |
writeGlobal | Tells if we should read spline values from global memory. |
threadsPerAtom | How many threads work on each atom. |
subGroupSize | Size of the sub-group. |
|
inline |
Use loads from constant address space indexed by constant offsets rather than dynamic index-based accesses to the grid size data to avoid local memory operations and related large overhead.
Drastically reduces register spills on AMD via hipSYCL, and improves performance 10x.
[in] | realGridSizeFP | Local grid size constant |
[in] | dimIndex | Dimension index (XX, YY, ZZ) |
|
inline |
Reduce the partial force contributions.
order | The PME order (must be 4). |
atomDataSize | The number of partial force contributions for each atom (currently order^2 == 16). |
workGroupSize | The size of a work-group. |
subGroupSize | The size of a sub-group. |
[in] | itemIdx | SYCL thread ID. |
[out] | sm_forces | Shared memory array with the output forces (number of elements is number of atoms per block). |
[in] | atomIndexLocal | Local atom index. |
[in] | splineIndex | Spline index. |
[in] | lineIndex | Line index (same as threadLocalId) |
[in] | realGridSizeFP | Local grid size constant |
[in] | fx | Input force partial component X |
[in] | fy | Input force partial component Y |
[in] | fz | Input force partial component Z |
|
inline |
Calculate the sum of the force partial components (in X, Y and Z)
order | The PME order (must be 4). |
atomsPerWarp | The number of atoms per GPU warp. |
wrapX | Tells if the grid is wrapped in the X dimension. |
wrapY | Tells if the grid is wrapped in the Y dimension. |
[out] | fx | The force partial component in the X dimension. |
[out] | fy | The force partial component in the Y dimension. |
[out] | fz | The force partial component in the Z dimension. |
[in] | ithyMin | The thread minimum index in the Y dimension. |
[in] | ithyMax | The thread maximum index in the Y dimension. |
[in] | ixBase | The grid line index base value in the X dimension. |
[in] | iz | The grid line index in the Z dimension. |
[in] | nx | The grid real size in the X dimension. |
[in] | ny | The grid real size in the Y dimension. |
[in] | pny | The padded grid real size in the Y dimension. |
[in] | pnz | The padded grid real size in the Z dimension. |
[in] | atomIndexLocal | The atom index for this thread. |
[in] | splineIndexBase | The base value of the spline parameter index. |
[in] | tdz | The theta and dtheta in the Z dimension. |
[in] | sm_gridlineIndices | Shared memory array of grid line indices. |
[in] | sm_theta | Shared memory array of atom theta values. |
[in] | sm_dtheta | Shared memory array of atom dtheta values. |
[in] | gm_grid | Global memory array of the grid to use. |