Implements PME GPU spline calculation and charge spreading in SYCL.
- Author
- Andrey Alekseenko al42a.nosp@m.nd@g.nosp@m.mail..nosp@m.com
|
#define | INSTANTIATE_3(order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) template class PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, true, true, numGrids, writeGlobal, threadsPerAtom, subGroupSize>; |
| Kernel instantiations.
|
|
#define | INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize) |
|
#define | INSTANTIATE(order, subGroupSize) |
|
|
template<int order, bool wrapX, bool wrapY, ThreadsPerAtom threadsPerAtom, int subGroupSize> |
void | spread_charges (const float atomCharge, const int realGridSize[DIM], const int realGridSizePadded[DIM], sycl::global_ptr< float > gm_grid, const sycl::local_ptr< int > sm_gridlineIndices, const sycl::local_ptr< float > sm_theta, const sycl::nd_item< 3 > &itemIdx) |
| Charge spreading onto the grid. This corresponds to the CPU function spread_coefficients_bsplines_thread(). Optional second stage of the spline_and_spread_kernel. More...
|
|
template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize> |
auto | pmeSplineAndSpreadKernel (sycl::handler &cgh, const int nAtoms, float *__restrict__ gm_realGrid_0, float *__restrict__ gm_realGrid_1, float *__restrict__ gm_theta, float *__restrict__ gm_dtheta, int *__restrict__ gm_gridlineIndices, const float *__restrict__ gm_fractShiftsTable, const int *__restrict__ gm_gridlineIndicesTable, const float *__restrict__ gm_coefficients_0, const float *__restrict__ gm_coefficients_1, const Float3 *__restrict__ gm_coordinates, const gmx::IVec tablesOffsets, const gmx::IVec realGridSize, const gmx::RVec realGridSizeFP, const gmx::IVec realGridSizePadded, const gmx::RVec currentRecipBox0, const gmx::RVec currentRecipBox1, const gmx::RVec currentRecipBox2, const PmeGpuPipeliningParams pipeliningParams) |
| A spline computation and charge spreading kernel function. More...
|
|
#define INSTANTIATE |
( |
|
order, |
|
|
|
subGroupSize |
|
) |
| |
Value:
Use a number of threads equal to the PME order (ie. 4)
Use a number of threads equal to the square of the PME order (ie. 16)
#define INSTANTIATE_2 |
( |
|
order, |
|
|
|
numGrids, |
|
|
|
threadsPerAtom, |
|
|
|
subGroupSize |
|
) |
| |
Value:
#define INSTANTIATE_3(order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize)
Kernel instantiations.
Definition: pme_spread_sycl.cpp:435
static int numGrids(const GridSet::DomainSetup &domainSetup)
Returns the number of search grids.
Definition: gridset.cpp:67
template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
auto pmeSplineAndSpreadKernel |
( |
sycl::handler & |
cgh, |
|
|
const int |
nAtoms, |
|
|
float *__restrict__ |
gm_realGrid_0, |
|
|
float *__restrict__ |
gm_realGrid_1, |
|
|
float *__restrict__ |
gm_theta, |
|
|
float *__restrict__ |
gm_dtheta, |
|
|
int *__restrict__ |
gm_gridlineIndices, |
|
|
const float *__restrict__ |
gm_fractShiftsTable, |
|
|
const int *__restrict__ |
gm_gridlineIndicesTable, |
|
|
const float *__restrict__ |
gm_coefficients_0, |
|
|
const float *__restrict__ |
gm_coefficients_1, |
|
|
const Float3 *__restrict__ |
gm_coordinates, |
|
|
const gmx::IVec |
tablesOffsets, |
|
|
const gmx::IVec |
realGridSize, |
|
|
const gmx::RVec |
realGridSizeFP, |
|
|
const gmx::IVec |
realGridSizePadded, |
|
|
const gmx::RVec |
currentRecipBox0, |
|
|
const gmx::RVec |
currentRecipBox1, |
|
|
const gmx::RVec |
currentRecipBox2, |
|
|
const PmeGpuPipeliningParams |
pipeliningParams |
|
) |
| |
A spline computation and charge spreading kernel function.
Two tuning parameters can be used for additional performance. For small systems and for debugging writeGlobal should be used removing the need to recalculate the theta values in the gather kernel. Similarly for large systems, with useOrderThreads, using order threads per atom gives higher performance than order*order threads.
- Template Parameters
-
order | PME interpolation order. |
computeSplines | A boolean which tells if the spline parameter and gridline indices' computation should be performed. |
spreadCharges | A boolean which tells if the charge spreading should be performed. |
wrapX | A boolean which tells if the grid overlap in dimension X should be wrapped. |
wrapY | A boolean which tells if the grid overlap in dimension Y should be wrapped. |
numGrids | The number of grids to use in the kernel. Can be 1 or 2. |
writeGlobal | A boolean which tells if the theta values and gridlines should be written to global memory. |
threadsPerAtom | How many threads work on each atom. |
subGroupSize | Size of the sub-group. |
template<int order, bool wrapX, bool wrapY, ThreadsPerAtom threadsPerAtom, int subGroupSize>
void spread_charges |
( |
const float |
atomCharge, |
|
|
const int |
realGridSize[DIM], |
|
|
const int |
realGridSizePadded[DIM], |
|
|
sycl::global_ptr< float > |
gm_grid, |
|
|
const sycl::local_ptr< int > |
sm_gridlineIndices, |
|
|
const sycl::local_ptr< float > |
sm_theta, |
|
|
const sycl::nd_item< 3 > & |
itemIdx |
|
) |
| |
|
inline |
Charge spreading onto the grid. This corresponds to the CPU function spread_coefficients_bsplines_thread(). Optional second stage of the spline_and_spread_kernel.
- Template Parameters
-
order | PME interpolation order. |
wrapX | Whether the grid overlap in dimension X should be wrapped. |
wrapY | Whether the grid overlap in dimension Y should be wrapped. |
threadsPerAtom | How many threads work on each atom. |
subGroupSize | Size of the sub-group. |
- Parameters
-
[in] | atomCharge | Atom charge/coefficient of atom processed by thread. |
[in] | realGridSize | Size of the real grid. |
[in] | realGridSizePadded | Padded of the real grid. |
[in,out] | gm_grid | Device pointer to the real grid to which charges are added. |
[in] | sm_gridlineIndices | Atom gridline indices in the local memory. |
[in] | sm_theta | Atom spline values in the local memory. |
[in] | itemIdx | Current thread ID. |