|
| template<int order, int atomsPerWarp> |
static int __device__
__forceinline__ | getSplineParamIndexBase (int warpIndex, int atomWarpIndex) |
| | Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block. Feed the result into getSplineParamIndex() to get a full index. TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it. Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme. Removing warp dependency would also be nice (and would probably coincide with removing c_pmeSpreadGatherAtomsPerWarp). More...
|
| |
| template<int order, int atomsPerWarp> |
static int __device__
__forceinline__ | getSplineParamIndex (int paramIndexBase, int dimIndex, int splineIndex) |
| | Gets a unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The index is wrt to the execution block, in range(0, atomsPerBlock * order * DIM). This function consumes result of getSplineParamIndexBase() and adjusts it for dimIndex and splineIndex. More...
|
| |
static bool __device__
__forceinline__ | pme_gpu_check_atom_charge (const float coefficient) |
| | An inline HIP function for skipping the zero-charge atoms. More...
|
| |
| template<typename T > |
| static __device__ void | assertIsFinite (T arg) |
| | Asserts if the argument is finite. More...
|
| |
|
template<> |
| __device__ void | assertIsFinite (float3 gmx_unused arg) |
| |
|
template<typename T > |
| static __device__ void | assertIsFinite (T gmx_unused arg) |
| |
| template<typename T , int atomsPerBlock, int dataCountPerAtom> |
static __device__
__forceinline__ void | pme_gpu_stage_atom_data (T *__restrict__ sm_destination, const T *__restrict__ gm_source) |
| | General purpose function for loading atom-related data from global to shared memory. More...
|
| |
| template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal, int numGrids, int parallelExecutionWidth> |
static __device__
__forceinline__ void | calculate_splines (const PmeGpuKernelParams kernelParams, const int atomIndexOffset, const float3 atomX, const float atomCharge, float *__restrict__ sm_theta, float *__restrict__ sm_dtheta, int *__restrict__ sm_gridlineIndices) |
| | PME GPU spline parameter and gridline indices calculation. This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines(). First stage of the whole kernel. More...
|
| |
template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal, int numGrids, int parallelExecutionWidth>
| static __device__ __forceinline__ void calculate_splines |
( |
const PmeGpuKernelParams |
kernelParams, |
|
|
const int |
atomIndexOffset, |
|
|
const float3 |
atomX, |
|
|
const float |
atomCharge, |
|
|
float *__restrict__ |
sm_theta, |
|
|
float *__restrict__ |
sm_dtheta, |
|
|
int *__restrict__ |
sm_gridlineIndices |
|
) |
| |
|
static |
PME GPU spline parameter and gridline indices calculation. This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines(). First stage of the whole kernel.
- Template Parameters
-
| order | PME interpolation order. |
| atomsPerBlock | Number of atoms processed by a block - should be accounted for in the sizes of the shared memory arrays. |
| atomsPerWarp | Number of atoms processed by a warp |
| writeSmDtheta | Bool controlling if the theta derivative should be written to shared memory. Enables calculation of dtheta if set. |
| writeGlobal | A boolean which tells if the theta values and gridlines should be written to global memory. Enables calculation of dtheta if set. |
| numGrids | The number of grids using the splines. |
- Parameters
-
| [in] | kernelParams | Input PME HIP data in constant memory. |
| [in] | atomIndexOffset | Starting atom index for the execution block w.r.t. global memory. |
| [in] | atomX | Atom coordinate of atom processed by thread. |
| [in] | atomCharge | Atom charge/coefficient of atom processed by thread. |
| [out] | sm_theta | Atom spline values in the shared memory. |
| [out] | sm_dtheta | Derivative of atom spline values in shared memory. |
| [out] | sm_gridlineIndices | Atom gridline indices in the shared memory. |
template<int order, int atomsPerWarp>
| static int __device__ __forceinline__ getSplineParamIndex |
( |
int |
paramIndexBase, |
|
|
int |
dimIndex, |
|
|
int |
splineIndex |
|
) |
| |
|
static |
Gets a unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The index is wrt to the execution block, in range(0, atomsPerBlock * order * DIM). This function consumes result of getSplineParamIndexBase() and adjusts it for dimIndex and splineIndex.
- Template Parameters
-
| order | PME order |
| atomsPerWarp | Number of atoms processed by a warp |
- Parameters
-
| [in] | paramIndexBase | Must be result of getSplineParamIndexBase(). |
| [in] | dimIndex | Dimension index (from 0 to 2) |
| [in] | splineIndex | Spline contribution index (from 0 to order - 1) |
- Returns
- Index into theta or dtheta array using GPU layout.
template<int order, int atomsPerWarp>
| static int __device__ __forceinline__ getSplineParamIndexBase |
( |
int |
warpIndex, |
|
|
int |
atomWarpIndex |
|
) |
| |
|
static |
Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta), which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block. Feed the result into getSplineParamIndex() to get a full index. TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it. Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme. Removing warp dependency would also be nice (and would probably coincide with removing c_pmeSpreadGatherAtomsPerWarp).
- Template Parameters
-
| order | PME order |
| atomsPerWarp | Number of atoms processed by a warp |
- Parameters
-
| [in] | warpIndex | Warp index wrt the block. |
| [in] | atomWarpIndex | Atom index wrt the warp (from 0 to atomsPerWarp - 1). |
- Returns
- Index into theta or dtheta array using GPU layout.