#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/gpu_utils/packed_float.h"
#include "gromacs/math/functions.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/template_mp.h"
#include "nbnxm_sycl_kernel.h"
#include "nbnxm_sycl_kernel_utils.h"
#include "nbnxm_sycl_types.h"

Include dependency graph for nbnxm_sycl_kernel_body.h:

This graph shows which files directly or indirectly include this file:

Description

NBNXM SYCL kernels.

Classes
class	NbnxmKernel< doPruneNBL, doCalcEnergies, elecType, vdwType, subGroupSize >
	Class name for NBNXM kernel. More...

struct	Nbnxm::EnergyFunctionProperties< elecType, vdwType >
	Set of boolean constants mimicking preprocessor macros. More...

Macros
#define	GMX_NBNXM_ENABLE_PACKED_FLOAT3 1
	Macro to control the enablement of manually-packed Float3 structure. More...

Typedefs
using	FCiFloat3 = Float3

using	Nbnxm::mode = sycl::access_mode

Functions
static Float2	Nbnxm::convertSigmaEpsilonToC6C12 (const float sigma, const float epsilon)
	Convert `sigma` and `epsilon` VdW parameters to `c6`,c12 pair.

template<bool doCalcEnergies>
static void	Nbnxm::ljForceSwitch (const shift_consts_t dispersionShift, const shift_consts_t repulsionShift, const float rVdwSwitch, const float c6, const float c12, const float rInv, const float r2, sycl::private_ptr< float > fInvR, sycl::private_ptr< float > eLJ)
	Calculate force and energy for a pair of atoms, VdW force-switch flavor.

template<enum VdwType vdwType>
static float	Nbnxm::calculateLJEwaldC6Grid (const sycl::global_ptr< const Float2 > a_nbfpComb, const int typeI, const int typeJ)
	Fetch C6 grid contribution coefficients and return the product of these.

template<bool doCalcEnergies, enum VdwType vdwType>
static void	Nbnxm::ljEwaldComb (const sycl::global_ptr< const Float2 > a_nbfpComb, const float sh_lj_ewald, const int typeI, const int typeJ, const float r2, const float r2Inv, const float lje_coeff2, const float lje_coeff6_6, const float int_bit, sycl::private_ptr< float > fInvR, sycl::private_ptr< float > eLJ)
	Calculate LJ-PME grid force contribution with geometric or LB combination rule.

template<bool doCalcEnergies>
static void	Nbnxm::ljPotentialSwitch (const switch_consts_t vdwSwitch, const float rVdwSwitch, const float rInv, const float r2, sycl::private_ptr< float > fInvR, sycl::private_ptr< float > eLJ)
	Apply potential switch.

static float	Nbnxm::pmeCorrF (const float z2)
	Calculate analytical Ewald correction term.

template<typename T >
static T	Nbnxm::lerp (T d0, T d1, T t)
	Linear interpolation using exactly two FMA operations. More...

static float	Nbnxm::interpolateCoulombForceR (const sycl::global_ptr< const float > a_coulombTab, const float coulombTabScale, const float r)
	Interpolate Ewald coulomb force correction using the F*r table.

static void	Nbnxm::reduceForceJShuffle (Float3 f, const sycl::nd_item< 3 > itemIdx, const int tidxi, const int aidx, sycl::global_ptr< Float3 > a_f)
	Reduce c_clSize j-force components using shifts and atomically accumulate into a_f. More...

template<int subGroupSize, int groupSize>
static float	Nbnxm::groupReduce (const sycl::nd_item< 3 > itemIdx, const unsigned int tidxi, sycl::local_ptr< float > sm_buf, float valueToReduce)
	Do workgroup-level reduction of a single `float`. More...

static void	Nbnxm::reduceForceJGeneric (sycl::local_ptr< float > sm_buf, Float3 f, const sycl::nd_item< 3 > itemIdx, const int tidxi, const int tidxj, const int aidx, sycl::global_ptr< Float3 > a_f)
	Reduce c_clSize j-force components using local memory and atomically accumulate into a_f. More...

template<bool useShuffleReduction>
static void	Nbnxm::reduceForceJ (sycl::local_ptr< float > sm_buf, Float3 f, const sycl::nd_item< 3 > itemIdx, const int tidxi, const int tidxj, const int aidx, sycl::global_ptr< Float3 > a_f)
	Reduce c_clSize j-force components using either shifts or local memory and atomically accumulate into a_f.

template<typename FCiBufferWrapperX , typename FCiBufferWrapperY , typename FCiBufferWrapperZ >
static void	Nbnxm::reduceForceIAndFShiftGeneric (sycl::local_ptr< float > sm_buf, const FCiBufferWrapperX &fCiBufX, const FCiBufferWrapperY &fCiBufY, const FCiBufferWrapperZ &fCiBufZ, const bool calcFShift, const sycl::nd_item< 3 > itemIdx, const int tidxi, const int tidxj, const int sci, const int shift, sycl::global_ptr< Float3 > a_f, sycl::global_ptr< Float3 > a_fShift)
	Local memory-based i-force reduction. More...

template<int numShuffleReductionSteps, typename FCiBufferWrapperX , typename FCiBufferWrapperY , typename FCiBufferWrapperZ >
static std::enable_if_t < numShuffleReductionSteps!=1, void >	Nbnxm::reduceForceIAndFShiftShuffles (const FCiBufferWrapperX &fCiBufX, const FCiBufferWrapperY &fCiBufY, const FCiBufferWrapperZ &fCiBufZ, const bool calcFShift, const sycl::nd_item< 3 > itemIdx, const int tidxi, const int tidxj, const int sci, const int shift, sycl::global_ptr< Float3 > a_f, sycl::global_ptr< Float3 > a_fShift)
	Shuffle-based i-force reduction. More...

template<int numShuffleReductionSteps, typename FCiBufferWrapperX , typename FCiBufferWrapperY , typename FCiBufferWrapperZ >
static std::enable_if_t < numShuffleReductionSteps==1, void >	Nbnxm::reduceForceIAndFShiftShuffles (const FCiBufferWrapperX &fCiBufX, const FCiBufferWrapperY &fCiBufY, const FCiBufferWrapperZ &fCiBufZ, const bool calcFShift, const sycl::nd_item< 3 > itemIdx, const int tidxi, const int tidxj, const int sci, const int shift, sycl::global_ptr< Float3 > a_f, sycl::global_ptr< Float3 > a_fShift)
	`reduceForceIAndFShiftShuffles` specialization for single-step reduction (e.g., Intel iGPUs). More...

template<bool useShuffleReduction, int subGroupSize, typename FCiBufferWrapperX , typename FCiBufferWrapperY , typename FCiBufferWrapperZ >
static void	Nbnxm::reduceForceIAndFShift (sycl::local_ptr< float > sm_buf, const FCiBufferWrapperX &fCiBufX, const FCiBufferWrapperY &fCiBufY, const FCiBufferWrapperZ &fCiBufZ, const bool calcFShift, const sycl::nd_item< 3 > itemIdx, const int tidxi, const int tidxj, const int sci, const int shift, sycl::global_ptr< Float3 > a_f, sycl::global_ptr< Float3 > a_fShift)
	Final i-force reduction. More...

template<int subGroupSize, bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
static auto	Nbnxm::nbnxmKernel (sycl::handler &cgh, const Float4 __restrict__ gm_xq, Float3 __restrict__ gm_f, const Float3 __restrict__ gm_shiftVec, Float3 __restrict__ gm_fShift, float __restrict__ gm_energyElec, float __restrict__ gm_energyVdw, nbnxn_cj_packed_t __restrict__ gm_plistCJPacked, const nbnxn_sci_t __restrict__ gm_plistSci, const nbnxn_excl_t __restrict__ gm_plistExcl, const Float2 __restrict__ gm_ljComb, const int __restrict__ gm_atomTypes, const Float2 __restrict__ gm_nbfp, const Float2 __restrict__ gm_nbfpComb, const float __restrict__ gm_coulombTab, const int numTypes, const float rCoulombSq, const float rVdwSq, const float twoKRf, const float ewaldBeta, const float rlistOuterSq, const float ewaldShift, const float epsFac, const float ewaldCoeffLJ_2, const float cRF, const shift_consts_t dispersionShift, const shift_consts_t repulsionShift, const switch_consts_t vdwSwitch, const float rVdwSwitch, const float ljEwaldShift, const float coulombTabScale, const bool calcShift)
	Main kernel for NBNXM. More...

template<int subGroupSize, bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType, class... Args>
static void	Nbnxm::launchNbnxmKernel (const DeviceStream &deviceStream, const int numSci, Args &&...args)
	NBNXM kernel launch code.

template<int subGroupSize, bool doPruneNBL, bool doCalcEnergies, class... Args>
void	Nbnxm::chooseAndLaunchNbnxmKernel (enum ElecType elecType, enum VdwType vdwType, Args &&...args)
	Select templated kernel and launch it.

template<int subGroupSize, bool doPruneNBL, bool doCalcEnergies>
void	Nbnxm::launchNbnxmKernelHelper (NbnxmGpu *nb, const gmx::StepWorkload &stepWork, const InteractionLocality iloc)

Variables
constexpr bool	Nbnxm::c_avoidFloatingPointAtomics = (c_clSize == 4)
	Should we avoid FP atomics to the same location from the same work-group? More...


template<enum VdwType vdwType>
constexpr bool	Nbnxm::ljComb = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwComb
	Templated constants to shorten kernel function declaration.

template<enum ElecType elecType>
constexpr bool	Nbnxm::elecEwald = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwald

template<enum ElecType elecType>
constexpr bool	Nbnxm::elecEwaldTab = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTab

template<enum VdwType vdwType>
constexpr bool	Nbnxm::ljEwald = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwEwald

Macro Definition Documentation

#define GMX_NBNXM_ENABLE_PACKED_FLOAT3 1

Macro to control the enablement of manually-packed Float3 structure.

If enabled (default), the explicit packed math will be used on devices where it is known to be beneficial (currently, AMD MI250X / gfx90a). If disabled, packed math will never be explicitly used.

This only controls the use of AmdPackedFloat3 datastructure, not the layout of fCi buffer.

See issue #4854

Description

Classes

Macros

Typedefs

Functions

Variables

Macro Definition Documentation