GROMACS 2021-sycl
Nbnxm Namespace Reference

Description

Namespace for non-bonded kernels.

Enumerations

enum  BenchMarkKernels : int {
  SimdAuto, SimdNo, Simd4XM, Simd2XMM,
  Count
}
 Enum for selecting the SIMD kernel type for benchmarks.
 
enum  BenchMarkCombRule : int { RuleGeom, RuleLB, RuleNone, Count }
 Enum for selecting the combination rule for kernel benchmarks.
 
enum  BenchMarkCoulomb : int { Pme, ReactionField, Count }
 Enum for selecting the Coulomb type for kernel benchmarks.
 
enum  ElecType : int {
  ElecType::Cut, ElecType::RF, ElecType::EwaldTab, ElecType::EwaldTabTwin,
  ElecType::EwaldAna, ElecType::EwaldAnaTwin, ElecType::Count
}
 Nbnxm electrostatic GPU kernel flavors. More...
 
enum  VdwType : int {
  VdwType::Cut, VdwType::CutCombGeom, VdwType::CutCombLB, VdwType::FSwitch,
  VdwType::PSwitch, VdwType::EwaldGeom, VdwType::EwaldLB, VdwType::Count
}
 Nbnxm VdW GPU kernel flavors. More...
 
enum  KernelType : int {
  NotSet = 0, Cpu4x4_PlainC, Cpu4xN_Simd_4xN, Cpu4xN_Simd_2xNN,
  Gpu8x8x8, Cpu8x8x8_PlainC, Count
}
 Nonbonded NxN kernel types: plain C, CPU SIMD, GPU, GPU emulation.
 
enum  EwaldExclusionType : int { NotSet = 0, Table, Analytical, DecidedByGpuModule }
 Ewald exclusion types.
 
enum  NonbondedResource : int { Cpu, Gpu, EmulateGpu }
 Resources that can be used to execute non-bonded kernels on.
 

Functions

static std::optional< std::string > checkKernelSetup (const KernelBenchOptions &options)
 Checks the kernel setup. More...
 
static KernelType translateBenchmarkEnum (const BenchMarkKernels &kernel)
 Helper to translate between the different enumeration values.
 
static KernelSetup getKernelSetup (const KernelBenchOptions &options)
 Returns the kernel setup.
 
static interaction_const_t setupInteractionConst (const KernelBenchOptions &options)
 Return an interaction constants struct with members used in the benchmark set appropriately.
 
static std::unique_ptr< nonbonded_verlet_t > setupNbnxmForBenchInstance (const KernelBenchOptions &options, const gmx::BenchmarkSystem &system)
 Sets up and returns a Nbnxm object for the given benchmark options and system.
 
static void expandSimdOptionAndPushBack (const KernelBenchOptions &options, std::vector< KernelBenchOptions > *optionsList)
 Add the options instance to the list for all requested kernel SIMD types.
 
static void setupAndRunInstance (const gmx::BenchmarkSystem &system, const KernelBenchOptions &options, const bool doWarmup)
 Sets up and runs the requested benchmark instance and prints the results.
 
void bench (int sizeFactor, const KernelBenchOptions &options)
 Sets up and runs one or more Nbnxm kernel benchmarks. More...
 
void cuda_set_cacheconfig ()
 Set up the cache configuration for the non-bonded kernels.
 
static void validateGpuAtomLocality (const AtomLocality atomLocality)
 Check that atom locality values are valid for the GPU module. More...
 
static InteractionLocality gpuAtomToInteractionLocality (const AtomLocality atomLocality)
 Convert atom locality to interaction locality. More...
 
void setupGpuShortRangeWork (NbnxmGpu *nb, const gmx::GpuBonded *gpuBonded, gmx::InteractionLocality iLocality)
 Set up internal flags that indicate what type of short-range work there is. More...
 
static bool haveGpuShortRangeWork (const NbnxmGpu &nb, const gmx::InteractionLocality iLocality)
 Returns true if there is GPU short-range work for the given interaction locality. More...
 
bool haveGpuShortRangeWork (const NbnxmGpu *nb, gmx::AtomLocality aLocality)
 Returns true if there is GPU short-range work for the given atom locality. More...
 
template<typename AtomDataT >
static void getGpuAtomRange (const AtomDataT *atomData, const AtomLocality atomLocality, int *atomRangeBegin, int *atomRangeLen)
 Calculate atom range and return start index and length. More...
 
template<typename GpuTimers >
static void countPruneKernelTime (GpuTimers *timers, gmx_wallclock_gpu_nbnxn_t *timings, const InteractionLocality iloc)
 Count pruning kernel time if either kernel has been triggered. More...
 
static void gpu_reduce_staged_outputs (const nb_staging_t &nbst, const InteractionLocality iLocality, const bool reduceEnergies, const bool reduceFshift, real *e_lj, real *e_el, rvec *fshift)
 Reduce data staged internally in the nbnxn module. More...
 
template<typename GpuTimers , typename GpuPairlist >
static void gpu_accumulate_timings (gmx_wallclock_gpu_nbnxn_t *timings, GpuTimers *timers, const GpuPairlist *plist, AtomLocality atomLocality, const gmx::StepWorkload &stepWork, bool doTiming)
 Do the per-step timing accounting of the nonbonded tasks. More...
 
bool gpu_try_finish_task (NbnxmGpu *nb, const gmx::StepWorkload &stepWork, const AtomLocality aloc, real *e_lj, real *e_el, gmx::ArrayRef< gmx::RVec > shiftForces, GpuTaskCompletion completionKind, gmx_wallcycle *wcycle)
 Attempts to complete nonbonded GPU task. More...
 
float gpu_wait_finish_task (NbnxmGpu *nb, const gmx::StepWorkload &stepWork, AtomLocality aloc, real *e_lj, real *e_el, gmx::ArrayRef< gmx::RVec > shiftForces, gmx_wallcycle *wcycle)
 Wait for the asynchronously launched nonbonded tasks and data transfers to finish. More...
 
static bool canSkipNonbondedWork (const NbnxmGpu &nb, InteractionLocality iloc)
 An early return condition for empty NB GPU workloads. More...
 
NbnxmGpu * gpu_init (const gmx::DeviceStreamManager &deviceStreamManager, const interaction_const_t *ic, const PairlistParams &listParams, const nbnxn_atomdata_t *nbat, bool bLocalAndNonlocal)
 Initializes the data structures related to GPU nonbonded calculations. More...
 
void gpu_init_pairlist (NbnxmGpu *nb, const struct NbnxnPairlistGpu *h_nblist, gmx::InteractionLocality iloc)
 Initializes pair-list data for GPU, called at every pair search step. More...
 
void gpu_init_atomdata (NbnxmGpu *nb, const nbnxn_atomdata_t *nbat)
 Initializes atom-data on the GPU, called at every pair search step. More...
 
void gpu_pme_loadbal_update_param (const struct nonbonded_verlet_t *nbv, const interaction_const_t *ic)
 Re-generates the GPU Ewald force table, resets rlist, and updates the electrostatics type, switching to twin cut-off (or back) if needed.
 
void gpu_upload_shiftvec (NbnxmGpu *nb, const nbnxn_atomdata_t *nbatom)
 Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). More...
 
void gpu_clear_outputs (NbnxmGpu *nb, bool computeVirial)
 Clears GPU outputs: nonbonded force, shift force and energy. More...
 
void gpu_free (NbnxmGpu *nb)
 Frees all GPU resources used for the nonbonded calculations. More...
 
struct gmx_wallclock_gpu_nbnxn_t * gpu_get_timings (NbnxmGpu *nb)
 Returns the GPU timings structure or NULL if GPU is not used or timing is off. More...
 
void gpu_reset_timings (struct nonbonded_verlet_t *nbv)
 Resets nonbonded GPU timings. More...
 
int gpu_min_ci_balanced (NbnxmGpu *nb)
 Calculates the minimum size of proximity lists to improve SM load balance with GPU non-bonded kernels. More...
 
bool gpu_is_kernel_ewald_analytical (const NbnxmGpu *nb)
 Returns if analytical Ewald GPU kernels are used. More...
 
enum ElecType nbnxmGpuPickElectrostaticsKernelType (const interaction_const_t *ic, const DeviceInformation &deviceInfo)
 Return the enum value of electrostatics kernel type for given interaction parameters ic. More...
 
enum VdwType nbnxmGpuPickVdwKernelType (const interaction_const_t *ic, int combRule)
 Return the enum value of VdW kernel type for given ic and combRule. More...
 
const DeviceStream * gpu_get_command_stream (NbnxmGpu *nb, gmx::InteractionLocality iloc)
 Returns an opaque pointer to the GPU command stream. Note: CUDA only.
 
void * gpu_get_xq (NbnxmGpu *nb)
 Returns an opaque pointer to the GPU coordinate+charge array. Note: CUDA only.
 
DeviceBuffer< gmx::RVec > gpu_get_f (NbnxmGpu *nb)
 Returns an opaque pointer to the GPU force array. Note: CUDA only.
 
DeviceBuffer< gmx::RVec > gpu_get_fshift (NbnxmGpu *nb)
 Returns an opaque pointer to the GPU shift force array. Note: CUDA only.
 
static real gridAtomDensity (int numAtoms, const rvec lowerCorner, const rvec upperCorner)
 Returns the atom density (> 0) of a rectangular grid.
 
static void sort_atoms (int dim, gmx_bool Backwards, int dd_zone, bool relevantAtomsAreWithinGridBounds, int *a, int n, gmx::ArrayRef< const gmx::RVec > x, real h0, real invh, int n_per_h, gmx::ArrayRef< int > sort)
 Sorts particle index a on coordinates x along dim. More...
 
static float R2F_D (const float x)
 Returns x.
 
static float R2F_U (const float x)
 Returns x.
 
static void calc_bounding_box (int na, int stride, const real *x, BoundingBox *bb)
 Computes the bounding box for na coordinates in order x,y,z, bb order xyz0.
 
static void calc_bounding_box_x_x4 (int na, const real *x, BoundingBox *bb)
 Computes the bounding box for na packed coordinates, bb order xyz0.
 
static void calc_bounding_box_x_x8 (int na, const real *x, BoundingBox *bb)
 Computes the bounding box for na coordinates, bb order xyz0.
 
static void calc_bounding_box_x_x4_halves (int na, const real *x, BoundingBox *bb, BoundingBox *bbj)
 Computes the bounding box for na packed coordinates, bb order xyz0.
 
static void combine_bounding_box_pairs (const Grid &grid, gmx::ArrayRef< const BoundingBox > bb, gmx::ArrayRef< BoundingBox > bbj)
 Combines pairs of consecutive bounding boxes.
 
static void print_bbsizes_simple (FILE *fp, const Grid &grid)
 Prints the average bb size, used for debug output.
 
static void print_bbsizes_supersub (FILE *fp, const Grid &grid)
 Prints the average bb size, used for debug output.
 
static void sort_cluster_on_flag (int numAtomsInCluster, int atomStart, int atomEnd, const int *atinfo, gmx::ArrayRef< int > order, int *flags)
 Set non-bonded interaction flags for the current cluster. More...
 
static void setCellAndAtomCount (gmx::ArrayRef< int > cell, int cellIndex, gmx::ArrayRef< int > cxy_na, int atomIndex)
 Sets the cell index in the cell array for atom atomIndex and increments the atom count for the grid column.
 
static void resizeForNumberOfCells (const int numNbnxnAtoms, const int numAtomsMoved, GridSetData *gridSetData, nbnxn_atomdata_t *nbat)
 Resizes grid and atom data which depend on the number of cells.
 
static int numGrids (const GridSet::DomainSetup &domainSetup)
 Returns the number of search grids.
 
const char * lookup_kernel_name (Nbnxm::KernelType kernelType)
 Return a string identifying the kernel type. More...
 
std::unique_ptr< nonbonded_verlet_t > init_nb_verlet (const gmx::MDLogger &mdlog, const t_inputrec *ir, const t_forcerec *fr, const t_commrec *cr, const gmx_hw_info_t &hardwareInfo, bool useGpuForNonbonded, const gmx::DeviceStreamManager *deviceStreamManager, const gmx_mtop_t *mtop, matrix box, gmx_wallcycle *wcycle)
 Creates an Nbnxm object.
 
static bool kernelTypeUsesSimplePairlist (const KernelType kernelType)
 Returns whether the pair-list corresponding to nb_kernel_type is simple.
 
static bool kernelTypeIsSimd (const KernelType kernelType)
 Returns whether a SIMD kernel is in use.
 
static bool useLjCombRule (const enum VdwType vdwType)
 Returns true if LJ combination rules are used in the non-bonded kernels. More...
 
void gpu_copy_xq_to_gpu (NbnxmGpu *nb, const struct nbnxn_atomdata_t *nbdata, gmx::AtomLocality aloc)
 Launch asynchronously the xq buffer host to device copy. More...
 
void gpu_launch_kernel (NbnxmGpu *nb, const gmx::StepWorkload &stepWork, gmx::InteractionLocality iloc)
 Launch asynchronously the nonbonded force calculations. More...
 
void gpu_launch_kernel_pruneonly (NbnxmGpu *nb, gmx::InteractionLocality iloc, int numParts)
 Launch asynchronously the nonbonded prune-only kernel. More...
 
void gpu_launch_cpyback (NbnxmGpu *nb, nbnxn_atomdata_t *nbatom, const gmx::StepWorkload &stepWork, gmx::AtomLocality aloc)
 Launch asynchronously the download of short-range forces from the GPU (and energies/shift forces if required).
 
bool gpu_try_finish_task (NbnxmGpu *nb, const gmx::StepWorkload &stepWork, gmx::AtomLocality aloc, real *e_lj, real *e_el, gmx::ArrayRef< gmx::RVec > shiftForces, GpuTaskCompletion completionKind, gmx_wallcycle *wcycle)
 Attempts to complete nonbonded GPU task. More...
 
float gpu_wait_finish_task (NbnxmGpu *nb, const gmx::StepWorkload &stepWork, gmx::AtomLocality aloc, real *e_lj, real *e_el, gmx::ArrayRef< gmx::RVec > shiftForces, gmx_wallcycle *wcycle)
 Completes the nonbonded GPU task blocking until GPU tasks and data transfers to finish. More...
 
void nbnxn_gpu_init_x_to_nbat_x (const Nbnxm::GridSet &gridSet, NbnxmGpu *gpu_nbv)
 Initialization for X buffer operations on GPU. Called on the NS step and performs (re-)allocations and memory copies.
 
void nbnxn_gpu_x_to_nbat_x (const Nbnxm::Grid &grid, bool setFillerCoords, NbnxmGpu *gpu_nbv, DeviceBuffer< gmx::RVec > d_x, GpuEventSynchronizer *xReadyOnDevice, gmx::AtomLocality locality, int gridId, int numColumnsMax)
 X buffer operations on GPU: performs conversion from rvec to nb format. More...
 
void nbnxnInsertNonlocalGpuDependency (const NbnxmGpu *nb, gmx::InteractionLocality interactionLocality)
 Sync the nonlocal stream with dependent tasks in the local queue. More...
 
void nbnxn_wait_x_on_device (NbnxmGpu *nb)
 Sync the CPU thread on the coordinate copy to the device. More...
 
void * getGpuForces (NbnxmGpu *nb)
 Get the pointer to the GPU nonbonded force buffer. More...
 
void init_ewald_coulomb_force_table (const EwaldCorrectionTables &tables, NBParamGpu *nbp, const DeviceContext &deviceContext)
 Tabulates the Ewald Coulomb force and initializes the size/scale and the table GPU array. More...
 
void printEnvironmentVariableDeprecationMessage (bool isEnvironmentVariableSet, const std::string &environmentVariableSuffix)
 
enum ElecType nbnxn_gpu_pick_ewald_kernel_type (const interaction_const_t &ic, const DeviceInformation &deviceInfo)
 
void set_cutoff_parameters (NBParamGpu *nbp, const interaction_const_t *ic, const PairlistParams &listParams)
 Copies all parameters related to the cut-off from ic to nbp.
 
void gpu_pme_loadbal_update_param (const nonbonded_verlet_t *nbv, const interaction_const_t *ic)
 
void init_plist (gpu_plist *pl)
 Initializes the pair list data structure.
 
void init_timings (gmx_wallclock_gpu_nbnxn_t *t)
 Initializes the timings data structure.
 
void gpu_init_pairlist (NbnxmGpu *nb, const NbnxnPairlistGpu *h_plist, const InteractionLocality iloc)
 This function is documented in the header file.
 
enum ElecType nbnxn_gpu_pick_ewald_kernel_type (const interaction_const_t gmx_unused &ic, const DeviceInformation &deviceInfo)
 Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off.
 
static gmx_bool nbnxn_simd_supported (const gmx::MDLogger &mdlog, const t_inputrec *ir)
 Returns whether CPU SIMD support exists for the given inputrec. More...
 
static KernelSetup pick_nbnxn_kernel_cpu (const t_inputrec *ir, const gmx_hw_info_t &hardwareInfo)
 Returns the most suitable CPU kernel type and Ewald handling.
 
static KernelSetup pick_nbnxn_kernel (const gmx::MDLogger &mdlog, gmx_bool use_simd_kernels, const gmx_hw_info_t &hardwareInfo, const NonbondedResource &nonbondedResource, const t_inputrec *ir)
 Returns the most suitable kernel type and Ewald handling.
 
static int getMinimumIlistCountForGpuBalancing (NbnxmGpu *nbnxmGpu)
 Gets and returns the minimum i-list count for balancing, based on the GPU used or the environment variable when set.
 
static void validate_global_work_size (const KernelLaunchConfig &config, int work_dim, const DeviceInformation *dinfo)
 Validates the input global work size parameter.
 
static cl_kernel selectPruneKernel (cl_kernel kernel_pruneonly[], bool firstPrunePass)
 Return a pointer to the prune kernel version to be executed at the current invocation. More...
 
static cl_kernel select_nbnxn_kernel (NbnxmGpu *nb, enum ElecType elecType, enum VdwType vdwType, bool bDoEne, bool bDoPrune)
 Return a pointer to the kernel version to be executed at the current step. OpenCL kernel objects are cached in nb. If the requested kernel is not found in the cache, it will be created and the cache will be updated.
 
static int calc_shmem_required_nonbonded (enum VdwType vdwType, bool bPrefetchLjParam)
 Calculates the amount of shared memory required by the nonbonded kernel in use.
 
static void fillin_ocl_structures (NBParamGpu *nbp, cl_nbparam_params_t *nbparams_params)
 Initializes data structures that are going to be sent to the OpenCL device. More...
 
static void sync_ocl_event (cl_command_queue stream, cl_event *ocl_event)
 Enqueues a wait for event completion. More...
 
void gpu_copy_xq_to_gpu (NbnxmGpu *nb, const nbnxn_atomdata_t *nbatom, const AtomLocality atomLocality)
 Launch asynchronously the xq buffer host to device copy.
 
void gpu_launch_kernel (NbnxmGpu *nb, const gmx::StepWorkload &stepWork, const Nbnxm::InteractionLocality iloc)
 Launch GPU kernel. More...
 
static int calc_shmem_required_prune (const int num_threads_z)
 Calculates the amount of shared memory required by the prune kernel. More...
 
void gpu_launch_kernel_pruneonly (NbnxmGpu *nb, const InteractionLocality iloc, const int numParts)
 Launch the pairlist prune-only kernel for the given locality. numParts tells in how many parts, i.e. calls, the list will be pruned.
 
void gpu_launch_cpyback (NbnxmGpu *nb, struct nbnxn_atomdata_t *nbatom, const gmx::StepWorkload &stepWork, const AtomLocality aloc)
 Launch asynchronously the download of nonbonded forces from the GPU (and energies/shift forces if required).
 
static void init_atomdata_first (cl_atomdata_t *ad, int ntypes, const DeviceContext &deviceContext)
 Initializes the atomdata structure the first time; it only gets filled at pair search.
 
static void init_nbparam (NBParamGpu *nbp, const interaction_const_t *ic, const PairlistParams &listParams, const nbnxn_atomdata_t::Params &nbatParams, const DeviceContext &deviceContext)
 Initializes the nonbonded parameter data structure.
 
static cl_kernel nbnxn_gpu_create_kernel (NbnxmGpu *nb, const char *kernel_name)
 Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure.
 
static void nbnxn_ocl_clear_e_fshift (NbnxmGpu *nb)
 Clears nonbonded shift force output array and energy outputs on the GPU.
 
static void nbnxn_gpu_init_kernels (NbnxmGpu *nb)
 Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure.
 
static void nbnxn_ocl_init_const (cl_atomdata_t *atomData, NBParamGpu *nbParams, const interaction_const_t *ic, const PairlistParams &listParams, const nbnxn_atomdata_t::Params &nbatParams, const DeviceContext &deviceContext)
 Initializes simulation constant data. More...
 
static void nbnxn_ocl_clear_f (NbnxmGpu *nb, int natoms_clear)
 Clears the first natoms_clear elements of the GPU nonbonded force output array.
 
static void free_kernel (cl_kernel *kernel_ptr)
 Releases an OpenCL kernel pointer.
 
static void free_kernels (cl_kernel *kernels, int count)
 Releases a list of OpenCL kernel pointers.
 
static void freeGpuProgram (cl_program program)
 Free the OpenCL program. More...
 
static void initAtomdataFirst (NbnxmGpu *nb, int numTypes, const DeviceContext &deviceContext)
 Initialize atomdata first time; it only gets filled at pair-search.
 
static void initNbparam (NBParamGpu *nbp, const interaction_const_t &ic, const PairlistParams &listParams, const nbnxn_atomdata_t::Params &nbatParams, const DeviceContext &deviceContext)
 Initialize the nonbonded parameter data structure.
 
static void convertSigmaEpsilonToC6C12 (const float sigma, const float epsilon, cl::sycl::private_ptr< float > c6, cl::sycl::private_ptr< float > c12)
 
template<bool doCalcEnergies>
static void ljForceSwitch (const shift_consts_t dispersionShift, const shift_consts_t repulsionShift, const float rVdwSwitch, const float c6, const float c12, const float rInv, const float r2, cl::sycl::private_ptr< float > fInvR, cl::sycl::private_ptr< float > eLJ)
 
template<enum VdwType vdwType>
static float calculateLJEwaldC6Grid (const DeviceAccessor< float, mode::read > a_nbfpComb, const int typeI, const int typeJ)
 Fetch C6 grid contribution coefficients and return the product of these.
 
template<bool doCalcEnergies, enum VdwType vdwType>
static void ljEwaldComb (const DeviceAccessor< float, mode::read > a_nbfpComb, const float sh_lj_ewald, const int typeI, const int typeJ, const float r2, const float r2Inv, const float lje_coeff2, const float lje_coeff6_6, const float int_bit, cl::sycl::private_ptr< float > fInvR, cl::sycl::private_ptr< float > eLJ)
 Calculate LJ-PME grid force contribution with geometric or LB combination rule.
 
template<bool doCalcEnergies>
static void ljPotentialSwitch (const switch_consts_t vdwSwitch, const float rVdwSwitch, const float rInv, const float r2, cl::sycl::private_ptr< float > fInvR, cl::sycl::private_ptr< float > eLJ)
 Apply potential switch.
 
static float pmeCorrF (const float z2)
 Calculate analytical Ewald correction term.
 
template<typename T >
static T lerp (T d0, T d1, T t)
 Linear interpolation using exactly two FMA operations. More...
 
static float interpolateCoulombForceR (const DeviceAccessor< float, mode::read > a_coulombTab, const float coulombTabScale, const float r)
 Interpolate Ewald coulomb force correction using the F*r table.
 
static void reduceForceJShuffle (float3 f, const cl::sycl::nd_item< 1 > itemIdx, const int tidxi, const int aidx, DeviceAccessor< float, mode::read_write > a_f)
 
static void reduceForceIAndFShift (cl::sycl::accessor< float, 1, mode::read_write, target::local > sm_buf, const float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster], const bool calcFShift, const cl::sycl::nd_item< 1 > itemIdx, const int tidxi, const int tidxj, const int sci, const int shift, DeviceAccessor< float, mode::read_write > a_f, DeviceAccessor< float, mode::read_write > a_fShift)
 Final i-force reduction. More...
 
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
auto nbnxmKernel (cl::sycl::handler &cgh, DeviceAccessor< float4, mode::read > a_xq, DeviceAccessor< float, mode::read_write > a_f, DeviceAccessor< float3, mode::read > a_shiftVec, DeviceAccessor< float, mode::read_write > a_fShift, OptionalAccessor< float, mode::read_write, doCalcEnergies > a_energyElec, OptionalAccessor< float, mode::read_write, doCalcEnergies > a_energyVdw, DeviceAccessor< nbnxn_cj4_t, doPruneNBL?mode::read_write:mode::read > a_plistCJ4, DeviceAccessor< nbnxn_sci_t, mode::read > a_plistSci, DeviceAccessor< nbnxn_excl_t, mode::read > a_plistExcl, OptionalAccessor< float2, mode::read, ljComb< vdwType >> a_ljComb, OptionalAccessor< int, mode::read,!ljComb< vdwType >> a_atomTypes, OptionalAccessor< float, mode::read,!ljComb< vdwType >> a_nbfp, OptionalAccessor< float, mode::read, ljEwald< vdwType >> a_nbfpComb, OptionalAccessor< float, mode::read, elecEwaldTab< elecType >> a_coulombTab, const int numTypes, const float rCoulombSq, const float rVdwSq, const float twoKRf, const float ewaldBeta, const float rlistOuterSq, const float ewaldShift, const float epsFac, const float ewaldCoeffLJ, const float cRF, const shift_consts_t dispersionShift, const shift_consts_t repulsionShift, const switch_consts_t vdwSwitch, const float rVdwSwitch, const float ljEwaldShift, const float coulombTabScale, const bool calcShift)
 Main kernel for NBNXM. More...
 
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType, class... Args>
cl::sycl::event launchNbnxmKernel (const DeviceStream &deviceStream, const int numSci, Args &&...args)
 
template<class... Args>
cl::sycl::event chooseAndLaunchNbnxmKernel (bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType, Args &&...args)
 
void launchNbnxmKernel (NbnxmGpu *nb, const gmx::StepWorkload &stepWork, const InteractionLocality iloc)
 
template<bool haveFreshList>
auto nbnxmKernelPruneOnly (cl::sycl::handler &cgh, DeviceAccessor< float4, mode::read > a_xq, DeviceAccessor< float3, mode::read > a_shiftVec, DeviceAccessor< nbnxn_cj4_t, mode::read_write > a_plistCJ4, DeviceAccessor< nbnxn_sci_t, mode::read > a_plistSci, DeviceAccessor< unsigned int, haveFreshList?mode::write:mode::read > a_plistIMask, const float rlistOuterSq, const float rlistInnerSq, const int numParts, const int part)
 Prune-only kernel for NBNXM. More...
 
template<bool haveFreshList, class... Args>
cl::sycl::event launchNbnxmKernelPruneOnly (const DeviceStream &deviceStream, const int numSciInPart, Args &&...args)
 
template<class... Args>
cl::sycl::event chooseAndLaunchNbnxmKernelPruneOnly (bool haveFreshList, Args &&...args)
 
void launchNbnxmKernelPruneOnly (NbnxmGpu *nb, const InteractionLocality iloc, const int numParts, const int part, const int numSciInPart)
 
static cl::sycl::range< 1 > flattenRange (cl::sycl::range< 3 > range3d)
 Convert 3D range to 1D.
 
static cl::sycl::nd_range< 1 > flattenNDRange (cl::sycl::nd_range< 3 > nd_range3d)
 Convert 3D nd_range to 1D.
 
template<int rangeX, int rangeY>
static cl::sycl::id< 3 > unflattenId (cl::sycl::id< 1 > id1d)
 Convert flattened 1D index to 3D.
 
template<cl::sycl::access::mode Mode, class IndexType >
static void atomicFetchAdd (DeviceAccessor< float, Mode > acc, const IndexType idx, const float val)
 Convenience wrapper to do atomic addition to a global buffer.
 
static float shuffleDown (float var, unsigned int delta, sycl_2020::sub_group sg)
 
static float shuffleUp (float var, unsigned int delta, sycl_2020::sub_group sg)
 

Variables

static constexpr int c_numBoundingBoxBounds1D = 2
 The number of bounds along one dimension of a bounding box.
 
static constexpr int c_sortGridRatio = 4
 Ratio of grid cells to atoms.
 
static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1
 Maximum ratio of holes used; in the worst case all particles end up in the last hole and we need as many extra holes as atoms at the end.
 
constexpr int c_numElecTypes = static_cast<int>(ElecType::Count)
 Number of possible ElecType values.
 
constexpr int c_numVdwTypes = static_cast<int>(VdwType::Count)
 Number of possible VdwType values.
 
static constexpr gmx::EnumerationArray< KernelType, int > IClusterSizePerKernelType
 The nbnxn i-cluster size in atoms for each nbnxn kernel type. More...
 
static constexpr gmx::EnumerationArray< KernelType, int > JClusterSizePerKernelType
 The nbnxn j-cluster size in atoms for each nbnxn kernel type. More...
 
static const char * nb_kfunc_noener_noprune_ptr [c_numElecTypes][c_numVdwTypes]
 Force-only kernel function names.
 
static const char * nb_kfunc_ener_noprune_ptr [c_numElecTypes][c_numVdwTypes]
 Force + energy kernel function pointers.
 
static const char * nb_kfunc_noener_prune_ptr [c_numElecTypes][c_numVdwTypes]
 Force + pruning kernel function pointers.
 
static const char * nb_kfunc_ener_prune_ptr [c_numElecTypes][c_numVdwTypes]
 Force + energy + pruning kernel function pointers.
 
static unsigned int gpu_min_ci_balanced_factor = 50
 This parameter should be determined heuristically from the kernel execution times. More...
 
static constexpr int c_syclPruneKernelJ4Concurrency = 4
 Default for the prune kernel's j4 processing concurrency. More...
 
static constexpr int c_clSize = c_nbnxnGpuClusterSize
 Convenience constants.
 
template<enum VdwType vdwType>
constexpr bool ljComb = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwComb
 Templated constants to shorten kernel function declaration.
 
template<enum ElecType elecType>
constexpr bool vdwCutoffCheck = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTwin
 
template<enum ElecType elecType>
constexpr bool elecEwald = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwald
 
template<enum ElecType elecType>
constexpr bool elecEwaldTab = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTab
 
template<enum VdwType vdwType>
constexpr bool ljEwald = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwEwald
 

Enumeration Type Documentation

enum class Nbnxm::ElecType : int

Nbnxm electrostatic GPU kernel flavors.

Types of electrostatics implementations available in the GPU non-bonded force kernels. These represent both the electrostatics types implemented by the kernels (cut-off, RF, and Ewald - a subset of what's defined in enums.h) as well as encode implementation details: analytical/tabulated and single or twin cut-off (for Ewald kernels). Note that the cut-off and RF kernels have only an analytical flavor and, unlike in the CPU kernels, the tabulated kernels are at the moment Ewald-only.

The row-order of pointers to different electrostatic kernels defined in nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer table should match the order of enumerated types below.

Enumerator
Cut           Plain cut-off.
RF            Reaction field.
EwaldTab      Tabulated Ewald with single cut-off.
EwaldTabTwin  Tabulated Ewald with twin cut-off.
EwaldAna      Analytical Ewald with single cut-off.
EwaldAnaTwin  Analytical Ewald with twin cut-off.
Count         Number of valid values.

enum class Nbnxm::VdwType : int

Nbnxm VdW GPU kernel flavors.

The enumerated values correspond to the LJ implementations in the GPU non-bonded kernels.

The column-order of pointers to different electrostatic kernels defined in nbnxn_cuda_ocl.cpp/.cu by the nb_*_kfunc_ptr function pointer table should match the order of enumerated types below.

Enumerator
Cut          Plain cut-off.
CutCombGeom  Cut-off with geometric combination rules.
CutCombLB    Cut-off with Lorentz-Berthelot combination rules.
FSwitch      Smooth force switch.
PSwitch      Smooth potential switch.
EwaldGeom    Ewald with geometric combination rules.
EwaldLB      Ewald with Lorentz-Berthelot combination rules.
Count        Number of valid values.

Function Documentation

void Nbnxm::bench ( int  sizeFactor,
const KernelBenchOptions &  options 
)

Sets up and runs one or more Nbnxm kernel benchmarks.

The simulated system is a box of 1000 SPC/E water molecules scaled by the factor sizeFactor, which has to be a power of 2. One or more benchmarks are run, as specified by options. Benchmark settings and timings are printed to stdout.

Parameters
[in]  sizeFactor  How much the system size should be increased.
[in]  options     How the benchmark will be run.
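For illustration, a minimal benchmark driver might look as follows. This is a sketch: the header path and the default-constructed options are assumptions, not the exact public interface.

    #include "gromacs/nbnxm/benchmark/bench_setup.h"

    int main()
    {
        Nbnxm::KernelBenchOptions options; // default benchmark settings (assumed)
        // sizeFactor = 4 benchmarks a box of 4 * 1000 = 4000 SPC/E waters;
        // the factor has to be a power of 2.
        Nbnxm::bench(4, options);
        return 0;
    }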
static int Nbnxm::calc_shmem_required_prune ( const int  num_threads_z)
inline static

Calculates the amount of shared memory required by the prune kernel.

Note that for the sake of simplicity we use the CUDA terminology "shared memory" for OpenCL local memory.

Parameters
[in]  num_threads_z  cj4 concurrency, equal to the number of threads/work items in the third dimension.
Returns
The amount of local memory in bytes required by the pruning kernel.
static bool Nbnxm::canSkipNonbondedWork ( const NbnxmGpu &  nb,
InteractionLocality  iloc 
)
inline static

An early return condition for empty NB GPU workloads.

This is currently used for non-local kernels/transfers only. Skipping the local kernel is more complicated, since the local part of the force array also depends on the non-local kernel. The skip of the local kernel is taken care of separately.

static std::optional<std::string> Nbnxm::checkKernelSetup ( const KernelBenchOptions &  options)
static

Checks the kernel setup.

Returns an error string when the kernel is not available.
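A short sketch of consuming the std::optional return value:

    // If the setup is unavailable, the optional carries an error message.
    if (const std::optional<std::string> error = checkKernelSetup(options))
    {
        fprintf(stderr, "Kernel setup check failed: %s\n", error->c_str());
        return;
    }
    // An empty optional means the requested kernel setup is available.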

template<typename GpuTimers >
static void Nbnxm::countPruneKernelTime ( GpuTimers *  timers,
gmx_wallclock_gpu_nbnxn_t *  timings,
const InteractionLocality  iloc 
)
static

Count pruning kernel time if either kernel has been triggered.

We do the accounting for either of the two pruning kernel flavors:

  • 1st pass prune: ran during the current step (prior to the force kernel);
  • rolling prune: ran at the end of the previous step (prior to the current step H2D xq);

Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen after calling this function.

Parameters
[in]      timers   Structs with GPU timer objects
[in,out]  timings  GPU task timing data
[in]      iloc     Interaction locality
static void Nbnxm::fillin_ocl_structures ( NBParamGpu *  nbp,
cl_nbparam_params_t *  nbparams_params 
)
static

Initializes data structures that are going to be sent to the OpenCL device.

The device can't use the same data structures as the host for two main reasons:

  • OpenCL restrictions (pointers are not accepted inside data structures)
  • some host side fields are not needed for the OpenCL kernels.

This function is called before the launch of both nbnxn and prune kernels.

static void Nbnxm::freeGpuProgram ( cl_program  program)
static

Free the OpenCL program.

The function releases the OpenCL program associated with the device that the calling PP rank is running on.

Parameters
[in]  program  OpenCL program to release.
template<typename AtomDataT >
static void Nbnxm::getGpuAtomRange ( const AtomDataT *  atomData,
const AtomLocality  atomLocality,
int *  atomRangeBegin,
int *  atomRangeLen 
)
inline static

Calculate atom range and return start index and length.

Parameters
[in]   atomData        Atom descriptor data structure
[in]   atomLocality    Atom locality specifier
[out]  atomRangeBegin  Starting index of the atom range in the atom data array.
[out]  atomRangeLen    Atom range length in the atom data array.
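A sketch of the range computation consistent with the parameters above, assuming the atom-data type exposes numAtomsLocal() and numAtoms() (hypothetical accessors; local atoms precede non-local ones in the array):

    template<typename AtomDataT>
    static void getGpuAtomRangeSketch(const AtomDataT*   atomData,
                                      const AtomLocality atomLocality,
                                      int*               atomRangeBegin,
                                      int*               atomRangeLen)
    {
        validateGpuAtomLocality(atomLocality);
        if (atomLocality == AtomLocality::Local)
        {
            *atomRangeBegin = 0;
            *atomRangeLen   = atomData->numAtomsLocal();
        }
        else // AtomLocality::NonLocal
        {
            *atomRangeBegin = atomData->numAtomsLocal();
            *atomRangeLen   = atomData->numAtoms() - atomData->numAtomsLocal();
        }
    }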
void* Nbnxm::getGpuForces ( NbnxmGpu *  nb)

Get the pointer to the GPU nonbonded force buffer.

Parameters
[in]  nb  The nonbonded data GPU structure
Returns
A pointer to the force buffer in GPU memory
template<typename GpuTimers , typename GpuPairlist >
static void Nbnxm::gpu_accumulate_timings ( gmx_wallclock_gpu_nbnxn_t *  timings,
GpuTimers *  timers,
const GpuPairlist *  plist,
AtomLocality  atomLocality,
const gmx::StepWorkload stepWork,
bool  doTiming 
)
inline static

Do the per-step timing accounting of the nonbonded tasks.

Does timing accumulation and call-count increments for the nonbonded kernels. Note that this function should be called after the current step's nonbonded tasks have completed, with the exception of the rolling pruning kernels that are accounted for during the following step.

NOTE: if timing with multiple GPUs (streams) becomes possible, the counters could end up being inconsistent due to not being incremented on some of the nodes when this is skipped on empty local domains!

Template Parameters
GpuTimers    GPU timers type
GpuPairlist  Pair list type
Parameters
[out]  timings       Pointer to the NB GPU timings data
[in]   timers        Pointer to GPU timers data
[in]   plist         Pointer to the pair list data
[in]   atomLocality  Atom locality specifier
[in]   stepWork      Force schedule flags
[in]   doTiming      True if timing is enabled.
void Nbnxm::gpu_clear_outputs ( NbnxmGpu *  nb,
bool  computeVirial 
)

Clears GPU outputs: nonbonded force, shift force and energy.

This function is documented in the header file.

void Nbnxm::gpu_copy_xq_to_gpu ( NbnxmGpu *  nb,
const struct nbnxn_atomdata_t *  nbdata,
gmx::AtomLocality  aloc 
)

Launch asynchronously the xq buffer host to device copy.

The nonlocal copy is skipped if there is no dependent work to do, neither non-local nonbonded interactions nor bonded GPU work.

Parameters
[in]  nb      GPU nonbonded data.
[in]  nbdata  Host-side atom data structure.
[in]  aloc    Atom locality flag.
void Nbnxm::gpu_free ( NbnxmGpu *  nb)

Frees all GPU resources used for the nonbonded calculations.

This function is documented in the header file.

gmx_wallclock_gpu_nbnxn_t * Nbnxm::gpu_get_timings ( NbnxmGpu *  nb)

Returns the GPU timings structure or NULL if GPU is not used or timing is off.

This function is documented in the header file.

NbnxmGpu * Nbnxm::gpu_init ( const gmx::DeviceStreamManager deviceStreamManager,
const interaction_const_t *  ic,
const PairlistParams &  listParams,
const nbnxn_atomdata_t *  nbat,
bool  bLocalAndNonlocal 
)

Initializes the data structures related to GPU nonbonded calculations.

This function is documented in the header file.

void Nbnxm::gpu_init_atomdata ( NbnxmGpu *  nb,
const nbnxn_atomdata_t *  nbat 
)

Initializes atom-data on the GPU, called at every pair search step.

This function is documented in the header file.

void Nbnxm::gpu_init_pairlist ( NbnxmGpu *  nb,
const struct NbnxnPairlistGpu *  h_nblist,
gmx::InteractionLocality  iloc 
)

Initializes pair-list data for GPU, called at every pair search step.

bool Nbnxm::gpu_is_kernel_ewald_analytical ( const NbnxmGpu *  nb)

Returns if analytical Ewald GPU kernels are used.

void Nbnxm::gpu_launch_kernel ( NbnxmGpu *  nb,
const gmx::StepWorkload stepWork,
gmx::InteractionLocality  iloc 
)

Launch asynchronously the nonbonded force calculations.

Also launches the initial pruning of a fresh list after search.

The local and non-local interaction calculations are launched in two separate streams. If there is no work (i.e. empty pair list), the force kernel launch is omitted.

void Nbnxm::gpu_launch_kernel ( NbnxmGpu *  nb,
const gmx::StepWorkload stepWork,
const Nbnxm::InteractionLocality  iloc 
)

Launch GPU kernel.

As we execute nonbonded workload in separate queues, before launching the kernel we need to make sure that the following operations have completed:

  • atomdata allocation and related H2D transfers (every nstlist step);
  • pair list H2D transfer (every nstlist step);
  • shift vector H2D transfer (every nstlist step);
  • force (+shift force and energy) output clearing (every step).

These operations are issued in the local queue at the beginning of the step and therefore always complete before the local kernel launch. The non-local kernel is launched after the local on the same device/context, so this is inherently scheduled after the operations in the local stream (including the above "misc_ops"). However, for the sake of having a future-proof implementation, we use the misc_ops_done event to record the point in time when the above operations are finished and synchronize with this event in the non-local stream.
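Based on the ordering described above, a typical per-step sequence for the local interactions looks roughly like this sketch (error handling and domain-decomposition details omitted; variable names are illustrative):

    gpu_copy_xq_to_gpu(nb, nbdata, gmx::AtomLocality::Local);           // H2D xq copy
    gpu_launch_kernel(nb, stepWork, gmx::InteractionLocality::Local);   // force kernel
    gpu_launch_cpyback(nb, nbdata, stepWork, gmx::AtomLocality::Local); // D2H forces
    // ... independent CPU work can overlap with the GPU here ...
    gpu_wait_finish_task(nb, stepWork, gmx::AtomLocality::Local,
                         &e_lj, &e_el, shiftForces, wcycle);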

void Nbnxm::gpu_launch_kernel_pruneonly ( NbnxmGpu *  nb,
gmx::InteractionLocality  iloc,
int  numParts 
)

Launch asynchronously the nonbonded prune-only kernel.

The local and non-local list pruning are launched in their separate streams.

Notes for future scheduling tuning: Currently we schedule the dynamic pruning between two MD steps after both local and nonlocal force D2H transfers completed. We could launch already after the cpyback is launched, but we want to avoid prune kernels (especially in the non-local high prio-stream) competing with nonbonded work.

However, this is not ideal as this schedule does not expose the available concurrency. The dynamic pruning kernel:

  • should be allowed to overlap with any task other than force compute, including transfers (F D2H and the next step's x H2D as well as force clearing).
  • we'd prefer to avoid competition with non-bonded force kernels belonging to the same rank and ideally other ranks too.

In the most general case, the former would require scheduling pruning in a separate stream and adding additional event sync points to ensure that force kernels read consistent pair list data. This would lead to some overhead (due to extra cudaStreamWaitEvent calls, 3-5 us/call) which we might be able to live with. The gains from additional overlap might not be significant as long as update+constraints anyway takes longer than pruning, but there will still be use-cases where more overlap may help (e.g. multiple ranks per GPU, no/hbonds only constraints). The above second point is harder to address given that multiple ranks will often share a GPU. Ranks that complete their nonbondeds sooner can schedule pruning earlier and without a third priority level it is difficult to avoid some interference of prune kernels with force tasks (in particular preemption of low-prio local force task).

Parameters
[in,out]  nb        GPU nonbonded data.
[in]      iloc      Interaction locality flag.
[in]      numParts  Number of parts the pair list is split into in the rolling kernel.
int Nbnxm::gpu_min_ci_balanced ( NbnxmGpu *  nb)

Calculates the minimum size of proximity lists to improve SM load balance with GPU non-bonded kernels.

This function is documented in the header file.

static void Nbnxm::gpu_reduce_staged_outputs ( const nb_staging_t &  nbst,
const InteractionLocality  iLocality,
const bool  reduceEnergies,
const bool  reduceFshift,
real e_lj,
real e_el,
rvec *  fshift 
)
inline static

Reduce data staged internally in the nbnxn module.

Shift forces and electrostatic/LJ energies copied from the GPU into a module-internal staging area are immediately reduced (CPU-side buffers passed) after having waited for the transfers' completion.

Note that this function should always be called after the transfers into the staging buffers have completed.

Template Parameters
StagingData  Type of staging data
Parameters
[in]   nbst            Nonbonded staging data
[in]   iLocality       Interaction locality specifier
[in]   reduceEnergies  True if energy reduction should be done
[in]   reduceFshift    True if shift force reduction should be done
[out]  e_lj            Variable to accumulate LJ energy into
[out]  e_el            Variable to accumulate electrostatic energy into
[out]  fshift          Pointer to the array of shift forces to accumulate into
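An illustrative sketch of the reduction body; the staging member names (eLJ, eElec, fShift) and the shift-vector count SHIFTS are assumptions, not the actual nb_staging_t layout:

    if (reduceEnergies)
    {
        *e_lj += nbst.eLJ[0];   // single staged LJ energy value (assumed member)
        *e_el += nbst.eElec[0]; // single staged electrostatic energy value (assumed member)
    }
    if (reduceFshift)
    {
        for (int i = 0; i < SHIFTS; i++)
        {
            rvec_inc(fshift[i], nbst.fShift[i]); // accumulate per-shift forces
        }
    }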
void Nbnxm::gpu_reset_timings ( struct nonbonded_verlet_t nbv)

Resets nonbonded GPU timings.

This function is documented in the header file.

bool Nbnxm::gpu_try_finish_task ( NbnxmGpu *  nb,
const gmx::StepWorkload stepWork,
gmx::AtomLocality  aloc,
real e_lj,
real e_el,
gmx::ArrayRef< gmx::RVec shiftForces,
GpuTaskCompletion  completionKind,
gmx_wallcycle *  wcycle 
)

Attempts to complete nonbonded GPU task.

This function attempts to complete the nonbonded task (both GPU and CPU auxiliary work). Success, i.e. that the tasks completed and results are ready to be consumed, is signaled by the return value (always true if blocking wait mode requested).

The completionKind parameter controls whether the behavior is non-blocking (achieved by passing GpuTaskCompletion::Check) or a blocking wait until the results are ready (when GpuTaskCompletion::Wait is passed). In "Check" mode the function returns immediately if the GPU stream still contains tasks that have not completed, which allows more flexible overlapping of work on the CPU with GPU execution.

Note that it is only safe to use the results, and to continue to the next MD step when this function has returned true which indicates successful completion of

  • All nonbonded GPU tasks: both compute and device transfer(s)
  • auxiliary tasks: updating the internal module state (timing accumulation, list pruning states) and
  • internal staging reduction of (fshift, e_el, e_lj).

In GpuTaskCompletion::Check mode this function does the timing and keeps correct count for the nonbonded task (incrementing only once per task); in GpuTaskCompletion::Wait mode timing is expected to be done in the caller.

TODO: improve the handling of outputs e.g. by ensuring that this function explicitly returns the force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning the energy and Fshift contributions for some external/centralized reduction.

Parameters
[in]   nb              The nonbonded data GPU structure
[in]   stepWork        Step schedule flags
[in]   aloc            Atom locality identifier
[out]  e_lj            Pointer to the LJ energy output to accumulate into
[out]  e_el            Pointer to the electrostatics energy output to accumulate into
[out]  shiftForces     Shift forces buffer to accumulate into
[in]   completionKind  Indicates whether nonbonded task completion should only be checked rather than waited for
[out]  wcycle          Pointer to wallcycle data structure
Returns
True if the nonbonded tasks associated with aloc locality have completed
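A sketch of overlapping CPU work with the GPU task using the non-blocking mode; doSomeCpuWork() is a hypothetical placeholder:

    while (!gpu_try_finish_task(nb, stepWork, gmx::AtomLocality::Local,
                                &e_lj, &e_el, shiftForces,
                                GpuTaskCompletion::Check, wcycle))
    {
        doSomeCpuWork(); // CPU-side task overlapped with GPU execution
    }
    // From here on the forces, energies, and shift forces are ready to use.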
bool Nbnxm::gpu_try_finish_task ( NbnxmGpu *  nb,
const gmx::StepWorkload stepWork,
const AtomLocality  aloc,
real e_lj,
real e_el,
gmx::ArrayRef< gmx::RVec shiftForces,
GpuTaskCompletion  completionKind,
gmx_wallcycle *  wcycle 
)

Attempts to complete nonbonded GPU task.

See documentation in nbnxm_gpu.h for details.

Todo:
Move into shared source file, perhaps including cuda_runtime.h if needed for any remaining CUDA-specific objects.
void Nbnxm::gpu_upload_shiftvec ( NbnxmGpu *  nb,
const nbnxn_atomdata_t *  nbatom 
)

Uploads shift vector to the GPU if the box is dynamic (otherwise just returns).

This function is documented in the header file.

float Nbnxm::gpu_wait_finish_task ( NbnxmGpu *  nb,
const gmx::StepWorkload stepWork,
gmx::AtomLocality  aloc,
real e_lj,
real e_el,
gmx::ArrayRef< gmx::RVec shiftForces,
gmx_wallcycle *  wcycle 
)

Completes the nonbonded GPU task, blocking until GPU tasks and data transfers finish.

Also does timing accounting and reduction of the internal staging buffers. As this is called at the end of the step, it also resets the pair list and pruning flags.

Parameters
[in]   nb           The nonbonded data GPU structure
[in]   stepWork     Step schedule flags
[in]   aloc         Atom locality identifier
[out]  e_lj         Pointer to the LJ energy output to accumulate into
[out]  e_el         Pointer to the electrostatics energy output to accumulate into
[out]  shiftForces  Shift forces buffer to accumulate into
[out]  wcycle       Pointer to wallcycle data structure
float Nbnxm::gpu_wait_finish_task ( NbnxmGpu *  nb,
const gmx::StepWorkload stepWork,
AtomLocality  aloc,
real e_lj,
real e_el,
gmx::ArrayRef< gmx::RVec shiftForces,
gmx_wallcycle *  wcycle 
)

Wait for the asynchronously launched nonbonded tasks and data transfers to finish.

Also does timing accounting and reduction of the internal staging buffers. As this is called at the end of the step, it also resets the pair list and pruning flags.

Parameters
[in]   nb           The nonbonded data GPU structure
[in]   stepWork     Force schedule flags
[in]   aloc         Atom locality identifier
[out]  e_lj         Pointer to the LJ energy output to accumulate into
[out]  e_el         Pointer to the electrostatics energy output to accumulate into
[out]  shiftForces  Shift forces buffer to accumulate into
[out]  wcycle       Pointer to wallcycle data structure
Returns
The number of cycles the GPU wait took
static InteractionLocality Nbnxm::gpuAtomToInteractionLocality ( const AtomLocality  atomLocality)
inline static

Convert atom locality to interaction locality.

In the current implementation this is a straightforward conversion: local to local, non-local to non-local.

Parameters
[in]  atomLocality  Atom locality specifier
Returns
Interaction locality corresponding to the atom locality passed.
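A minimal sketch of the straightforward mapping described above:

    static InteractionLocality gpuAtomToInteractionLocalitySketch(const AtomLocality atomLocality)
    {
        validateGpuAtomLocality(atomLocality); // "all" is rejected by the GPU module
        return (atomLocality == AtomLocality::Local) ? InteractionLocality::Local
                                                     : InteractionLocality::NonLocal;
    }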
static bool Nbnxm::haveGpuShortRangeWork ( const NbnxmGpu &  nb,
const gmx::InteractionLocality  iLocality 
)
static

Returns true if there is GPU short-range work for the given interaction locality.

Note that, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal; therefore, if there are GPU-offloaded bonded interactions, this function will return true for all interaction localities.

Parameters
[in,out]  nb         Pointer to the nonbonded GPU data structure
[in]      iLocality  Interaction locality identifier
bool Nbnxm::haveGpuShortRangeWork ( const NbnxmGpu *  nb,
gmx::AtomLocality  aLocality 
)

Returns true if there is GPU short-range work for the given atom locality.

Note that, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal; therefore, if there are GPU-offloaded bonded interactions, this function will return true for both the local and nonlocal atom ranges.

Parameters
[in,out]  nb         Pointer to the nonbonded GPU data structure
[in]      aLocality  Atom locality identifier
void Nbnxm::init_ewald_coulomb_force_table ( const EwaldCorrectionTables &  tables,
NBParamGpu *  nbp,
const DeviceContext &  deviceContext 
)

Tabulates the Ewald Coulomb force and initializes the size/scale and the table GPU array.

If called with an already allocated table, it just re-uploads the table.

template<typename T >
static T Nbnxm::lerp ( d0,
d1,
t 
)
inline static

Linear interpolation using exactly two FMA operations.

Implements numeric equivalent of: (1-t)*d0 + t*d1.
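One way to realize this with exactly two FMAs is the following sketch, using std::fma (the kernel presumably uses the device equivalent):

    #include <cmath>

    template<typename T>
    static T lerpSketch(T d0, T d1, T t)
    {
        // fma(-t, d0, d0) computes (1 - t) * d0; the outer fma adds t * d1.
        return std::fma(t, d1, std::fma(-t, d0, d0));
    }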

const char * Nbnxm::lookup_kernel_name ( Nbnxm::KernelType  kernelType)

Return a string identifying the kernel type.

Parameters
[in]  kernelType  Nonbonded kernel type, takes values from the nbnxn_kernel_type enum
Returns
A string identifying the kernel corresponding to the type passed as argument
enum ElecType Nbnxm::nbnxmGpuPickElectrostaticsKernelType ( const interaction_const_t *  ic,
const DeviceInformation deviceInfo 
)

Return the enum value of electrostatics kernel type for given interaction parameters ic.

enum VdwType Nbnxm::nbnxmGpuPickVdwKernelType ( const interaction_const_t *  ic,
int  combRule 
)

Return the enum value of VdW kernel type for given ic and combRule.

template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
auto Nbnxm::nbnxmKernel ( cl::sycl::handler &  cgh,
DeviceAccessor< float4, mode::read >  a_xq,
DeviceAccessor< float, mode::read_write >  a_f,
DeviceAccessor< float3, mode::read >  a_shiftVec,
DeviceAccessor< float, mode::read_write >  a_fShift,
OptionalAccessor< float, mode::read_write, doCalcEnergies >  a_energyElec,
OptionalAccessor< float, mode::read_write, doCalcEnergies >  a_energyVdw,
DeviceAccessor< nbnxn_cj4_t, doPruneNBL?mode::read_write:mode::read >  a_plistCJ4,
DeviceAccessor< nbnxn_sci_t, mode::read >  a_plistSci,
DeviceAccessor< nbnxn_excl_t, mode::read >  a_plistExcl,
OptionalAccessor< float2, mode::read, ljComb< vdwType >>  a_ljComb,
OptionalAccessor< int, mode::read,!ljComb< vdwType >>  a_atomTypes,
OptionalAccessor< float, mode::read,!ljComb< vdwType >>  a_nbfp,
OptionalAccessor< float, mode::read, ljEwald< vdwType >>  a_nbfpComb,
OptionalAccessor< float, mode::read, elecEwaldTab< elecType >>  a_coulombTab,
const int  numTypes,
const float  rCoulombSq,
const float  rVdwSq,
const float  twoKRf,
const float  ewaldBeta,
const float  rlistOuterSq,
const float  ewaldShift,
const float  epsFac,
const float  ewaldCoeffLJ,
const float  cRF,
const shift_consts_t  dispersionShift,
const shift_consts_t  repulsionShift,
const switch_consts_t  vdwSwitch,
const float  rVdwSwitch,
const float  ljEwaldShift,
const float  coulombTabScale,
const bool  calcShift 
)

Main kernel for NBNXM.

template<bool haveFreshList>
auto Nbnxm::nbnxmKernelPruneOnly ( cl::sycl::handler &  cgh,
DeviceAccessor< float4, mode::read >  a_xq,
DeviceAccessor< float3, mode::read >  a_shiftVec,
DeviceAccessor< nbnxn_cj4_t, mode::read_write >  a_plistCJ4,
DeviceAccessor< nbnxn_sci_t, mode::read >  a_plistSci,
DeviceAccessor< unsigned int, haveFreshList?mode::write:mode::read >  a_plistIMask,
const float  rlistOuterSq,
const float  rlistInnerSq,
const int  numParts,
const int  part 
)

Prune-only kernel for NBNXM.

void Nbnxm::nbnxn_gpu_x_to_nbat_x ( const Nbnxm::Grid &  grid,
bool  setFillerCoords,
NbnxmGpu *  gpu_nbv,
DeviceBuffer< gmx::RVec d_x,
GpuEventSynchronizer *  xReadyOnDevice,
gmx::AtomLocality  locality,
int  gridId,
int  numColumnsMax 
)

X buffer operations on GPU: performs conversion from rvec to nb format.

Parameters
[in]      grid             Grid to be converted.
[in]      setFillerCoords  If the filler coordinates are used.
[in,out]  gpu_nbv          The nonbonded data GPU structure.
[in]      d_x              Device-side coordinates in plain rvec format.
[in]      xReadyOnDevice   Event synchronizer indicating that the coordinates are ready in the device memory.
[in]      locality         Copy coordinates for local or non-local atoms.
[in]      gridId           Index of the grid being converted.
[in]      numColumnsMax    Maximum number of columns in the grid.
static void Nbnxm::nbnxn_ocl_init_const ( cl_atomdata_t *  atomData,
NBParamGpu *  nbParams,
const interaction_const_t *  ic,
const PairlistParams &  listParams,
const nbnxn_atomdata_t::Params &  nbatParams,
const DeviceContext &  deviceContext 
)
static

Initializes simulation constant data.

Initializes members of the atomdata and nbparam structs and clears e/fshift output buffers.

static gmx_bool Nbnxm::nbnxn_simd_supported ( const gmx::MDLogger mdlog,
const t_inputrec *  ir 
)
static

Returns whether CPU SIMD support exists for the given inputrec.

If the return value is FALSE and fplog/cr != NULL, prints a fallback message to fplog/stderr.

void Nbnxm::nbnxn_wait_x_on_device ( NbnxmGpu *  nb)

Sync the CPU thread on the coordinate copy to the device.

Parameters
[in]  nb  The nonbonded data GPU structure
void Nbnxm::nbnxnInsertNonlocalGpuDependency ( const NbnxmGpu *  nb,
gmx::InteractionLocality  interactionLocality 
)

Sync the nonlocal stream with dependent tasks in the local queue.

Parameters
[in]  nb                   The nonbonded data GPU structure
[in]  interactionLocality  Local or NonLocal sync point
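A sketch of the dependency insertion using GROMACS' GpuEventSynchronizer; the event and stream names here are illustrative, not the actual NbnxmGpu members:

    void insertDependencySketch(GpuEventSynchronizer* localOpsDone,
                                const DeviceStream&   localStream,
                                const DeviceStream&   nonLocalStream)
    {
        localOpsDone->markEvent(localStream);           // record a point in the local queue
        localOpsDone->enqueueWaitEvent(nonLocalStream); // make the non-local queue wait on it
    }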
static void Nbnxm::reduceForceIAndFShift ( cl::sycl::accessor< float, 1, mode::read_write, target::local >  sm_buf,
const float3  fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
const bool  calcFShift,
const cl::sycl::nd_item< 1 >  itemIdx,
const int  tidxi,
const int  tidxj,
const int  sci,
const int  shift,
DeviceAccessor< float, mode::read_write >  a_f,
DeviceAccessor< float, mode::read_write >  a_fShift 
)
inline static

Final i-force reduction.

This implementation works only with power of two array sizes.
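The power-of-two requirement comes from the classic tree-reduction pattern; an illustrative (not the actual kernel) version over a shared buffer:

    // blockSize must be a power of two; a work-group barrier is required
    // between iterations in the real kernel.
    static void treeReduceSketch(float* sm_buf, const int tid, const int blockSize)
    {
        for (int offset = blockSize / 2; offset > 0; offset >>= 1)
        {
            if (tid < offset)
            {
                sm_buf[tid] += sm_buf[tid + offset]; // pairwise partial sums
            }
        }
        // sm_buf[0] now holds the sum of all blockSize elements.
    }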

static cl_kernel Nbnxm::selectPruneKernel ( cl_kernel  kernel_pruneonly[],
bool  firstPrunePass 
)
inline static

Return a pointer to the prune kernel version to be executed at the current invocation.

Parameters
[in]  kernel_pruneonly  Array of prune kernel objects
[in]  firstPrunePass    True if the first pruning pass is being executed
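A minimal sketch consistent with the description; the array indices are illustrative:

    static cl_kernel selectPruneKernelSketch(cl_kernel kernel_pruneonly[], bool firstPrunePass)
    {
        const int kernelIndex = firstPrunePass ? 0 : 1; // [0]: 1st pass, [1]: rolling prune
        return kernel_pruneonly[kernelIndex];
    }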
void Nbnxm::setupGpuShortRangeWork ( NbnxmGpu *  nb,
const gmx::GpuBonded *  gpuBonded,
gmx::InteractionLocality  iLocality 
)

Set up internal flags that indicate what type of short-range work there is.

As nonbondeds and bondeds share input/output buffers and GPU queues, both are considered when checking for work in the current domain.

This function is expected to be called every time the work-distribution can change (i.e. at search/domain decomposition steps).

Parameters
[in,out]  nb         Pointer to the nonbonded GPU data structure
[in]      gpuBonded  Pointer to the GPU bonded data structure
[in]      iLocality  Interaction locality identifier
static void Nbnxm::sort_atoms ( int  dim,
gmx_bool  Backwards,
int  dd_zone,
bool  relevantAtomsAreWithinGridBounds,
int *  a,
int  n,
gmx::ArrayRef< const gmx::RVec x,
real  h0,
real  invh,
int  n_per_h,
gmx::ArrayRef< int >  sort 
)
static

Sorts particle index a on coordinates x along dim.

Backwards tells whether we want decreasing instead of increasing coordinates. h0 is the minimum of the coordinate range. invh is 1/length of the sorting range. n_per_h (>= n) is the expected average number of particles per 1/invh. sort is the sorting work array; it should have a size of at least n_per_h*c_sortGridRatio + n, or, more simply, allocate at least n*c_sortGridMaxSizeFactor elements.
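A hedged sketch of the hole-index computation implied by these parameters (the collision handling, which places an atom in the next free hole and motivates the n extra entries, is omitted):

    #include <algorithm> // std::clamp

    static int holeIndexSketch(real coordinate, real h0, real invh, int n_per_h)
    {
        const int numHoles = n_per_h * c_sortGridRatio; // primary sorting holes
        const int hole     = static_cast<int>((coordinate - h0) * invh * numHoles);
        // Clamp atoms that fall slightly outside the expected coordinate range.
        return std::clamp(hole, 0, numHoles - 1);
    }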

static void Nbnxm::sort_cluster_on_flag ( int  numAtomsInCluster,
int  atomStart,
int  atomEnd,
const int *  atinfo,
gmx::ArrayRef< int >  order,
int *  flags 
)
static

Set non-bonded interaction flags for the current cluster.

Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.

static void Nbnxm::sync_ocl_event ( cl_command_queue  stream,
cl_event *  ocl_event 
)
static

Enqueues a wait for event completion.

Then it releases the event and sets it to 0. Don't use this function when more than one wait will be issued for the event. Equivalent to a CUDA stream synchronization.
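A sketch of the enqueue-wait-then-release pattern using standard OpenCL 1.2 calls (error checking omitted):

    static void syncOclEventSketch(cl_command_queue stream, cl_event* ocl_event)
    {
        // Make subsequent commands in the queue wait for the event ...
        clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
        // ... then release it and reset the handle so it is not waited on twice.
        clReleaseEvent(*ocl_event);
        *ocl_event = nullptr;
    }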

static bool Nbnxm::useLjCombRule ( const enum VdwType  vdwType)
inline static

Returns true if LJ combination rules are used in the non-bonded kernels.

Parameters
[in]  vdwType  The VdW interaction/implementation type as defined by the VdwType enumeration.
Returns
Whether combination rules are used by the run.
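A plausible implementation given the VdwType flavors documented above, assuming only the combination-rule cut-off flavors qualify:

    static bool useLjCombRuleSketch(const enum VdwType vdwType)
    {
        return vdwType == VdwType::CutCombGeom || vdwType == VdwType::CutCombLB;
    }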
static void Nbnxm::validateGpuAtomLocality ( const AtomLocality  atomLocality)
inline static

Check that atom locality values are valid for the GPU module.

In the GPU module atom locality "all" is not supported, the local and non-local ranges are treated separately.

Parameters
[in]  atomLocality  Atom locality specifier

Variable Documentation

constexpr int Nbnxm::c_syclPruneKernelJ4Concurrency = 4
static

Default for the prune kernel's j4 processing concurrency.

The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.

unsigned int Nbnxm::gpu_min_ci_balanced_factor = 50
static

This parameter should be determined heuristically from the kernel execution times.

This value is best for small systems on a single AMD Radeon R9 290X (and about 5% faster than 40, which is the default for CUDA devices). Larger simulation systems were quite insensitive to the value of this parameter.

constexpr gmx::EnumerationArray<KernelType, int> Nbnxm::IClusterSizePerKernelType
static
Initial value:
= {
{ 0, c_nbnxnCpuIClusterSize, c_nbnxnCpuIClusterSize, c_nbnxnCpuIClusterSize,
c_nbnxnGpuClusterSize, c_nbnxnGpuClusterSize }
}

The nbnxn i-cluster size in atoms for each nbnxn kernel type.

constexpr gmx::EnumerationArray<KernelType, int> Nbnxm::JClusterSizePerKernelType
static
Initial value:
= {
{ 0, c_nbnxnCpuIClusterSize,
4 , 4 / 2,
c_nbnxnGpuClusterSize, c_nbnxnGpuClusterSize / 2 }
}

The nbnxn j-cluster size in atoms for each nbnxn kernel type.