#include "gmxpre.h"
#include <assert.h>
#include <stdlib.h>
#include "thread_mpi/atomic.h"
#include "gromacs/gpu_utils/gputraits_ocl.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_common.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_gpu.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "nbnxm_ocl_internal.h"
#include "nbnxm_ocl_types.h"

Include dependency graph for nbnxm_ocl.cpp:

Description

Define OpenCL implementation of nbnxm_gpu.h.

Author: Anca Hamuraru anca@streamcomputing.eu; Teemu Virolainen teemu@streamcomputing.eu; Dimitrios Karkoulis dimitris.karkoulis@gmail.com; Szilárd Páll pall.szilard@gmail.com

TODO (psz):

Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
Rework the copying of OCL data structures done before every invocation of both nb and prune kernels (using fillin_ocl_structures); also consider at the same time calling clSetKernelArg only on the updated parameters (if tracking changed parameters is feasible);
Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark dependencies on the kernel launched: e.g. the non-local nb kernel's dependency on the misc_ops_and_local_H2D_done event could be better expressed this way.
Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g.:
- in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic is identical in the two implementations, so a 3-way split might allow sharing code;

Functions
static void	Nbnxm::validate_global_work_size (const KernelLaunchConfig &config, int work_dim, const gmx_device_info_t *dinfo)
	Validates the input global work size parameter.

static cl_kernel	Nbnxm::selectPruneKernel (cl_kernel kernel_pruneonly[], bool firstPrunePass)
	Return a pointer to the prune kernel version to be executed at the current invocation.

static cl_kernel	Nbnxm::select_nbnxn_kernel (gmx_nbnxn_ocl_t *nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune)
	Return a pointer to the kernel version to be executed at the current step. OpenCL kernel objects are cached in nb. If the requested kernel is not found in the cache, it will be created and the cache will be updated.

static int	Nbnxm::calc_shmem_required_nonbonded (int vdwType, bool bPrefetchLjParam)
	Calculates the amount of shared memory required by the nonbonded kernel in use.

static void	Nbnxm::fillin_ocl_structures (cl_nbparam_t *nbp, cl_nbparam_params_t *nbparams_params)
	Initializes data structures that are going to be sent to the OpenCL device.

static void	Nbnxm::sync_ocl_event (cl_command_queue stream, cl_event *ocl_event)
	Enqueues a wait for event completion.

void	Nbnxm::gpu_copy_xq_to_gpu (gmx_nbnxn_ocl_t *nb, const nbnxn_atomdata_t *nbatom, const AtomLocality atomLocality)
	Launch asynchronously the xq buffer host to device copy.

void	Nbnxm::gpu_launch_kernel (gmx_nbnxn_ocl_t *nb, const gmx::StepWorkload &stepWork, const Nbnxm::InteractionLocality iloc)
	Launch GPU kernel.

static int	Nbnxm::calc_shmem_required_prune (const int num_threads_z)
	Calculates the amount of shared memory required by the prune kernel.

void	Nbnxm::gpu_launch_kernel_pruneonly (gmx_nbnxn_gpu_t *nb, const InteractionLocality iloc, const int numParts)
	Launch the pairlist prune only kernel for the given locality. `numParts` tells in how many parts, i.e. calls the list will be pruned.

void	Nbnxm::gpu_launch_cpyback (gmx_nbnxn_ocl_t *nb, struct nbnxn_atomdata_t *nbatom, const gmx::StepWorkload &stepWork, const AtomLocality aloc)
	Launch asynchronously the download of nonbonded forces from the GPU (and energies/shift forces if required).

int	Nbnxm::nbnxn_gpu_pick_ewald_kernel_type (const interaction_const_t &ic)
	Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off.

Variables
static const char *	Nbnxm::nb_kfunc_noener_noprune_ptr [eelOclNR][evdwOclNR]
	Force-only kernel function names.

static const char *	Nbnxm::nb_kfunc_ener_noprune_ptr [eelOclNR][evdwOclNR]
	Force + energy kernel function names.

static const char *	Nbnxm::nb_kfunc_noener_prune_ptr [eelOclNR][evdwOclNR]
	Force + pruning kernel function names.

static const char *	Nbnxm::nb_kfunc_ener_prune_ptr [eelOclNR][evdwOclNR]
	Force + energy + pruning kernel function names.


static const int	Nbnxm::c_numClPerSupercl = 8
	Convenience constants.

static const int	Nbnxm::c_clSize = c_nbnxnGpuClusterSize

Description

Functions

Variables