GROMACS 2018
#include "gmxpre.h"
#include <assert.h>
#include <stdlib.h>
#include "thread_mpi/atomic.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_gpu.h"
#include "gromacs/mdlib/nbnxn_gpu_common.h"
#include "gromacs/mdlib/nbnxn_gpu_common_utils.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "nbnxn_ocl_internal.h"
#include "nbnxn_ocl_types.h"
Define OpenCL implementation of nbnxn_gpu.h.
TODO (psz):
Macros | |
#define | DEBUG_RUN_STEP 2 |
Specifies which kernel run to debug. | |
Functions | |
static void | validate_global_work_size (size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo) |
Validates the input global work size parameter. | |
static cl_kernel | selectPruneKernel (cl_kernel kernel_pruneonly[], bool firstPrunePass) |
Return a pointer to the prune kernel version to be executed at the current invocation. More... | |
static cl_kernel | select_nbnxn_kernel (gmx_nbnxn_ocl_t *nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune) |
Return a pointer to the kernel version to be executed at the current step. OpenCL kernel objects are cached in nb. If the requested kernel is not found in the cache, it will be created and the cache will be updated. | |
static int | calc_shmem_required_nonbonded (int vdwType, bool bPrefetchLjParam) |
Calculates the amount of shared memory required by the nonbonded kernel in use. | |
static void | fillin_ocl_structures (cl_nbparam_t *nbp, cl_nbparam_params_t *nbparams_params) |
Initializes data structures that are going to be sent to the OpenCL device. More... | |
static void | sync_ocl_event (cl_command_queue stream, cl_event *ocl_event) |
Enqueues a wait for event completion. More... | |
void | nbnxn_gpu_launch_kernel (gmx_nbnxn_ocl_t *nb, const struct nbnxn_atomdata_t *nbatom, int flags, int iloc) |
Launch GPU kernel. More... | |
static int | calc_shmem_required_prune (const int num_threads_z) |
Calculates the amount of shared memory required by the prune kernel. More... | |
void | nbnxn_gpu_launch_kernel_pruneonly (gmx_nbnxn_gpu_t *nb, int iloc, int numParts) |
Launch asynchronously the nonbonded prune-only kernel. More... | |
void | nbnxn_gpu_launch_cpyback (gmx_nbnxn_ocl_t *nb, const struct nbnxn_atomdata_t *nbatom, int flags, int aloc) |
Launch asynchronously the download of nonbonded forces from the GPU (and energies/shift forces if required). | |
int | nbnxn_gpu_pick_ewald_kernel_type (bool bTwinCut) |
Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. | |
Variables | |
static const char * | nb_kfunc_noener_noprune_ptr [eelOclNR][evdwOclNR] |
Force-only kernel function pointers. More... | |
static const char * | nb_kfunc_ener_noprune_ptr [eelOclNR][evdwOclNR] |
Force + energy kernel function pointers. More... | |
static const char * | nb_kfunc_noener_prune_ptr [eelOclNR][evdwOclNR] |
Force + pruning kernel function pointers. More... | |
static const char * | nb_kfunc_ener_prune_ptr [eelOclNR][evdwOclNR] |
Force + energy + pruning kernel function pointers. More... | |
static const int | c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster |
Convenience constants. | |
static const int | c_clSize = c_nbnxnGpuClusterSize |
|
inlinestatic |
Calculates the amount of shared memory required by the prune kernel.
Note that for the sake of simplicity we use the CUDA terminology "shared memory" for OpenCL local memory.
[in] | num_threads_z | cj4 concurrency equal to the number of threads/work items in the third dimension. |
|
static |
Initializes data structures that are going to be sent to the OpenCL device.
The device can't use the same data structures as the host for two main reasons:
This function is called before the launch of both nbnxn and prune kernels.
void nbnxn_gpu_launch_kernel | ( | gmx_nbnxn_ocl_t * | nb, |
const struct nbnxn_atomdata_t * | nbatom, | ||
int | flags, | ||
int | iloc | ||
) |
Launch GPU kernel.
Launch asynchronously the nonbonded force calculations.
As we execute nonbonded workload in separate queues, before launching the kernel we need to make sure that the following operations have completed:
These operations are issued in the local queue at the beginning of the step and therefore always complete before the local kernel launch. The non-local kernel is launched after the local on the same device/context, so this is inherently scheduled after the operations in the local stream (including the above "misc_ops"). However, for the sake of having a future-proof implementation, we use the misc_ops_done event to record the point in time when the above operations are finished and synchronize with this event in the non-local stream.
void nbnxn_gpu_launch_kernel_pruneonly | ( | gmx_nbnxn_gpu_t * | nb, |
int | iloc, | ||
int | numParts | ||
) |
Launch asynchronously the nonbonded prune-only kernel.
The local and non-local list pruning are launched in their separate streams.
Notes for future scheduling tuning: Currently we schedule the dynamic pruning between two MD steps after both local and nonlocal force D2H transfers have completed. We could launch already after the cpyback is launched, but we want to avoid prune kernels (especially in the non-local high-priority stream) competing with nonbonded work.
However, this is not ideal as this schedule does not expose the available concurrency. The dynamic pruning kernel:
In the most general case, the former would require scheduling pruning in a separate stream and adding additional event sync points to ensure that force kernels read consistent pair list data. This would lead to some overhead (due to extra cudaStreamWaitEvent calls, 3-5 us/call) which we might be able to live with. The gains from additional overlap might not be significant as long as update+constraints anyway takes longer than pruning, but there will still be use-cases where more overlap may help (e.g. multiple ranks per GPU, no/hbonds only constraints). The above second point is harder to address given that multiple ranks will often share a GPU. Ranks that complete their nonbondeds sooner can schedule pruning earlier and without a third priority level it is difficult to avoid some interference of prune kernels with force tasks (in particular preemption of low-prio local force task).
[in,out] | nb | GPU nonbonded data. |
[in] | iloc | Interaction locality flag. |
[in] | numParts | Number of parts the pair list is split into in the rolling kernel. |
|
inlinestatic |
Return a pointer to the prune kernel version to be executed at the current invocation.
[in] | kernel_pruneonly | array of prune kernel objects |
[in] | firstPrunePass | true if the first pruning pass is being executed |
|
static |
Enqueues a wait for event completion.
Then it releases the event and sets it to 0. Don't use this function when more than one wait will be issued for the event. Equivalent to a CUDA stream synchronization.
|
static |
Force + energy kernel function pointers.
|
static |
Force + energy + pruning kernel function pointers.
|
static |
Force-only kernel function pointers.
|
static |
Force + pruning kernel function pointers.