Gromacs
2016.6
|
#include "gmxpre.h"
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/mdlib/nbnxn_gpu.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "nbnxn_ocl_internal.h"
#include "nbnxn_ocl_types.h"
Defines the OpenCL implementation of nbnxn_gpu.h.
Macros | |
#define | DEBUG_RUN_STEP 2 |
Specifies which kernel run to debug. | |
Functions | |
static void | validate_global_work_size (size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo) |
Validates the input global work size parameter. | |
static cl_kernel | select_nbnxn_kernel (gmx_nbnxn_ocl_t *nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune) |
Return a pointer to the kernel version to be executed at the current step. OpenCL kernel objects are cached in nb. If the requested kernel is not found in the cache, it will be created and the cache will be updated. | |
static int | calc_shmem_required (int vdwType, bool bPrefetchLjParam) |
Calculates the amount of shared memory required by the OpenCL kernel in use. | |
static void | fillin_ocl_structures (cl_nbparam_t *nbp, cl_nbparam_params_t *nbparams_params) |
Initializes data structures that are going to be sent to the OpenCL device. More... | |
void | wait_ocl_event (cl_event *ocl_event) |
Waits for the commands associated with the input event to finish. Then it releases the event and sets it to 0. Don't use this function when more than one wait will be issued for the event. | |
void | sync_ocl_event (cl_command_queue stream, cl_event *ocl_event) |
Enqueues a wait for event completion. More... | |
double | ocl_event_elapsed_ms (cl_event *ocl_event) |
Returns the duration in milliseconds for the command associated with the event. More... | |
void | nbnxn_gpu_launch_kernel (gmx_nbnxn_ocl_t *nb, const struct nbnxn_atomdata_t *nbatom, int flags, int iloc) |
Launch GPU kernel. More... | |
void | nbnxn_gpu_launch_cpyback (gmx_nbnxn_ocl_t *nb, const struct nbnxn_atomdata_t *nbatom, int flags, int aloc) |
Launch asynchronously the download of nonbonded forces from the GPU (and energies/shift forces if required). | |
void | nbnxn_gpu_wait_for_gpu (gmx_nbnxn_ocl_t *nb, int flags, int aloc, real *e_lj, real *e_el, rvec *fshift) |
Wait for the asynchronously launched nonbonded calculations and data transfers to finish. | |
int | nbnxn_gpu_pick_ewald_kernel_type (bool bTwinCut) |
Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. | |
Variables | |
static const char * | nb_kfunc_noener_noprune_ptr [eelOclNR][evdwOclNR] |
Force-only kernel function names. More... | |
static const char * | nb_kfunc_ener_noprune_ptr [eelOclNR][evdwOclNR] |
Force + energy kernel function names. More... | |
static const char * | nb_kfunc_noener_prune_ptr [eelOclNR][evdwOclNR] |
Force + pruning kernel function names. More... | |
static const char * | nb_kfunc_ener_prune_ptr [eelOclNR][evdwOclNR] |
Force + energy + pruning kernel function names. More... | |
static const int | c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster |
Convenience constants. | |
static const int | c_clSize = c_nbnxnGpuClusterSize |
static bool | always_ener = (getenv("GMX_GPU_ALWAYS_ENER") != NULL) |
Always/never run the energy/pruning kernels – only for benchmarking purposes. | |
static bool | never_ener = (getenv("GMX_GPU_NEVER_ENER") != NULL) |
static bool | always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL) |
|
static |
Initializes data structures that are going to be sent to the OpenCL device.
The device can't use the same data structures as the host for two main reasons:
void nbnxn_gpu_launch_kernel | ( | gmx_nbnxn_ocl_t * | nb, |
const struct nbnxn_atomdata_t * | nbatom, | ||
int | flags, | ||
int | iloc | ||
) |
Launch GPU kernel.
Launch asynchronously the nonbonded force calculations.
As we execute the nonbonded workload in separate queues, before launching the kernel we need to make sure that the following operations have completed:
These operations are issued in the local queue at the beginning of the step and therefore always complete before the local kernel launch. The non-local kernel is launched after the local on the same device/context, so this is inherently scheduled after the operations in the local stream (including the above "misc_ops"). However, for the sake of having a future-proof implementation, we use the misc_ops_done event to record the point in time when the above operations are finished and synchronize with this event in the non-local stream.
double ocl_event_elapsed_ms | ( | cl_event * | ocl_event | ) |
Returns the duration in milliseconds for the command associated with the event.
It then releases the event and sets it to 0. Before calling this function, make sure the command has finished either by calling clFinish or clWaitForEvents. The function returns 0.0 if the input event, *ocl_event, is 0. Don't use this function when more than one wait will be issued for the event.
void sync_ocl_event | ( | cl_command_queue | stream, |
cl_event * | ocl_event | ||
) |
Enqueues a wait for event completion.
Then it releases the event and sets it to 0. Don't use this function when more than one wait will be issued for the event. Equivalent to a CUDA stream synchronization.
|
static |
Force + energy kernel function names.
|
static |
Force + energy + pruning kernel function names.
|
static |
Force-only kernel function names.
|
static |
Force + pruning kernel function names.