Gromacs
2016.6
|
#include "gmxpre.h"
#include <assert.h>
#include <math.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/detecthardware.h"
#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_gpu.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"
#include "nbnxn_ocl_internal.h"
#include "nbnxn_ocl_types.h"
Defines the OpenCL implementation of nbnxn_gpu_data_mgmt.h.
Functions | |
bool | useLjCombRule (int vdwType) |
Returns true if LJ combination rules are used in the non-bonded kernels. More... | |
void | ocl_free_buffered (cl_mem d_ptr, int *n, int *nalloc) |
Free device buffers. More... | |
void | ocl_realloc_buffered (cl_mem *d_dest, void *h_src, size_t type_size, int *curr_size, int *curr_alloc_size, int req_size, cl_context context, cl_command_queue s, bool bAsync=true, cl_event *copy_event=NULL) |
Reallocates device buffers. More... | |
static void | free_ocl_buffer (cl_mem *buffer) |
Releases the input OpenCL buffer. | |
static void | init_ewald_coulomb_force_table (const interaction_const_t *ic, cl_nbparam_t *nbp, const gmx_device_runtime_data_t *runData) |
Tabulates the Ewald Coulomb force and initializes the size/scale and the table GPU array. More... | |
static void | init_atomdata_first (cl_atomdata_t *ad, int ntypes, gmx_device_runtime_data_t *runData) |
Initializes the atomdata structure for the first time; it only gets filled at pair-search. | |
static void | set_cutoff_parameters (cl_nbparam_t *nbp, const interaction_const_t *ic) |
Copies all parameters related to the cut-off from ic to nbp. | |
static void | map_interaction_types_to_gpu_kernel_flavors (const interaction_const_t *ic, int combRule, int *gpu_eeltype, int *gpu_vdwtype) |
Returns the kinds of electrostatics and Vdw OpenCL kernels that will be used. More... | |
static void | init_nbparam (cl_nbparam_t *nbp, const interaction_const_t *ic, const nbnxn_atomdata_t *nbat, const gmx_device_runtime_data_t *runData) |
Initializes the nonbonded parameter data structure. | |
void | nbnxn_gpu_pme_loadbal_update_param (const nonbonded_verlet_t *nbv, const interaction_const_t *ic) |
This function is documented in the header file. | |
static void | init_plist (cl_plist_t *pl) |
Initializes the pair list data structure. | |
static void | init_timers (cl_timers_t *t, bool bUseTwoStreams) |
Initializes the timer data structure. | |
static void | init_timings (gmx_wallclock_gpu_t *t) |
Initializes the timings data structure. | |
static void | nbnxn_gpu_create_context (gmx_device_runtime_data_t *runtimeData, const gmx_device_info_t *devInfo, int rank) |
Creates context for OpenCL GPU given by mygpu. More... | |
static cl_kernel | nbnxn_gpu_create_kernel (gmx_nbnxn_ocl_t *nb, const char *kernel_name) |
Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. | |
static void | nbnxn_ocl_clear_e_fshift (gmx_nbnxn_ocl_t *nb) |
Clears nonbonded shift force output array and energy outputs on the GPU. | |
static void | nbnxn_gpu_init_kernels (gmx_nbnxn_ocl_t *nb) |
Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. | |
static void | nbnxn_ocl_init_const (gmx_nbnxn_ocl_t *nb, const interaction_const_t *ic, const nonbonded_verlet_group_t *nbv_group) |
Initializes simulation constant data. More... | |
void | nbnxn_gpu_init (gmx_nbnxn_ocl_t **p_nb, const gmx_gpu_info_t *gpu_info, const gmx_gpu_opt_t *gpu_opt, const interaction_const_t *ic, nonbonded_verlet_group_t *nbv_grp, int my_gpu_index, int rank, gmx_bool bLocalAndNonlocal) |
This function is documented in the header file. | |
static void | nbnxn_ocl_clear_f (gmx_nbnxn_ocl_t *nb, int natoms_clear) |
Clears the first natoms_clear elements of the GPU nonbonded force output array. | |
void | nbnxn_gpu_clear_outputs (gmx_nbnxn_ocl_t *nb, int flags) |
This function is documented in the header file. | |
void | nbnxn_gpu_init_pairlist (gmx_nbnxn_ocl_t *nb, const nbnxn_pairlist_t *h_plist, int iloc) |
This function is documented in the header file. | |
void | nbnxn_gpu_upload_shiftvec (gmx_nbnxn_ocl_t *nb, const nbnxn_atomdata_t *nbatom) |
This function is documented in the header file. | |
void | nbnxn_gpu_init_atomdata (gmx_nbnxn_ocl_t *nb, const struct nbnxn_atomdata_t *nbat) |
This function is documented in the header file. | |
void | free_kernel (cl_kernel *kernel_ptr) |
Releases an OpenCL kernel pointer. | |
void | free_kernels (cl_kernel *kernels, int count) |
Releases a list of OpenCL kernel pointers. | |
static void | free_gpu_device_runtime_data (gmx_device_runtime_data_t *runData) |
Free the OpenCL runtime data (context and program). More... | |
void | nbnxn_gpu_free (gmx_nbnxn_ocl_t *nb) |
This function is documented in the header file. | |
gmx_wallclock_gpu_t * | nbnxn_gpu_get_timings (gmx_nbnxn_ocl_t *nb) |
This function is documented in the header file. | |
void | nbnxn_gpu_reset_timings (nonbonded_verlet_t *nbv) |
This function is documented in the header file. | |
int | nbnxn_gpu_min_ci_balanced (gmx_nbnxn_ocl_t *nb) |
This function is documented in the header file. | |
gmx_bool | nbnxn_gpu_is_kernel_ewald_analytical (const gmx_nbnxn_ocl_t *nb) |
This function is documented in the header file. | |
Variables | |
static unsigned int | gpu_min_ci_balanced_factor = 50 |
This parameter should be determined heuristically from the kernel execution times. More... | |
|
static |
Free the OpenCL runtime data (context and program).
The function releases the OpenCL context and program associated with the device that the calling PP rank is running on.
runData | [in] pointer to the structure with runtime data. |
|
static |
Tabulates the Ewald Coulomb force and initializes the size/scale and the table GPU array.
If called with an already allocated table, it just re-uploads the table.
|
static |
Returns the kinds of electrostatics and Vdw OpenCL kernels that will be used.
Respectively, these values are from enum eelOcl and enum evdwOcl.
|
static |
Creates context for OpenCL GPU given by mygpu.
A fatal error results if creation fails.
[in,out] | runtimeData | runtime data including program and context |
[in] | devInfo | device info struct |
[in] | rank | MPI rank (for error reporting) |
|
static |
Initializes simulation constant data.
Initializes members of the atomdata and nbparam structs and clears e/fshift output buffers.
void ocl_free_buffered | ( | cl_mem | d_ptr, |
int * | n, | ||
int * | nalloc | ||
) |
Free device buffers.
If the pointers to the size variables are NULL, no resetting happens.
void ocl_realloc_buffered | ( | cl_mem * | d_dest, |
void * | h_src, | ||
size_t | type_size, | ||
int * | curr_size, | ||
int * | curr_alloc_size, | ||
int | req_size, | ||
cl_context | context, | ||
cl_command_queue | s, | ||
bool | bAsync = true , |
||
cl_event * | copy_event = NULL |
||
) |
Reallocates device buffers.
Reallocates the memory pointed to by d_dest and copies the data from the location pointed to by the host-side pointer h_src. Allocation is buffered, and therefore freeing is only needed if the previously allocated space is not enough. The H2D copy is launched in command queue s and can be done synchronously or asynchronously (the default is the latter). If copy_event is not NULL, on return it will contain an event object identifying the H2D copy. The event can further be used to queue a wait for this operation or to query profiling information. OpenCL equivalent of cu_realloc_buffered.
bool useLjCombRule | ( | int | vdwType | ) |
Returns true if LJ combination rules are used in the non-bonded kernels.
Full doc in nbnxn_ocl_internal.h
|
static |
This parameter should be determined heuristically from the kernel execution times.
This value is best for small systems on a single AMD Radeon R9 290X (and about 5% faster than 40, which is the default for CUDA devices). Larger simulation systems were quite insensitive to the value of this parameter.