#include "gmxpre.h"
#include "config.h"
#include <cstdlib>
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/gpu_utils/syclutils.h"
#include "gromacs/math/vec.h"
#include "gromacs/timing/wallcycle.h"
#include "pme_gpu_grid.h"
#include "pme_gpu_types.h"
#include "pme_gpu_types_host.h"
#include "pme_gpu_types_host_impl.h"

Include dependency graph for pme_gpu_grid_sycl.cpp:

Description

Implements PME GPU halo exchange and PME GPU - Host FFT grid conversion functions. These functions are used for PME decomposition in mixed-mode.

Author: Mark Abraham mark..nosp@m.j.ab.nosp@m.raham.nosp@m.@gma.nosp@m.il.co.nosp@m.m

Classes
class	PackHaloExternal
	Handles a kernel which packs non-contiguous overlap data in all 8 neighboring directions. More...

class	UnpackHaloExternal
	Handles a kernel which gathers data from halo region in all 8 neighboring directions. More...

class	UnpackAndAddHaloInternal
	Handles a kernel which adds grid overlap data received from neighboring ranks. More...

class	PackHaloInternal
	Handles a kernel which packs non-contiguous overlap data in all 8 neighboring directions. More...

class	GridConverter< pmeToFft >
	Builds a kernel to convert between PME and FFT grids. More...

Typedefs
using	mode = sycl::access_mode

Functions
template<typename Kernel , int subGroupSize, class... Args>
static void	submit (const DeviceStream &deviceStream, size_t myGridX, size_t myGridY, sycl::uint3 pmeSize, Args &&...args)
	Submits a GPU grid kernel. More...

void	pmeGpuGridHaloExchange (const PmeGpu pmeGpu, gmx_wallcycle wcycle)
	Grid Halo exchange after PME spread ToDo: Current implementation transfers halo region from/to only immediate neighbours And, expects that overlapSize <= local grid width. Implement exchange with multiple neighbors to remove this limitation ToDo: Current implementation synchronizes pmeStream to make sure data is ready on GPU after spread. Consider using events for this synchnozation. More...

void	pmeGpuGridHaloExchangeReverse (const PmeGpu pmeGpu, gmx_wallcycle wcycle)
	Grid reverse Halo exchange before PME gather ToDo: Current implementation transfers halo region from/to only immediate neighbours And, expects that overlapSize <= local grid width. Implement exchange with multiple neighbors to remove this limitation ToDo: Current implementation synchronizes pmeStream to make sure data is ready on GPU after FFT to PME grid conversion. Consider using events for this synchnozation. More...

template<bool pmeToFft>
void	convertPmeGridToFftGrid (const PmeGpu pmeGpu, float h_fftRealGrid, gmx_parallel_3dfft_t *fftSetup, const int gridIndex)
	Copy PME Grid with overlap region to host FFT grid and vice-versa. Used in mixed mode PME decomposition. More...

template<bool pmeToFft>
void	convertPmeGridToFftGrid (const PmeGpu pmeGpu, DeviceBuffer< float > d_fftRealGrid, const int gridIndex)
	Copy PME Grid with overlap region to device FFT grid and vice-versa. Used in full GPU PME decomposition. More...

template void	convertPmeGridToFftGrid< true > (const PmeGpu pmeGpu, float h_fftRealGrid, gmx_parallel_3dfft_t *fftSetup, const int gridIndex)

template void	convertPmeGridToFftGrid< false > (const PmeGpu pmeGpu, float h_fftRealGrid, gmx_parallel_3dfft_t *fftSetup, const int gridIndex)

template void	convertPmeGridToFftGrid< true > (const PmeGpu pmeGpu, DeviceBuffer< float > d_fftRealGrid, const int gridIndex)

template void	convertPmeGridToFftGrid< false > (const PmeGpu pmeGpu, DeviceBuffer< float > d_fftRealGrid, const int gridIndex)

Variables
static constexpr int	sc_subGroupSize = 32
	Sub-group size for conversion kernels. More...

Function Documentation

template<bool pmeToFft>

void convertPmeGridToFftGrid	(	const PmeGpu *	pmeGpu,
		float *	h_fftRealGrid,
		gmx_parallel_3dfft_t *	fftSetup,
		int	gridIndex
	)

Copy PME Grid with overlap region to host FFT grid and vice-versa. Used in mixed mode PME decomposition.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	h_fftRealGrid	FFT grid on host
[in]	fftSetup	Host FFT setup structure
[in]	gridIndex	Grid index which is to be converted

Template Parameters

pmeToFft A boolean which tells if this is conversion from PME grid to FFT grid or reverse

template<bool pmeToFft>

void convertPmeGridToFftGrid	(	const PmeGpu *	pmeGpu,
		DeviceBuffer< float > *	d_fftRealGrid,
		int	gridIndex
	)

Copy PME Grid with overlap region to device FFT grid and vice-versa. Used in full GPU PME decomposition.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	d_fftRealGrid	FFT grid on device
[in]	gridIndex	Grid index which is to be converted

Template Parameters

pmeToFft A boolean which tells if this is conversion from PME grid to FFT grid or reverse

void pmeGpuGridHaloExchange	(	const PmeGpu *	pmeGpu,
		gmx_wallcycle *	wcycle
	)

Grid Halo exchange after PME spread ToDo: Current implementation transfers halo region from/to only immediate neighbours And, expects that overlapSize <= local grid width. Implement exchange with multiple neighbors to remove this limitation ToDo: Current implementation synchronizes pmeStream to make sure data is ready on GPU after spread. Consider using events for this synchnozation.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	wcycle	The wallclock counter.

void pmeGpuGridHaloExchangeReverse	(	const PmeGpu *	pmeGpu,
		gmx_wallcycle *	wcycle
	)

Grid reverse Halo exchange before PME gather ToDo: Current implementation transfers halo region from/to only immediate neighbours And, expects that overlapSize <= local grid width. Implement exchange with multiple neighbors to remove this limitation ToDo: Current implementation synchronizes pmeStream to make sure data is ready on GPU after FFT to PME grid conversion. Consider using events for this synchnozation.

Parameters

[in]	pmeGpu	The PME GPU structure.
[in]	wcycle	The wallclock counter.

template<typename Kernel , int subGroupSize, class... Args>

static void submit	(	const DeviceStream &	deviceStream,
		size_t	myGridX,
		size_t	myGridY,
		sycl::uint3	pmeSize,
		Args &&...	args
	)

static

Submits a GPU grid kernel.

Template Parameters

Kernel	The class containing a static kernel() method to return the kernel to execute
subGroupSize	Size of the sub-group.

Parameters

[in]	deviceStream	The device stream upon which to submit
[in]	myGridX,myGridY	Local domain size in X and Y dimension
[in]	pmeSize	Local PME grid size
[in]	args	Parameter pack to pass to the kernel

Variable Documentation

constexpr int sc_subGroupSize = 32

static

Sub-group size for conversion kernels.

Chosen to match relevant hardware widths on supported hardware. In particular, PonteVecchio is 32-wide.

Description

Classes

Typedefs

Functions

Variables

Function Documentation

Variable Documentation