#include "config.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <algorithm>
#include "impl_reference_definitions.h"
#include "impl_reference_simd_float.h"

Include dependency graph for impl_reference_util_float.h:

This graph shows which files directly or indirectly include this file:

Description

Reference impl., higher-level single prec. SIMD utility functions.

Author: Erik Lindahl erik.lindahl@scilifelab.se

Higher-level SIMD utility functions, single precision.
These include generic functions to work with triplets of data, typically coordinates, and a few utility functions to load and update data in the nonbonded kernels. These functions should be available on all implementations, although some wide SIMD implementations (width>=8) also provide special optional versions to work with half or quarter registers to improve the performance in the nonbonded kernels.
static const int	gmx::c_simdBestPairAlignmentFloat = 2
	Best alignment to use for aligned pairs of float data. More...

template<int align>
static void gmx_simdcall	gmx::gatherLoadTranspose (const float *base, const std::int32_t offset[], SimdFloat *v0, SimdFloat *v1, SimdFloat *v2, SimdFloat *v3)
	Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets, and transpose into 4 SIMD float variables. More...

template<int align>
static void gmx_simdcall	gmx::gatherLoadTranspose (const float *base, const std::int32_t offset[], SimdFloat *v0, SimdFloat *v1)
	Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets, and transpose into 2 SIMD float variables. More...

template<int align>
static void gmx_simdcall	gmx::gatherLoadUTranspose (const float *base, const std::int32_t offset[], SimdFloat *v0, SimdFloat *v1, SimdFloat *v2)
	Load 3 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets, and transpose into 3 SIMD float variables. More...

template<int align>
static void gmx_simdcall	gmx::transposeScatterStoreU (float *base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
	Transpose and store 3 SIMD floats to 3 consecutive addresses at GMX_SIMD_FLOAT_WIDTH offsets. More...

template<int align>
static void gmx_simdcall	gmx::transposeScatterIncrU (float *base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
	Transpose and add 3 SIMD floats to 3 consecutive addresses at GMX_SIMD_FLOAT_WIDTH offsets. More...

template<int align>
static void gmx_simdcall	gmx::transposeScatterDecrU (float *base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
	Transpose and subtract 3 SIMD floats from 3 consecutive addresses at GMX_SIMD_FLOAT_WIDTH offsets. More...

static void gmx_simdcall	gmx::expandScalarsToTriplets (SimdFloat scalar, SimdFloat *triplets0, SimdFloat *triplets1, SimdFloat *triplets2)
	Expand each element of float SIMD variable into three identical consecutive elements in three SIMD outputs. More...

template<int align>
static void gmx_simdcall	gmx::gatherLoadBySimdIntTranspose (const float *base, SimdFInt32 offset, SimdFloat *v0, SimdFloat *v1, SimdFloat *v2, SimdFloat *v3)
	Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets specified by a SIMD integer, transpose into 4 SIMD float variables. More...

template<int align>
static void gmx_simdcall	gmx::gatherLoadUBySimdIntTranspose (const float *base, SimdFInt32 offset, SimdFloat *v0, SimdFloat *v1)
	Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets (unaligned) specified by SIMD integer, transpose into 2 SIMD floats. More...

template<int align>
static void gmx_simdcall	gmx::gatherLoadBySimdIntTranspose (const float *base, SimdFInt32 offset, SimdFloat *v0, SimdFloat *v1)
	Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets specified by a SIMD integer, transpose into 2 SIMD float variables. More...

static float gmx_simdcall	gmx::reduceIncr4ReturnSum (float *m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
	Reduce each of four SIMD floats, add those values to four consecutive floats in memory, return sum. More...

Functions
Higher-level SIMD utilities accessing partial (half-width) SIMD floats.
These functions are optional. They are only useful for SIMD implementations where the width is 8 or larger, and where it would be inefficient to process 4*8, 8*8, or more, interactions in parallel.

Currently, only Intel provides very wide SIMD implementations, but these also come with excellent support for loading, storing, accessing and shuffling parts of the register in so-called 'lanes' of 4 bytes each. We can use this to load separate parts into the low/high halves of the register in the inner loop of the nonbonded kernel, which e.g. makes it possible to process 4*4 nonbonded interactions as a pattern of 2*8. We can also use implementations with width 16 or greater.

To make this more generic, when GMX_SIMD_HAVE_HSIMD_UTIL_REAL is 1, the SIMD implementation provides seven special routines that:
- Load the low/high parts of a SIMD variable from different pointers.
- Load half the SIMD width from one pointer, and duplicate in low/high parts.
- Load two reals, put 1st one in all low elements, and 2nd in all high ones.
- Store the low/high parts of a SIMD variable to different pointers.
- Subtract both SIMD halves from a single half-SIMD-width memory location.
- Load aligned pairs (LJ parameters) from two base pointers, with a common offset list, and put these in the low/high SIMD halves.
- Reduce each half of two SIMD registers (i.e., 4 parts in total), increment four adjacent memory positions, and return the total sum.

Remember: this is ONLY used when the native SIMD width is large. You will just waste time if you implement it for normal 16-byte SIMD architectures.

This is part of the new C++ SIMD interface, so these functions are only available when using C++. Since some Gromacs code relying on the SIMD module is still C (not C++), we have kept the C-style naming for now - this will change once we are entirely C++.
static SimdFloat gmx_simdcall	gmx::loadDualHsimd (const float *m0, const float *m1)
	Load low & high parts of SIMD float from different locations. More...

static SimdFloat gmx_simdcall	gmx::loadDuplicateHsimd (const float *m)
	Load half-SIMD-width float data, spread to both halves. More...

static SimdFloat gmx_simdcall	gmx::loadU1DualHsimd (const float *m)
	Load two floats, spread 1st in low half, 2nd in high half. More...

static void gmx_simdcall	gmx::storeDualHsimd (float *m0, float *m1, SimdFloat a)
	Store low & high parts of SIMD float to different locations. More...

static void gmx_simdcall	gmx::incrDualHsimd (float *m0, float *m1, SimdFloat a)
	Add each half of SIMD variable to separate memory addresses. More...

static void gmx_simdcall	gmx::decr3Hsimd (float *m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
	Add the two halves of three SIMD floats, subtract the sum from three half-SIMD-width consecutive floats in memory. More...

template<int align>
static void gmx_simdcall	gmx::gatherLoadTransposeHsimd (const float *base0, const float *base1, const std::int32_t offset[], SimdFloat *v0, SimdFloat *v1)
	Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH/2 offsets, transpose into SIMD float (low half from base0, high from base1). More...

static float gmx_simdcall	gmx::reduceIncr4ReturnSumHsimd (float *m, SimdFloat v0, SimdFloat v1)
	Reduce the 4 half-SIMD-width floats in 2 SIMD variables (sum halves), increment four consecutive floats in memory, return sum. More...

static SimdFloat gmx_simdcall	gmx::loadUNDuplicate4 (const float *m)
	Load N floats and duplicate them 4 times each. More...

static SimdFloat gmx_simdcall	gmx::load4DuplicateN (const float *m)
	Load 4 floats and duplicate them N times each. More...

static SimdFloat gmx_simdcall	gmx::loadU4NOffset (const float *m, int offset)
	Load floats in blocks of 4 at fixed offsets. More...

Description

Higher-level SIMD utility functions, single precision.

Functions