Gromacs
2022.2
|
#include "config.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <algorithm>
#include "impl_reference_definitions.h"
#include "impl_reference_simd_double.h"
Reference impl., higher-level double prec. SIMD utility functions.
Higher-level SIMD utility functions, double precision. | |
These include generic functions to work with triplets of data, typically coordinates, and a few utility functions to load and update data in the nonbonded kernels. These functions should be available on all implementations. | |
static const int | gmx::c_simdBestPairAlignmentDouble = 2 |
Best alignment to use for aligned pairs of double data. More... | |
template<int align> | |
static void gmx_simdcall | gmx::gatherLoadTranspose (const double *base, const std::int32_t offset[], SimdDouble *v0, SimdDouble *v1, SimdDouble *v2, SimdDouble *v3) |
Load 4 consecutive double from each of GMX_SIMD_DOUBLE_WIDTH offsets, and transpose into 4 SIMD double variables. More... | |
template<int align> | |
static void gmx_simdcall | gmx::gatherLoadTranspose (const double *base, const std::int32_t offset[], SimdDouble *v0, SimdDouble *v1) |
Load 2 consecutive double from each of GMX_SIMD_DOUBLE_WIDTH offsets, and transpose into 2 SIMD double variables. More... | |
template<int align> | |
static void gmx_simdcall | gmx::gatherLoadUTranspose (const double *base, const std::int32_t offset[], SimdDouble *v0, SimdDouble *v1, SimdDouble *v2) |
Load 3 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets, and transpose into 3 SIMD double variables. More... | |
template<int align> | |
static void gmx_simdcall | gmx::transposeScatterStoreU (double *base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2) |
Transpose and store 3 SIMD doubles to 3 consecutive addresses at GMX_SIMD_DOUBLE_WIDTH offsets. More... | |
template<int align> | |
static void gmx_simdcall | gmx::transposeScatterIncrU (double *base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2) |
Transpose and add 3 SIMD doubles to 3 consecutive addresses at GMX_SIMD_DOUBLE_WIDTH offsets. More... | |
template<int align> | |
static void gmx_simdcall | gmx::transposeScatterDecrU (double *base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2) |
Transpose and subtract 3 SIMD doubles to 3 consecutive addresses at GMX_SIMD_DOUBLE_WIDTH offsets. More... | |
static void gmx_simdcall | gmx::expandScalarsToTriplets (SimdDouble scalar, SimdDouble *triplets0, SimdDouble *triplets1, SimdDouble *triplets2) |
Expand each element of double SIMD variable into three identical consecutive elements in three SIMD outputs. More... | |
template<int align> | |
static void gmx_simdcall | gmx::gatherLoadBySimdIntTranspose (const double *base, SimdDInt32 offset, SimdDouble *v0, SimdDouble *v1, SimdDouble *v2, SimdDouble *v3) |
Load 4 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets specified by a SIMD integer, transpose into 4 SIMD double variables. More... | |
template<int align> | |
static void gmx_simdcall | gmx::gatherLoadUBySimdIntTranspose (const double *base, SimdDInt32 offset, SimdDouble *v0, SimdDouble *v1) |
Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets (unaligned) specified by SIMD integer, transpose into 2 SIMD doubles. More... | |
template<int align> | |
static void gmx_simdcall | gmx::gatherLoadBySimdIntTranspose (const double *base, SimdDInt32 offset, SimdDouble *v0, SimdDouble *v1) |
Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets specified by a SIMD integer, transpose into 2 SIMD double variables. More... | |
static double gmx_simdcall | gmx::reduceIncr4ReturnSum (double *m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3) |
Reduce each of four SIMD doubles, add those values to four consecutive doubles in memory, return sum. More... | |
Functions | |
Higher-level SIMD utilities accessing partial (half-width) SIMD doubles. | |
See the single-precision versions for documentation. Since double precision is typically half the width of single, this double version is likely only useful with 512-bit and larger implementations. | |
static SimdDouble gmx_simdcall | gmx::loadDualHsimd (const double *m0, const double *m1) |
Load low & high parts of SIMD double from different locations. More... | |
static SimdDouble gmx_simdcall | gmx::loadDuplicateHsimd (const double *m) |
Load half-SIMD-width double data, spread to both halves. More... | |
static SimdDouble gmx_simdcall | gmx::loadU1DualHsimd (const double *m) |
Load two doubles, spread 1st in low half, 2nd in high half. More... | |
static void gmx_simdcall | gmx::storeDualHsimd (double *m0, double *m1, SimdDouble a) |
Store low & high parts of SIMD double to different locations. More... | |
static void gmx_simdcall | gmx::incrDualHsimd (double *m0, double *m1, SimdDouble a) |
Add each half of SIMD variable to separate memory adresses. More... | |
static void gmx_simdcall | gmx::decr3Hsimd (double *m, SimdDouble a0, SimdDouble a1, SimdDouble a2) |
Add the two halves of three SIMD doubles, subtract the sum from three half-SIMD-width consecutive doubles in memory. More... | |
template<int align> | |
static void gmx_simdcall | gmx::gatherLoadTransposeHsimd (const double *base0, const double *base1, std::int32_t offset[], SimdDouble *v0, SimdDouble *v1) |
Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH/2 offsets, transpose into SIMD double (low half from base0, high from base1). More... | |
static double gmx_simdcall | gmx::reduceIncr4ReturnSumHsimd (double *m, SimdDouble v0, SimdDouble v1) |
Reduce the 4 half-SIMD-with doubles in 2 SIMD variables (sum halves), increment four consecutive doubles in memory, return sum. More... | |
static SimdDouble gmx_simdcall | gmx::loadUNDuplicate4 (const double *m) |
Load N doubles and duplicate them 4 times each. More... | |
static SimdDouble gmx_simdcall | gmx::load4DuplicateN (const double *m) |
Load 4 doubles and duplicate them N times each. More... | |
static SimdDouble gmx_simdcall | gmx::loadU4NOffset (const double *m, int offset) |
Load doubles in blocks of 4 at fixed offsets. More... | |