Gromacs
2024.4
|
#include "gmxpre.h"
#include "pme_load_balancing.h"
#include <cassert>
#include <cmath>
#include <algorithm>
#include "gromacs/domdec/dlb.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_network.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/fft/calcgrid.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/dispersioncorrection.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/enumerationhelpers.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "pme_internal.h"
#include "pme_pp.h"
This file contains function definitions necessary for managing automatic load balance of PME calculations (Coulomb and LJ).
Classes | |
struct | pme_setup_t |
Parameters and settings for one PP-PME setup. More... | |
Enumerations | |
enum | PmeLoadBalancingLimit : int { No, Box, DD, PmeGrid, MaxScaling, Count } |
Enumeration whose values describe the effect limiting the load balancing. | |
Functions | |
static const char * | enumValueToString (PmeLoadBalancingLimit enumValue) |
Descriptive strings for PmeLoadBalancingLimit enumValue . | |
bool | pme_loadbal_is_active (const pme_load_balancing_t *pme_lb) |
Return whether PME load balancing is active. | |
void | pme_loadbal_init (pme_load_balancing_t **pme_lb_p, t_commrec *cr, const gmx::MDLogger &mdlog, const t_inputrec &ir, const matrix box, const interaction_const_t &ic, const nonbonded_verlet_t &nbv, gmx_pme_t *pmedata, gmx_bool bUseGPU) |
Initialize the PP-PME load balacing data and infrastructure. More... | |
static gmx_bool | pme_loadbal_increase_cutoff (pme_load_balancing_t *pme_lb, int pme_order, const gmx_domdec_t *dd) |
Try to increase the cutoff during load balancing. | |
static void | print_grid (FILE *fp_err, FILE *fp_log, const char *pre, const char *desc, const pme_setup_t *set, double cycles) |
Print the PME grid. | |
static int | pme_loadbal_end (pme_load_balancing_t *pme_lb) |
Return the index of the last setup used in PME load balancing. | |
static void | print_loadbal_limited (FILE *fp_err, FILE *fp_log, int64_t step, pme_load_balancing_t *pme_lb) |
Print descriptive string about what limits PME load balancing. | |
static void | switch_to_stage1 (pme_load_balancing_t *pme_lb) |
Switch load balancing to stage 1. More... | |
static void | pme_load_balance (pme_load_balancing_t *pme_lb, t_commrec *cr, FILE *fp_err, FILE *fp_log, const gmx::MDLogger &mdlog, const t_inputrec &ir, const matrix box, gmx::ArrayRef< const gmx::RVec > x, double cycles, interaction_const_t *ic, struct nonbonded_verlet_t *nbv, struct gmx_pme_t **pmedata, int64_t step) |
Process the timings and try to adjust the PME grid and Coulomb cut-off. More... | |
static void | continue_pme_loadbal (pme_load_balancing_t *pme_lb, gmx_bool bDlbUnlocked) |
Prepare for another round of PME load balancing. More... | |
void | pme_loadbal_do (pme_load_balancing_t *pme_lb, t_commrec *cr, FILE *fp_err, FILE *fp_log, const gmx::MDLogger &mdlog, const t_inputrec &ir, t_forcerec *fr, const matrix box, gmx::ArrayRef< const gmx::RVec > x, gmx_wallcycle *wcycle, int64_t step, int64_t step_rel, gmx_bool *bPrinting, bool useGpuPmePpCommunication) |
Process cycles and PME load balance when necessary. More... | |
static int | pme_grid_points (const pme_setup_t *setup) |
Return product of the number of PME grid points in each dimension. | |
static void | print_pme_loadbal_setting (FILE *fplog, const char *name, const pme_setup_t *setup) |
Print one load-balancing setting. | |
static void | print_pme_loadbal_settings (pme_load_balancing_t *pme_lb, FILE *fplog, const gmx::MDLogger &mdlog, gmx_bool bNonBondedOnGPU) |
Print all load-balancing settings. | |
void | pme_loadbal_done (pme_load_balancing_t *pme_lb, FILE *fplog, const gmx::MDLogger &mdlog, gmx_bool bNonBondedOnGPU) |
Finish the PME load balancing and print the settings when fplog!=NULL. | |
Variables | |
const int | PMETunePeriod = 50 |
After 50 nstlist periods of not observing imbalance: never tune PME. | |
const real | loadBalanceTriggerFactor = 1.05 |
Trigger PME load balancing at more than 5% PME overload. | |
const real | c_maxSpacingScaling = 1.7 |
Scale the grid by a most at factor 1.7. More... | |
const real | gridpointsScaleFactor = 0.8 |
In the initial scan, step by grids that are at least a factor 0.8 coarser. | |
const real | relativeEfficiencyFactor = 1.05 |
In the initial scan, try to skip grids with uneven x/y/z spacing, checking if the "efficiency" is more than 5% worse than the previous grid. | |
const real | maxRelativeSlowdownAccepted = 1.12 |
Rerun until a run is 12% slower setups than the fastest run so far. | |
const real | maxFluctuationAccepted = 1.02 |
If setups get more than 2% faster, do another round to avoid choosing a slower setup due to acceleration or fluctuations. | |
const int | c_numFirstTuningIntervalSkip = 5 |
Number of nstlist long tuning intervals to skip before starting. | |
const int | c_numFirstTuningIntervalSkipWithSepPme = 3 |
Number of nstlist long tuning intervals to skip before starting. | |
const int | c_numPostSwitchTuningIntervalSkip = 1 |
Number of nstlist long tuning intervals to skip after switching to a new setting. | |
const double | c_startupTimeDelay = 5.0 |
Number of seconds to delay the tuning at startup to allow processors clocks to ramp up. | |
|
static |
Prepare for another round of PME load balancing.
[in,out] | pme_lb | Pointer to PME load balancing struct |
[in] | bDlbUnlocked | TRUE is DLB was locked and is now unlocked |
If the conditions (e.g. DLB off/on, CPU/GPU throttling etc.) changed, the PP/PME balance might change and re-balancing can improve performance. This function adds 2 stages and adjusts the considered setup range.
|
static |
Process the timings and try to adjust the PME grid and Coulomb cut-off.
The adjustment is done to generate a different non-bonded PP and PME load. With separate PME ranks (PP and PME on different processes) or with a GPU (PP on GPU, PME on CPU), PP and PME run on different resources and changing the load will affect the load balance and performance. The total time for a set of integration steps is monitored and a range of grid/cut-off setups is scanned. After calling pme_load_balance many times and acquiring enough statistics, the best performing setup is chosen. Here we try to take into account fluctuations and changes due to external factors as well as DD load balancing.
void pme_loadbal_do | ( | pme_load_balancing_t * | pme_lb, |
struct t_commrec * | cr, | ||
FILE * | fp_err, | ||
FILE * | fp_log, | ||
const gmx::MDLogger & | mdlog, | ||
const t_inputrec & | ir, | ||
t_forcerec * | fr, | ||
const matrix | box, | ||
gmx::ArrayRef< const gmx::RVec > | x, | ||
gmx_wallcycle * | wcycle, | ||
int64_t | step, | ||
int64_t | step_rel, | ||
gmx_bool * | bPrinting, | ||
bool | useGpuPmePpCommunication | ||
) |
Process cycles and PME load balance when necessary.
Process the cycles measured over the last nstlist steps and then either continue balancing or check if we need to trigger balancing. Should be called after the WallCycleCounter::Step cycle counter has been stopped. Returns if the load balancing is printing to fp_err.
void pme_loadbal_init | ( | pme_load_balancing_t ** | pme_lb_p, |
t_commrec * | cr, | ||
const gmx::MDLogger & | mdlog, | ||
const t_inputrec & | ir, | ||
const matrix | box, | ||
const interaction_const_t & | ic, | ||
const nonbonded_verlet_t & | nbv, | ||
gmx_pme_t * | pmedata, | ||
gmx_bool | bUseGPU | ||
) |
Initialize the PP-PME load balacing data and infrastructure.
Initialize the PP-PME load balacing data and infrastructure. The actual load balancing might start right away, later or never. The PME grid in pmedata is reused for smaller grids to lower the memory usage.
|
static |
Switch load balancing to stage 1.
In this stage, only reasonably fast setups are run again.
const real c_maxSpacingScaling = 1.7 |
Scale the grid by a most at factor 1.7.
This still leaves room for about 4-4.5x decrease in grid spacing while limiting the cases where large imbalance leads to extreme cutoff scaling for marginal benefits.
This should help to avoid: