Fix out-of-memory errors for larger ReFrame tests

这个提交包含在:
nmannall
2024-01-23 14:35:56 +00:00
父节点 753a38dab4
当前提交 7c7322b557
共有 3 个文件被更改,包括 14 次插入和 12 次删除

查看文件

@@ -42,13 +42,20 @@ class GprmaxBaseTest(rfm.RunOnlyRegressionTest):
valid_systems = ["archer2:compute"]
valid_prog_environs = ["PrgEnv-cray"]
executable = "time -p python -m gprMax --log-level 25"
postrun_cmds = [
"sacct --format=JobID,State,Submit,Start,End,Elapsed,NodeList,ReqMem,MaxRSS,MaxVMSize --units=M -j $SLURM_JOBID"
]
exclusive_access = True
@run_after("init")
def setup_omp(self):
def setup_env_vars(self):
"""Set OMP_NUM_THREADS environment variable from num_cpus_per_task"""
self.env_vars["OMP_NUM_THREADS"] = self.num_cpus_per_task
# Avoid inheriting slurm memory environment variables from any previous slurm job (i.e. the reframe job)
self.prerun_cmds.append("unset SLURM_MEM_PER_NODE")
self.prerun_cmds.append("unset SLURM_MEM_PER_CPU")
@run_after("init")
def inject_dependencies(self):
"""Test depends on the Python virtual environment building correctly"""

查看文件

@@ -2,10 +2,8 @@
#SBATCH --job-name=gprMax-benchmarks
#SBATCH --time=24:0:0
#SBATCH --ntasks=1
#SBATCH --mem=4G
#SBATCH --partition=serial
#SBATCH --qos=serial
#SBATCH --qos=serial
#SBATCH --output=output/archer2/rfm_bench_%J.out
# Set the number of threads to 1
@@ -17,4 +15,4 @@ source ../.venv/bin/activate
reframe -C configuration/archer2_settings.py -c reframe_benchmarks.py -c base_tests.py -r --performance-report
sacct --format=JobID,State,Submit,Start,End,Elapsed,NodeList --units=M -j $SLURM_JOBID
sacct --format=JobID,State,Submit,Start,End,Elapsed,NodeList,ReqMem --units=M -j $SLURM_JOBID

查看文件

@@ -17,20 +17,17 @@ class SingleNodeBenchmark(GprmaxBaseTest):
num_tasks = 1
omp_threads = parameter([1, 2, 4, 8, 16, 32, 64, 128])
domain = parameter([0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
cpu_freq = parameter([2250000])
cpu_freq = parameter([2000000, 2250000])
time_limit = "4h"
@run_after("init")
def setup_omp(self):
def setup_env_vars(self):
self.num_cpus_per_task = self.omp_threads
super().setup_omp()
self.env_vars["SLURM_CPU_FREQ_REQ"] = self.cpu_freq
super().setup_env_vars()
@run_after("init")
def create_model_file(self):
def set_model_file(self):
input_file = f"benchmark_model_{self.domain}.in"
self.executable_opts = [input_file]
self.keep_files = [input_file]
@run_after("init")
def set_cpu_freq(self):
self.env_vars["SLURM_CPU_FREQ_REQ"] = self.cpu_freq