From 65c463e7a9eeea311afcfbeb884e1d6393fa943a Mon Sep 17 00:00:00 2001 From: Craig Warren Date: Tue, 10 Mar 2020 17:28:58 +0000 Subject: [PATCH] Fix deviceID selection for GPUs. --- gprMax/config.py | 42 ++++++++++++++++++++++++++---------------- gprMax/contexts.py | 14 +++++++++----- gprMax/utilities.py | 25 ++++++++++++++++++------- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/gprMax/config.py b/gprMax/config.py index fa968804..31bc69f1 100644 --- a/gprMax/config.py +++ b/gprMax/config.py @@ -31,7 +31,7 @@ from scipy.constants import epsilon_0 as e0 from scipy.constants import mu_0 as m0 from .exceptions import GeneralError -from .utilities import detect_check_gpus +from .utilities import detect_gpus from .utilities import get_host_info from .utilities import get_terminal_width @@ -71,8 +71,15 @@ class ModelConfig: # N.B. This will happen if the requested snapshots are too large to fit # on the memory of the GPU. If True this will slow performance significantly if sim_config.general['cuda']: - gpu = sim_config.set_model_gpu() - self.cuda = {'gpu': gpu, + # If a list of lists of GPU deviceIDs is found, flatten it + if any(isinstance(element, list) for element in sim_config.args.gpu): + deviceID = [val for sublist in sim_config.args.gpu for val in sublist] + + # If no deviceID is given default to using deviceID 0. Else if either + # a single deviceID or list of deviceIDs is given use first one. + deviceID = 0 if not deviceID else deviceID[0] + + self.cuda = {'gpu': sim_config.set_model_gpu(deviceID), 'snapsgpu2cpu': False} # Total memory usage for all grids in the model. Starts with 50MB overhead. @@ -221,20 +228,12 @@ class SimulationConfig: # provides best performance. self.general['precision'] = 'single' self.cuda = {'gpus': [], # gpus: list of GPU objects - 'gpus_str': [], # gpus_str: list of strings describing GPU(s) 'nvcc_opts': None} # nvcc_opts: nvcc compiler options # Suppress nvcc warnings on Microsoft Windows if sys.platform == 'win32': self.cuda['nvcc_opts'] = ['-w'] - # Flatten a list of lists - if any(isinstance(element, list) for element in self.args.gpu): - self.args.gpu = [val for sublist in self.args.gpu for val in sublist] - - # If no deviceID is given default to 0 - if not self.args.gpu: - self.args.gpu = [0] - - self.cuda['gpus'] = detect_check_gpus(self.args.gpu) + # List of GPU objects of available GPUs + self.cuda['gpus'] = detect_gpus() # Subgrid parameter may not exist if user enters via CLI try: @@ -259,14 +258,25 @@ class SimulationConfig: self._set_model_start_end() self._set_single_model() - def set_model_gpu(self, deviceID=0): - """Specify GPU object for model. Defaults to first GPU deviceID in - list of deviceID given. + def set_model_gpu(self, deviceID): + """Specify GPU object for model. + + Args: + deviceID (int): Requested deviceID of GPU + + Returns: + gpu (GPU object): Requested GPU object. """ + + found = False for gpu in self.cuda['gpus']: if gpu.deviceID == deviceID: + found = True return gpu + if not found: + raise GeneralError(f'GPU with device ID {deviceID} does not exist') + def _set_precision(self): """Data type (precision) for electromagnetic field output. diff --git a/gprMax/contexts.py b/gprMax/contexts.py index 75ef5266..94692aa4 100644 --- a/gprMax/contexts.py +++ b/gprMax/contexts.py @@ -18,10 +18,11 @@ import datetime import logging -import time +import sys import gprMax.config as config from ._version import __version__, codename +from .exceptions import GeneralError from .model_build_run import ModelBuildRun from .solvers import create_solver from .solvers import create_G @@ -144,19 +145,22 @@ class MPIContext(Context): self.print_host_info() if config.sim_config.general['cuda']: self.print_gpu_info() - - time.sleep(0.1) + sys.stdout.flush() # Contruct MPIExecutor executor = self.MPIExecutor(self._run_model, comm=self.comm) + # Check GPU resources versus number of MPI tasks + if executor.is_master(): + if config.sim_config.general['cuda']: + if executor.size - 1 > len(config.sim_config.cuda['gpus']): + raise GeneralError(f'Not enough GPU resources for number of MPI tasks requested. Number of MPI tasks should be equal to number of GPUs + 1.') + # Create job list jobs = [] for i in self.model_range: jobs.append({'i': i}) - - # Send the workers to their work loop executor.start() if executor.is_master(): diff --git a/gprMax/utilities.py b/gprMax/utilities.py index 50338751..365f5a5f 100644 --- a/gprMax/utilities.py +++ b/gprMax/utilities.py @@ -503,15 +503,11 @@ class GPU: self.totalmem = drv.Device(self.deviceID).total_memory() -def detect_check_gpus(deviceIDs): +def detect_gpus(): """Get information about Nvidia GPU(s). - Args: - deviceIDs (list): List of integers of device IDs. - Returns: gpus (list): Detected GPU(s) object(s). - gpus_str (list): Printable strings of information on GPU(s). """ try: @@ -529,6 +525,23 @@ def detect_check_gpus(deviceIDs): else: deviceIDsavail = range(drv.Device.count()) + # Gather information about selected/detected GPUs + gpus = [] + for ID in deviceIDsavail: + gpu = GPU(deviceID=ID) + gpu.get_gpu_info(drv) + gpus.append(gpu) + + return gpus + + +def check_gpus(gpus): + """Check if requested Nvidia GPU(s) deviceID(s) exist. + + Args: + gpus (list): List of GPU object(s). + """ + # Check if requested device ID(s) exist for ID in deviceIDs: if ID not in deviceIDsavail: @@ -541,8 +554,6 @@ def detect_check_gpus(deviceIDs): gpu.get_gpu_info(drv) gpus.append(gpu) - return gpus - def timer(): """Function to return time in fractional seconds."""