Fix deviceID selection for GPUs.

This commit is contained in:
Craig Warren
2020-03-10 17:28:58 +00:00
parent 842fd3bc12
commit 65c463e7a9
3 changed files with 53 additions and 28 deletions

View File

@@ -31,7 +31,7 @@ from scipy.constants import epsilon_0 as e0
from scipy.constants import mu_0 as m0
from .exceptions import GeneralError
from .utilities import detect_check_gpus
from .utilities import detect_gpus
from .utilities import get_host_info
from .utilities import get_terminal_width
@@ -71,8 +71,15 @@ class ModelConfig:
# N.B. This will happen if the requested snapshots are too large to fit
# on the memory of the GPU. If True this will slow performance significantly
if sim_config.general['cuda']:
gpu = sim_config.set_model_gpu()
self.cuda = {'gpu': gpu,
# If a list of lists of GPU deviceIDs is found, flatten it
if any(isinstance(element, list) for element in sim_config.args.gpu):
deviceID = [val for sublist in sim_config.args.gpu for val in sublist]
# If no deviceID is given, default to deviceID 0. Otherwise, whether a single
# deviceID or a list of deviceIDs is given, use the first one.
deviceID = 0 if not deviceID else deviceID[0]
self.cuda = {'gpu': sim_config.set_model_gpu(deviceID),
'snapsgpu2cpu': False}
# Total memory usage for all grids in the model. Starts with 50MB overhead.
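For illustration, a minimal standalone sketch of the flatten-and-pick-first behaviour above (the select_device_id helper and args_gpu name are hypothetical stand-ins for sim_config.args.gpu; the hunk as shown only assigns deviceID inside the nested-list branch, so the sketch assumes a flat list is used directly otherwise):

# Sketch only: mirrors the deviceID selection logic in ModelConfig above.
def select_device_id(args_gpu):
    device_ids = args_gpu
    # If a list of lists of GPU deviceIDs is found, flatten it
    if any(isinstance(element, list) for element in args_gpu):
        device_ids = [val for sublist in args_gpu for val in sublist]
    # If no deviceID is given default to deviceID 0, else use the first one
    return 0 if not device_ids else device_ids[0]

assert select_device_id([]) == 0             # no deviceID given
assert select_device_id([2]) == 2            # single deviceID
assert select_device_id([[1, 3], [0]]) == 1  # list of lists, flattened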
@@ -221,20 +228,12 @@ class SimulationConfig:
# provides best performance.
self.general['precision'] = 'single'
self.cuda = {'gpus': [], # gpus: list of GPU objects
'gpus_str': [], # gpus_str: list of strings describing GPU(s)
'nvcc_opts': None} # nvcc_opts: nvcc compiler options
# Suppress nvcc warnings on Microsoft Windows
if sys.platform == 'win32': self.cuda['nvcc_opts'] = ['-w']
# Flatten a list of lists
if any(isinstance(element, list) for element in self.args.gpu):
self.args.gpu = [val for sublist in self.args.gpu for val in sublist]
# If no deviceID is given default to 0
if not self.args.gpu:
self.args.gpu = [0]
self.cuda['gpus'] = detect_check_gpus(self.args.gpu)
# List of GPU objects of available GPUs
self.cuda['gpus'] = detect_gpus()
# Subgrid parameter may not exist if user enters via CLI
try:
@@ -259,14 +258,25 @@ class SimulationConfig:
self._set_model_start_end()
self._set_single_model()
def set_model_gpu(self, deviceID=0):
"""Specify GPU object for model. Defaults to first GPU deviceID in
list of deviceID given.
def set_model_gpu(self, deviceID):
"""Specify GPU object for model.
Args:
deviceID (int): Requested deviceID of GPU
Returns:
gpu (GPU object): Requested GPU object.
"""
found = False
for gpu in self.cuda['gpus']:
if gpu.deviceID == deviceID:
found = True
return gpu
if not found:
raise GeneralError(f'GPU with device ID {deviceID} does not exist')
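As an aside, a minimal standalone sketch of the lookup set_model_gpu now performs (class and function names here are illustrative stand-ins, not gprMax API; note the found flag in the hunk is never consulted before the early return, so an equivalent lookup can drop it):

# Sketch of an equivalent deviceID lookup; FakeGPU stands in for the GPU class.
class FakeGPU:
    def __init__(self, deviceID):
        self.deviceID = deviceID

def find_gpu(gpus, deviceID):
    for gpu in gpus:
        if gpu.deviceID == deviceID:
            return gpu
    # Reaching here means no detected GPU matched the requested deviceID
    raise ValueError(f'GPU with device ID {deviceID} does not exist')

gpus = [FakeGPU(0), FakeGPU(1)]
assert find_gpu(gpus, deviceID=1).deviceID == 1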
def _set_precision(self):
"""Data type (precision) for electromagnetic field output.

View File

@@ -18,10 +18,11 @@
import datetime
import logging
import time
import sys
import gprMax.config as config
from ._version import __version__, codename
from .exceptions import GeneralError
from .model_build_run import ModelBuildRun
from .solvers import create_solver
from .solvers import create_G
@@ -144,19 +145,22 @@ class MPIContext(Context):
self.print_host_info()
if config.sim_config.general['cuda']:
self.print_gpu_info()
time.sleep(0.1)
sys.stdout.flush()
# Construct MPIExecutor
executor = self.MPIExecutor(self._run_model, comm=self.comm)
# Check GPU resources versus number of MPI tasks
if executor.is_master():
if config.sim_config.general['cuda']:
if executor.size - 1 > len(config.sim_config.cuda['gpus']):
raise GeneralError(f'Not enough GPU resources for number of MPI tasks requested. Number of MPI tasks should be equal to number of GPUs + 1.')
# Create job list
jobs = []
for i in self.model_range:
jobs.append({'i': i})
# Send the workers to their work loop
executor.start()
if executor.is_master():
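For context, a small standalone sketch of the master-rank resource check added above (the function name and plain RuntimeError are illustrative; in gprMax the check uses executor.size, the detected GPU list, and raises GeneralError):

# Sketch: one master rank plus one worker per model, each worker needing a GPU,
# so the number of workers (size - 1) must not exceed the number of GPUs.
def check_mpi_gpu_resources(size, num_gpus):
    if size - 1 > num_gpus:
        raise RuntimeError('Not enough GPU resources for the number of MPI tasks '
                           'requested. MPI tasks should equal number of GPUs + 1.')

check_mpi_gpu_resources(size=3, num_gpus=2)    # 2 workers, 2 GPUs: fine
# check_mpi_gpu_resources(size=4, num_gpus=2)  # 3 workers, 2 GPUs: would raise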

View File

@@ -503,15 +503,11 @@ class GPU:
self.totalmem = drv.Device(self.deviceID).total_memory()
def detect_check_gpus(deviceIDs):
def detect_gpus():
"""Get information about Nvidia GPU(s).
Args:
deviceIDs (list): List of integers of device IDs.
Returns:
gpus (list): Detected GPU(s) object(s).
gpus_str (list): Printable strings of information on GPU(s).
"""
try:
@@ -529,6 +525,23 @@ def detect_check_gpus(deviceIDs):
else:
deviceIDsavail = range(drv.Device.count())
# Gather information about selected/detected GPUs
gpus = []
for ID in deviceIDsavail:
gpu = GPU(deviceID=ID)
gpu.get_gpu_info(drv)
gpus.append(gpu)
return gpus
def check_gpus(gpus):
"""Check if requested Nvidia GPU(s) deviceID(s) exist.
Args:
gpus (list): List of GPU object(s).
"""
# Check if requested device ID(s) exist
for ID in deviceIDs:
if ID not in deviceIDsavail:
@@ -541,8 +554,6 @@ def detect_check_gpus(deviceIDs):
gpu.get_gpu_info(drv)
gpus.append(gpu)
return gpus
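To round off, a hedged sketch of how the new detect/check split could be used together (the PyCUDA-backed GPU class is replaced by a minimal stand-in, and the exact body of check_gpus is not fully shown in the hunk, so this only illustrates the intent):

# Sketch of the detect-then-check pattern: detection returns every GPU on the
# host, and a separate check verifies the requested deviceIDs are among them.
class GPUStub:
    def __init__(self, deviceID):
        self.deviceID = deviceID

def detect_gpus_sketch():
    # In gprMax this queries pycuda.driver for each device on the host
    return [GPUStub(deviceID=i) for i in range(2)]  # pretend two GPUs exist

def check_gpus_sketch(gpus, requested_ids):
    available = {gpu.deviceID for gpu in gpus}
    for ID in requested_ids:
        if ID not in available:
            raise ValueError(f'GPU with device ID {ID} does not exist')

check_gpus_sketch(detect_gpus_sketch(), requested_ids=[0, 1])  # passes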
def timer():
"""Function to return time in fractional seconds."""