README.md: 12 changes (6 additions, 6 deletions)
@@ -145,13 +145,13 @@ somd2 --help | grep -A2 ' --gcmc'
```

> [!NOTE]
> GCMC is currently only supported when using the CUDA platform and isn't
> available on macOS, where the `pycuda` package is not available.
> GCMC is only supported when using the CUDA or OpenCL platforms.

Make sure that `nvcc` is in your `PATH`. If you require a different `nvcc` to that
provided by conda, you can set the `PYCUDA_NVCC` environment variable to point
to the desired `nvcc` binary. Depending on your setup, you may also need to install
the `cuda-nvvm` package from `conda-forge`.
When using the CUDA platform, make sure that `nvcc` is in your `PATH`. If you
require a different `nvcc` to that provided by conda, you can set the
`PYCUDA_NVCC` environment variable to point to the desired `nvcc` binary.
Depending on your setup, you may also need to install the `cuda-nvvm` package
from `conda-forge`.

## Analysis

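The note above is the only setup step that differs between platforms: on CUDA, GCMC relies on `pycuda`, which needs an `nvcc` binary it can find. As a quick sanity check, a minimal sketch (not part of this change; `find_nvcc` is a hypothetical helper) can resolve `nvcc` the same way the README describes, preferring the `PYCUDA_NVCC` override and falling back to the `PATH`:

```python
# Illustrative check only: confirms an nvcc binary is discoverable, either via
# the PYCUDA_NVCC override described above or on the PATH. find_nvcc() is a
# hypothetical helper, not part of somd2.
import os
import shutil


def find_nvcc():
    """Return the path to the nvcc binary that the CUDA setup will use."""
    override = os.environ.get("PYCUDA_NVCC")
    if override and os.path.isfile(override) and os.access(override, os.X_OK):
        return override
    return shutil.which("nvcc")


if __name__ == "__main__":
    nvcc = find_nvcc()
    if nvcc is None:
        raise SystemExit("nvcc not found: add it to your PATH or set PYCUDA_NVCC")
    print(f"Using nvcc at {nvcc}")
```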
src/somd2/runner/_base.py: 5 changes (3 additions, 2 deletions)
@@ -535,8 +535,8 @@ def __init__(self, system, config):

# GCMC specific validation.
if self._config.gcmc:
if self._config.platform != "cuda":
msg = "GCMC simulations require the CUDA platform."
if self._config.platform not in ["cuda", "opencl"]:
msg = "GCMC simulations require the CUDA or OpenCL platform."
_logger.error(msg)
raise ValueError(msg)

@@ -747,6 +747,7 @@ def __init__(self, system, config):
"tolerance": self._config.gcmc_tolerance,
"restart": self._is_restart,
"overwrite": self._config.overwrite,
"platform": config.platform,
"no_logger": True,
}
else:
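The change in `_base.py` follows a simple pattern: validate the requested platform up front, then forward it to the GCMC sampler through its keyword arguments. A stripped-down sketch of that pattern is shown below; `Config` and `make_gcmc_kwargs` are illustrative stand-ins, not somd2 classes:

```python
# Minimal sketch of the validate-then-forward pattern above.
# "Config" and "make_gcmc_kwargs" are illustrative stand-ins, not somd2 API.
from dataclasses import dataclass

SUPPORTED_GCMC_PLATFORMS = ("cuda", "opencl")


@dataclass
class Config:
    gcmc: bool
    platform: str
    overwrite: bool = False


def make_gcmc_kwargs(config, is_restart):
    """Validate the platform early, then build the sampler keyword arguments."""
    if config.platform not in SUPPORTED_GCMC_PLATFORMS:
        raise ValueError("GCMC simulations require the CUDA or OpenCL platform.")
    return {
        "restart": is_restart,
        "overwrite": config.overwrite,
        # Forwarding the platform lets the sampler pick the matching backend.
        "platform": config.platform,
    }


# Example: make_gcmc_kwargs(Config(gcmc=True, platform="opencl"), is_restart=False)
```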
src/somd2/runner/_repex.py: 108 changes (81 additions, 27 deletions)
@@ -329,15 +329,15 @@ def _create_dynamics(
# Work out the memory used by this dynamics object and GCMC sampler.
mem_used = used_mem - used_mem_before

# Work out the estimate for all replicas on this device.
est_total = mem_used * contexts_per_device[device]
# Work out the estimated total after all replicas have been created.
est_total = mem_used * contexts_per_device[device] + used_mem_before

# If this exceeds the total memory, raise an error.
if est_total > total_mem:
msg = (
f"Not enough memory on device {device} for all assigned replicas. "
f"Estimated memory usage: {est_total / 1e9:.2f} GB, "
f"Available memory: {total_mem / 1e9:.2f} GB."
f"Estimated memory usage: {est_total / (1024**3):.2f} GB, "
f"Available memory: {total_mem / (1024**3):.2f} GB."
)
_logger.error(msg)
raise MemoryError(msg)
@@ -347,8 +347,15 @@
_logger.warning(
f"Device {device} will have less than 20% free memory "
f"after creating all assigned replicas. "
f"{est_total / 1e9:.2f} GB, "
f"Available memory: {total_mem / 1e9:.2f} GB."
f"{est_total / (1024**3):.2f} GB, "
f"Available memory: {total_mem / (1024**3):.2f} GB."
)

else:
_logger.info(
f"Estimated memory usage on device {device} after creating all replicas: "
f"{est_total / (1024**3):.2f} GB, "
f"Available memory: {total_mem / (1024**3):.2f} GB."
)

_logger.info(
@@ -515,34 +522,78 @@ def get_swaps(self):
return self._num_swaps

@staticmethod
def _check_device_memory(index):
def _check_device_memory(device_index=0):
"""
Check the memory usage of the specified CUDA device.
Check the memory usage of the specified GPU device.

Parameters
----------

index: int
The index of the CUDA device.
The index of the GPU device.
"""
try:
from pynvml import (
nvmlInit,
nvmlShutdown,
nvmlDeviceGetHandleByIndex,
nvmlDeviceGetMemoryInfo,
)
import pyopencl as cl

nvmlInit()
handle = nvmlDeviceGetHandleByIndex(index)
info = nvmlDeviceGetMemoryInfo(handle)
result = (info.used, info.free, info.total)
nvmlShutdown()
except Exception as e:
msg = f"Could not determine memory usage for device {index}: {e}"
# Get the device.
platforms = cl.get_platforms()
all_devices = []
for platform in platforms:
try:
devices = platform.get_devices(device_type=cl.device_type.GPU)
all_devices.extend(devices)
except:
continue

if device_index >= len(all_devices):
msg = f"Device index {device_index} out of range. Found {len(all_devices)} GPU(s)."
_logger.error(msg)
raise IndexError(msg)

return result
device = all_devices[device_index]
total = device.global_mem_size

# NVIDIA: Use pynvml
if "NVIDIA" in device.vendor:
try:
import pynvml

pynvml.nvmlInit()

# Find matching device by name
device_count = pynvml.nvmlDeviceGetCount()
for i in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
name = pynvml.nvmlDeviceGetName(handle)

if name in device.name or device.name in name:
memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
pynvml.nvmlShutdown()
return (memory.used, memory.free, memory.total)

pynvml.nvmlShutdown()
except Exception as e:
msg = f"Could not get NVIDIA GPU memory info for device {device_index}: {e}"
_logger.error(msg)
raise RuntimeError(msg) from e

# AMD: Use OpenCL extension
elif "AMD" in device.vendor or "Advanced Micro Devices" in device.vendor:
try:
free_memory_info = device.get_info(0x4038)
free_kb = (
free_memory_info[0]
if isinstance(free_memory_info, list)
else free_memory_info
)
free = free_kb * 1024
used = total - free
return (used, free, total)
except Exception as e:
msg = (
f"Could not get AMD GPU memory info for device {device_index}: {e}"
)
_logger.error(msg)
raise RuntimeError(msg) from e


class RepexRunner(_RunnerBase):
@@ -582,9 +633,12 @@ def __init__(self, system, config):
# Call the base class constructor.
super().__init__(system, config)

# Make sure we're using the CUDA platform.
if self._config.platform != "cuda":
msg = "Currently replica exchange simulations can only be run on the CUDA platform."
# Make sure we're using the CUDA or OpenCL platform.
if self._config.platform not in ["cuda", "opencl"]:
msg = (
"Currently replica exchange simulations can only be "
"run on the CUDA and OpenCL platforms."
)
_logger.error(msg)
raise ValueError(msg)

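The `_repex.py` changes come down to two pieces of bookkeeping: query how much memory a GPU device actually has (via `pynvml` for NVIDIA, or the OpenCL free-memory extension for AMD), and extrapolate the footprint of one replica to every replica assigned to that device, now including the baseline usage measured before the first replica was created. The sketch below reproduces that arithmetic for the NVIDIA path only; the helper names are illustrative, while the `pynvml` calls mirror those used in the diff.

```python
# Illustrative sketch of the memory estimate used above (NVIDIA path only).
# Helper names are placeholders; only the pynvml calls are real API.
import pynvml


def device_memory(device_index=0):
    """Return (used, free, total) bytes for a GPU device via NVML."""
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return info.used, info.free, info.total
    finally:
        pynvml.nvmlShutdown()


def check_headroom(used_before, used_after, num_replicas, total_mem):
    """Extrapolate one replica's footprint to all replicas on the device."""
    per_replica = used_after - used_before
    # Estimated total once every replica exists, including the baseline usage
    # that was already on the device before the first replica was created.
    est_total = per_replica * num_replicas + used_before
    gib = 1024**3
    if est_total > total_mem:
        raise MemoryError(
            f"Estimated {est_total / gib:.2f} GiB exceeds {total_mem / gib:.2f} GiB."
        )
    if est_total > 0.8 * total_mem:
        print(f"Warning: under 20% free memory ({est_total / gib:.2f} GiB estimated).")


# Usage (values assumed): measure before and after creating one replica.
# used_before, _, total_mem = device_memory(0)
# ... create one dynamics object / GCMC sampler ...
# used_after, _, _ = device_memory(0)
# check_headroom(used_before, used_after, num_replicas=8, total_mem=total_mem)
```

Adding the baseline term is what the first hunk above changes: without `used_mem_before`, a device that is already partly occupied would appear to have more headroom than it really does.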