Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 88 additions & 21 deletions python/tune_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# limitations under the License.

import argparse
import functools
import html
import logging
import os
Expand Down Expand Up @@ -43,6 +44,7 @@
32.0: 3.938,
64.0: 7.563,
}
BAR1_BLACKWELL_MIN_MIB = 32768 # 32 GiB


@dataclass
Expand Down Expand Up @@ -193,6 +195,16 @@ def is_any_integrated_gpu():
return False


def _dmabuf_gpu_path_available():
    """
    Report whether the kernel exposes the dma-buf path used for GPUDirect.

    Recent NVIDIA drivers use this dma-buf path in place of nvidia-peermem.
    The patched DPDK shipped with this repo (dpdk_patches/dmabuf.patch) takes
    this path on platforms that expose it, which is why peermem is not
    required there.

    Returns:
        bool: True if the /dev/dma_heap/system device node exists, else False.
    """
    # Presence of the system dma-heap node is the only signal probed here.
    dma_heap_node = "/dev/dma_heap/system"
    return os.path.exists(dma_heap_node)


def check_peermem_kernel():
"""
Check if the nvidia-peermem module for GPUDirect is loaded in the kernel.
Expand All @@ -214,6 +226,14 @@ def check_peermem_kernel():
"(e.g. GB10 / DGX Spark) where peermem does not apply. Use kind: host_pinned "
"in the daqiri YAML for GPUDirect on this platform."
)
elif _dmabuf_gpu_path_available():
logging.info(
"nvidia-peermem module is not loaded, but /dev/dma_heap/system is "
"available. The patched DPDK shipped with this repo "
"(dpdk_patches/dmabuf.patch) takes the dma-buf GPUDirect path on "
"platforms that expose it and does not need peermem. If you are "
"building DAQIRI against stock DPDK, load nvidia-peermem."
)
else:
logging.warning("nvidia-peermem module is not loaded. GPUDirect may not work.")

Expand Down Expand Up @@ -264,11 +284,16 @@ def check_gpudirect_support():
logging.warning(f"GPU {i}: {name.value.decode()} does not have GPUDirect support.")


@functools.lru_cache(maxsize=1)
def get_nic_info():
"""
Parses the output of `ibdev2netdev -v` to extract and return a list of tuples,
where each tuple contains the interface name and its PCIe address.

Cached with lru_cache so --check all (which calls this from check_mrrs,
check_max_payload_size, and check_mtu_size) only invokes ibdev2netdev once
and only emits the "ibdev2netdev not found" warning once per run.

Returns:
List[Tuple[str, str]]: A list of tuples containing the IF name and PCIe address
"""
Expand All @@ -288,16 +313,17 @@ def get_nic_info():
return vals

except FileNotFoundError:
print(
"The ibdev2netdev command is not found. Ensure that it is installed and available in your PATH."
logging.warning(
"The ibdev2netdev command is not found (try: apt install infiniband-diags). "
"Skipping NIC-dependent checks (mrrs, mps, mtu)."
)
return [], []
return []
except subprocess.CalledProcessError as e:
print(f"Error while executing ibdev2netdev: {e}")
return [], []
logging.error(f"Error while executing ibdev2netdev: {e}")
return []
except Exception as e:
print(f"An unexpected error occurred: {e}")
return [], []
logging.error(f"Unexpected error while running ibdev2netdev: {e}")
return []
Comment on lines 315 to +326
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 ibdev2netdev warning emitted multiple times under --check all

get_nic_info() now owns the warning, but --check all calls it three times independently — once each from check_mrrs(), check_max_payload_size(), and check_mtu_size() — so a missing ibdev2netdev produces three copies of the same warning in a single run. The PR's stated goal is collapsing redundant output (done for the CPU governor), but this case is left un-collapsed. A caller-side guard (cache the result, or emit the warning only once with a module-level flag) would be consistent with that goal.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!



def get_online_cpus():
Expand Down Expand Up @@ -328,30 +354,50 @@ def get_online_cpus():
def check_cpu_governor():
"""
Checks if the CPU frequency governor is set to 'performance' for all online CPUs.
Aggregates results by governor value and logs one summary line per distinct
governor (e.g. 256/256 online CPUs set to 'performance'), plus separate counts
for CPUs whose scaling_governor file is missing or unreadable.
"""
online_cpus = get_online_cpus()
total = len(online_cpus)

by_governor = defaultdict(list)
missing = []
permission_denied = []

for cpu in online_cpus:
scaling_governor_path = f"/sys/devices/system/cpu/cpu{cpu}/cpufreq/scaling_governor"

try:
with open(scaling_governor_path, "r") as f:
governor = f.read().strip()

if governor == "performance":
logging.info(f"CPU {cpu}: Governor is correctly set to 'performance'.")
else:
logging.warning(f"CPU {cpu}: Governor is set to '{governor}', not 'performance'.")

by_governor[f.read().strip()].append(cpu)
except FileNotFoundError:
logging.error(
f"CPU {cpu}: Scaling governor file not found. This CPU may not support frequency scaling."
)
missing.append(cpu)
except PermissionError:
logging.error(
f"CPU {cpu}: Permission denied while accessing scaling governor file. Run as root."
permission_denied.append(cpu)

for governor, cpus in sorted(by_governor.items()):
if governor == "performance":
logging.info(
f"CPU governor: {len(cpus)}/{total} online CPUs set to 'performance'."
)
else:
logging.warning(
f"CPU governor: {len(cpus)}/{total} online CPUs set to '{governor}', "
"expected 'performance'."
)

if missing:
logging.error(
f"CPU governor: scaling_governor file not found on {len(missing)}/{total} "
"online CPUs. The cpufreq driver may not be loaded (e.g. amd-pstate, "
"intel_pstate, or cppc_cpufreq). Performance scaling cannot be checked."
)
if permission_denied:
logging.error(
f"CPU governor: permission denied reading scaling_governor on "
f"{len(permission_denied)}/{total} online CPUs. Run as root."
)


def check_mrrs():
"""
Expand Down Expand Up @@ -610,6 +656,16 @@ def check_bar1_size():
"There is no resizable BAR1 to enlarge on platforms like GB10 / DGX Spark."
)
return
# On RTX PRO 6000 Blackwell Server Edition (96 GB GDDR7) the generic
# > 1024 MiB threshold passes trivially even with Resizable BAR disabled
# (the card still exposes a multi-GiB BAR1 in some platform configs).
# 32 GiB is the conservative "rebar is fully unlocked" floor: well below
# the 96 GB card capacity but high enough that any platform exposing less
# is almost certainly missing Resizable BAR / Above 4G Decoding in BIOS.
# The threshold is applied per-GPU via gpu_info_by_bdf below so heterogeneous
# boxes (e.g. RTX PRO 6000 + H100) only get the Blackwell rule on the
# Blackwell card.
gpu_info_by_bdf = get_nvidia_gpu_info_by_bdf()
try:
# Run nvidia-smi to get BAR1 memory information
result = subprocess.run(
Expand Down Expand Up @@ -640,7 +696,18 @@ def check_bar1_size():

# Once BAR1 size is found, log it
if current_gpu is not None and bar1_total is not None:
if bar1_total > 1024:
gpu_bdf = normalize_pci_address(current_gpu) or current_gpu
gpu_name = gpu_info_by_bdf.get(gpu_bdf, {}).get("name", "")
gpu_is_blackwell = "Blackwell Server Edition" in gpu_name
if gpu_is_blackwell and bar1_total < BAR1_BLACKWELL_MIN_MIB:
logging.warning(
f"GPU {current_gpu} ({gpu_name}): BAR1 size is {bar1_total} MiB. "
f"Expected at least {BAR1_BLACKWELL_MIN_MIB} MiB "
f"({BAR1_BLACKWELL_MIN_MIB // 1024} GiB) with Resizable BAR fully "
"enabled. Check the system BIOS for the Resizable BAR / Above 4G "
"Decoding settings."
)
elif bar1_total > 1024:
logging.info(f"GPU {current_gpu}: BAR1 size is {bar1_total} MiB.")
else:
logging.warning(
Expand Down