Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 46 additions & 7 deletions charts/weka-operator/resources/weka_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import sys
import threading
import time
from dataclasses import dataclass
from dataclasses import dataclass, asdict
from functools import lru_cache, partial
from os.path import exists
from textwrap import dedent
Expand All @@ -30,6 +30,7 @@ class SignOptions:
class Disk:
path: str
is_mounted: bool
serial_id: Optional[str]


MODE = os.environ.get("MODE")
Expand Down Expand Up @@ -111,6 +112,36 @@ class Disk:
)


async def get_serial_id_fallback(device_path: str) -> Optional[str]:
"""
Fallback method to get serial ID for a device using udev data.
This is useful for non-nvme devices where lsblk might not report a serial.
"""
device_name = os.path.basename(device_path)
logging.info(f"Attempting fallback to get serial for {device_name}")
try:
# Get major:minor device number
dev_index_out, _, ec = await run_command(f"cat /sys/block/{device_name}/dev")
if ec != 0:
logging.warning(f"Fallback failed: could not get dev index for {device_name}")
return None
dev_index = dev_index_out.decode().strip()

# Get serial from udev data
serial_id_cmd = f"cat /host/run/udev/data/b{dev_index} | grep ID_SERIAL="
serial_id_out, _, ec = await run_command(serial_id_cmd)
if ec != 0:
logging.warning(f"Fallback failed: could not get ID_SERIAL from udev for {device_name}")
return None

serial_id = serial_id_out.decode().strip().split("=")[-1]
logging.info(f"Fallback successful for {device_name}, found serial: {serial_id}")
return serial_id
except Exception as e:
logging.error(f"Exception during serial ID fallback for {device_name}: {e}")
return None


async def sign_drives_by_pci_info(vendor_id: str, device_id: str, options: dict) -> List[str]:
logging.info("Signing drives. Vendor ID: %s, Device ID: %s", vendor_id, device_id)

Expand Down Expand Up @@ -142,7 +173,7 @@ async def find_disks() -> List[Disk]:
"""
logging.info("Finding disks and checking mount status")
# Use -J for JSON output, -p for full paths, -o to specify columns
cmd = "nsenter --mount --pid --target 1 -- lsblk -p -J -o NAME,TYPE,MOUNTPOINT"
cmd = "nsenter --mount --pid --target 1 -- lsblk -p -J -o NAME,TYPE,MOUNTPOINT,SERIAL"
stdout, stderr, ec = await run_command(cmd, capture_stdout=True)
if ec != 0:
logging.error(f"Failed to execute lsblk: {stderr.decode()}")
Expand All @@ -169,8 +200,13 @@ def has_mountpoint(device_info: dict) -> bool:
for device in data.get("blockdevices", []):
if device.get("type") == "disk":
is_mounted = has_mountpoint(device)
logging.info(f"Found disk: {device['name']}, mounted: {is_mounted}")
disks.append(Disk(path=device["name"], is_mounted=is_mounted))
serial_id = device.get("serial")
device_path = device["name"]
if not serial_id:
logging.warning(f"lsblk did not return serial for {device_path}. Using fallback.")
serial_id = await get_serial_id_fallback(device_path)
logging.info(f"Found disk: {device_path}, mounted: {is_mounted}, serial: {serial_id}")
disks.append(Disk(path=device_path, is_mounted=is_mounted, serial_id=serial_id))

return disks

Expand Down Expand Up @@ -297,10 +333,11 @@ async def get_block_device_path_by_serial(serial: str):

async def discover_drives():
drives = await find_weka_drives()
raw_disks = await find_disks()
write_results(dict(
err=None,
drives=drives,
raw_drives=await find_disks(),
raw_drives=[asdict(d) for d in raw_disks],
))


Expand Down Expand Up @@ -1466,8 +1503,8 @@ async def configure_agent(agent_handle_drivers=False):
#TODO: once moving to 4.3+ only switch to ignore_driver_spec. Problem that 4.2 had it in different category
# and check by skip_driver_install is sort of abuse of not anymore existing flag to have something to validate by
if ! grep -q "skip_driver_install" /etc/wekaio/service.conf; then
sed -i "/\[os\]/a skip_driver_install={ignore_driver_flag}" /etc/wekaio/service.conf
sed -i "/\[os\]/a ignore_driver_spec={ignore_driver_flag}" /etc/wekaio/service.conf
sed -i "/\\[os\\]/a skip_driver_install={ignore_driver_flag}" /etc/wekaio/service.conf
sed -i "/\\[os\\]/a ignore_driver_spec={ignore_driver_flag}" /etc/wekaio/service.conf
else
sed -i "s/skip_driver_install=.*/skip_driver_install={ignore_driver_flag}/g" /etc/wekaio/service.conf
fi
Expand Down Expand Up @@ -2318,6 +2355,8 @@ async def main():
await asyncio.sleep(3) # a hack to give kernel a chance to update paths, as it's not instant
await discover_drives()
elif instruction.get('type') and instruction['type'] == 'debug':
# TODO: Wrap this as conditional based on payload, as might fail in some cases
print(await discover_drives())
pass # nothing to do, but runtime will wait for termination signal
# TODO: Should we support generic command proxy? security concern?
elif instruction.get('type') and instruction['type'] == 'umount':
Expand Down
Loading