From 86d5c635a3a1643160241272ef9e95368ff83bc2 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 30 Jan 2026 11:43:43 -0500 Subject: [PATCH 1/5] Use resources module for input validation --- cmd/api/api/instances.go | 28 +++-- cmd/api/config/config.go | 8 +- cmd/api/main.go | 6 + integration/systemd_test.go | 2 - integration/vgpu_test.go | 3 - lib/instances/create.go | 45 ++----- lib/instances/errors.go | 3 + lib/instances/manager.go | 41 +++++-- lib/instances/manager_test.go | 4 - lib/instances/qemu_test.go | 2 - lib/instances/resource_limits_test.go | 170 +------------------------- lib/instances/standby.go | 6 + lib/instances/start.go | 31 +++++ lib/instances/stop.go | 13 +- lib/oapi/oapi.go | 68 +++++++---- lib/providers/providers.go | 14 +-- lib/resources/resource.go | 62 ++++++++++ openapi.yaml | 6 + 18 files changed, 230 insertions(+), 282 deletions(-) diff --git a/cmd/api/api/instances.go b/cmd/api/api/instances.go index e95536d4..62621dec 100644 --- a/cmd/api/api/instances.go +++ b/cmd/api/api/instances.go @@ -252,6 +252,11 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst Code: "name_conflict", Message: err.Error(), }, nil + case errors.Is(err, instances.ErrInsufficientResources): + return oapi.CreateInstance409JSONResponse{ + Code: "insufficient_resources", + Message: err.Error(), + }, nil default: log.ErrorContext(ctx, "failed to create instance", "error", err, "image", request.Body.Image) return oapi.CreateInstance500JSONResponse{ @@ -309,15 +314,15 @@ func (s *ApiService) GetInstanceStats(ctx context.Context, request oapi.GetInsta // vmStatsToOAPI converts vm_metrics.VMStats to oapi.InstanceStats func vmStatsToOAPI(s *vm_metrics.VMStats) oapi.InstanceStats { stats := oapi.InstanceStats{ - InstanceId: s.InstanceID, - InstanceName: s.InstanceName, - CpuSeconds: s.CPUSeconds(), - MemoryRssBytes: int64(s.MemoryRSSBytes), - MemoryVmsBytes: int64(s.MemoryVMSBytes), - NetworkRxBytes: int64(s.NetRxBytes), - NetworkTxBytes: int64(s.NetTxBytes), - AllocatedVcpus: s.AllocatedVcpus, - AllocatedMemoryBytes: s.AllocatedMemoryBytes, + InstanceId: s.InstanceID, + InstanceName: s.InstanceName, + CpuSeconds: s.CPUSeconds(), + MemoryRssBytes: int64(s.MemoryRSSBytes), + MemoryVmsBytes: int64(s.MemoryVMSBytes), + NetworkRxBytes: int64(s.NetRxBytes), + NetworkTxBytes: int64(s.NetTxBytes), + AllocatedVcpus: s.AllocatedVcpus, + AllocatedMemoryBytes: s.AllocatedMemoryBytes, MemoryUtilizationRatio: s.MemoryUtilizationRatio(), } return stats @@ -464,6 +469,11 @@ func (s *ApiService) StartInstance(ctx context.Context, request oapi.StartInstan Code: "invalid_state", Message: err.Error(), }, nil + case errors.Is(err, instances.ErrInsufficientResources): + return oapi.StartInstance409JSONResponse{ + Code: "insufficient_resources", + Message: err.Error(), + }, nil default: log.ErrorContext(ctx, "failed to start instance", "error", err) return oapi.StartInstance500JSONResponse{ diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 0fd01f16..79dcd0e5 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -69,8 +69,7 @@ type Config struct { MaxMemoryPerInstance string // Max memory for a single VM (0 = unlimited) // Resource limits - aggregate - MaxTotalVcpus int // Aggregate vCPU limit across all instances (0 = unlimited) - MaxTotalMemory string // Aggregate memory limit across all instances (0 = unlimited) + // Note: CPU/memory aggregate limits are now handled via oversubscription ratios (OVERSUB_CPU, OVERSUB_MEMORY) 
MaxTotalVolumeStorage string // Total volume storage limit (0 = unlimited) // OpenTelemetry configuration @@ -166,9 +165,8 @@ func Load() *Config { MaxVcpusPerInstance: getEnvInt("MAX_VCPUS_PER_INSTANCE", 16), MaxMemoryPerInstance: getEnv("MAX_MEMORY_PER_INSTANCE", "32GB"), - // Resource limits - aggregate (0 or empty = unlimited) - MaxTotalVcpus: getEnvInt("MAX_TOTAL_VCPUS", 0), - MaxTotalMemory: getEnv("MAX_TOTAL_MEMORY", ""), + // Resource limits - aggregate + // Note: CPU/memory aggregate limits are now handled via oversubscription ratios MaxTotalVolumeStorage: getEnv("MAX_TOTAL_VOLUME_STORAGE", ""), // OpenTelemetry configuration diff --git a/cmd/api/main.go b/cmd/api/main.go index fef9f322..7f5e4265 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -225,6 +225,12 @@ func run() error { logger.Warn("failed to reconcile mdev devices", "error", err) } + // Wire up resource validator for aggregate limit checking + // This enables the instance manager to validate CPU, memory, network, and GPU + // availability before creating or starting instances. + app.InstanceManager.SetResourceValidator(app.ResourceManager) + logger.Info("Resource validator configured") + // Initialize ingress manager (starts Caddy daemon and DNS server for dynamic upstreams) logger.Info("Initializing ingress manager...") if err := app.IngressManager.Initialize(app.Ctx); err != nil { diff --git a/integration/systemd_test.go b/integration/systemd_test.go index bba12df0..79973d7c 100644 --- a/integration/systemd_test.go +++ b/integration/systemd_test.go @@ -67,8 +67,6 @@ func TestSystemdMode(t *testing.T) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, MaxVcpusPerInstance: 0, MaxMemoryPerInstance: 0, - MaxTotalVcpus: 0, - MaxTotalMemory: 0, } instanceManager := instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil) diff --git a/integration/vgpu_test.go b/integration/vgpu_test.go index ea0d668f..8c02eabc 100644 --- a/integration/vgpu_test.go +++ b/integration/vgpu_test.go @@ -76,8 +76,6 @@ func TestVGPU(t *testing.T) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, MaxVcpusPerInstance: 0, MaxMemoryPerInstance: 0, - MaxTotalVcpus: 0, - MaxTotalMemory: 0, } instanceManager := instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil) @@ -272,4 +270,3 @@ func checkVGPUTestPrerequisites() (string, string) { return "vGPU test requires at least one available VF (all VFs are in use)", "" } - diff --git a/lib/instances/create.go b/lib/instances/create.go index 904e589a..35088d49 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -8,7 +8,6 @@ import ( "strings" "time" - "github.com/nrednav/cuid2" "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" @@ -16,6 +15,7 @@ import ( "github.com/kernel/hypeman/lib/network" "github.com/kernel/hypeman/lib/system" "github.com/kernel/hypeman/lib/volumes" + "github.com/nrednav/cuid2" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" "gvisor.dev/gvisor/pkg/cleanup" @@ -48,31 +48,6 @@ var systemDirectories = []string{ "/var", } -// AggregateUsage represents total resource usage across all instances -type AggregateUsage struct { - TotalVcpus int - TotalMemory int64 // in bytes -} - -// calculateAggregateUsage calculates total resource usage across all running instances -func (m *manager) calculateAggregateUsage(ctx context.Context) (AggregateUsage, error) { - instances, 
err := m.listInstances(ctx) - if err != nil { - return AggregateUsage{}, err - } - - var usage AggregateUsage - for _, inst := range instances { - // Only count running/paused instances (those consuming resources) - if inst.State == StateRunning || inst.State == StatePaused || inst.State == StateCreated { - usage.TotalVcpus += inst.Vcpus - usage.TotalMemory += inst.Size + inst.HotplugSize - } - } - - return usage, nil -} - // generateVsockCID converts first 8 chars of instance ID to a unique CID // CIDs 0-2 are reserved (hypervisor, loopback, host) // Returns value in range 3 to 4294967295 @@ -174,18 +149,12 @@ func (m *manager) createInstance( return nil, fmt.Errorf("total memory %d (size + hotplug_size) exceeds maximum allowed %d per instance", totalMemory, m.limits.MaxMemoryPerInstance) } - // Validate aggregate resource limits - if m.limits.MaxTotalVcpus > 0 || m.limits.MaxTotalMemory > 0 { - usage, err := m.calculateAggregateUsage(ctx) - if err != nil { - log.WarnContext(ctx, "failed to calculate aggregate usage, skipping limit check", "error", err) - } else { - if m.limits.MaxTotalVcpus > 0 && usage.TotalVcpus+vcpus > m.limits.MaxTotalVcpus { - return nil, fmt.Errorf("total vcpus would be %d, exceeds aggregate limit of %d", usage.TotalVcpus+vcpus, m.limits.MaxTotalVcpus) - } - if m.limits.MaxTotalMemory > 0 && usage.TotalMemory+totalMemory > m.limits.MaxTotalMemory { - return nil, fmt.Errorf("total memory would be %d, exceeds aggregate limit of %d", usage.TotalMemory+totalMemory, m.limits.MaxTotalMemory) - } + // Validate aggregate resource limits via ResourceValidator + if m.resourceValidator != nil { + needsGPU := req.GPU != nil && req.GPU.Profile != "" + if err := m.resourceValidator.ValidateAllocation(ctx, vcpus, totalMemory, req.NetworkBandwidthDownload, req.NetworkBandwidthUpload, req.DiskIOBps, needsGPU); err != nil { + log.ErrorContext(ctx, "resource validation failed", "error", err) + return nil, fmt.Errorf("%w: %v", ErrInsufficientResources, err) } } diff --git a/lib/instances/errors.go b/lib/instances/errors.go index 1090104e..9925bb02 100644 --- a/lib/instances/errors.go +++ b/lib/instances/errors.go @@ -17,4 +17,7 @@ var ( // ErrAmbiguousName is returned when multiple instances have the same name ErrAmbiguousName = errors.New("multiple instances with the same name") + + // ErrInsufficientResources is returned when resources (CPU, memory, network, GPU) are not available + ErrInsufficientResources = errors.New("insufficient resources") ) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 09c9616e..23860d28 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -41,6 +41,9 @@ type Manager interface { // ListRunningInstancesInfo returns info needed for utilization metrics collection. // Used by the resource manager for VM utilization tracking. ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) + // SetResourceValidator sets the validator for aggregate resource limit checking. + // Called after initialization to avoid circular dependencies. 
+ SetResourceValidator(v ResourceValidator) } // ResourceLimits contains configurable resource limits for instances @@ -48,21 +51,28 @@ type ResourceLimits struct { MaxOverlaySize int64 // Maximum overlay disk size in bytes per instance MaxVcpusPerInstance int // Maximum vCPUs per instance (0 = unlimited) MaxMemoryPerInstance int64 // Maximum memory in bytes per instance (0 = unlimited) - MaxTotalVcpus int // Maximum total vCPUs across all instances (0 = unlimited) - MaxTotalMemory int64 // Maximum total memory in bytes across all instances (0 = unlimited) +} + +// ResourceValidator validates if resources can be allocated +type ResourceValidator interface { + // ValidateAllocation checks if the requested resources are available. + // Returns nil if allocation is allowed, or a detailed error describing + // which resource is insufficient and the current capacity/usage. + ValidateAllocation(ctx context.Context, vcpus int, memoryBytes int64, networkDownloadBps int64, networkUploadBps int64, diskIOBps int64, needsGPU bool) error } type manager struct { - paths *paths.Paths - imageManager images.Manager - systemManager system.Manager - networkManager network.Manager - deviceManager devices.Manager - volumeManager volumes.Manager - limits ResourceLimits - instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks - hostTopology *HostTopology // Cached host CPU topology - metrics *Metrics + paths *paths.Paths + imageManager images.Manager + systemManager system.Manager + networkManager network.Manager + deviceManager devices.Manager + volumeManager volumes.Manager + limits ResourceLimits + resourceValidator ResourceValidator // Optional validator for aggregate resource limits + instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks + hostTopology *HostTopology // Cached host CPU topology + metrics *Metrics // Hypervisor support vmStarters map[hypervisor.Type]hypervisor.VMStarter @@ -106,6 +116,12 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste return m } +// SetResourceValidator sets the resource validator for aggregate limit checking. +// This is called after initialization to avoid circular dependencies. +func (m *manager) SetResourceValidator(v ResourceValidator) { + m.resourceValidator = v +} + // getHypervisor creates a hypervisor client for the given socket and type. // Used for connecting to already-running VMs (e.g., for state queries). 
func (m *manager) getHypervisor(socketPath string, hvType hypervisor.Type) (hypervisor.Hypervisor, error) { @@ -366,4 +382,3 @@ func (m *manager) ListRunningInstancesInfo(ctx context.Context) ([]resources.Ins return infos, nil } - diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 839f2b90..e06e659d 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -54,8 +54,6 @@ func setupTestManager(t *testing.T) (*manager, string) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited MaxMemoryPerInstance: 0, // unlimited - MaxTotalVcpus: 0, // unlimited - MaxTotalMemory: 0, // unlimited } mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) @@ -938,8 +936,6 @@ func TestStorageOperations(t *testing.T) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited MaxMemoryPerInstance: 0, // unlimited - MaxTotalVcpus: 0, // unlimited - MaxTotalMemory: 0, // unlimited } manager := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index 6ff8118e..e276676c 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -52,8 +52,6 @@ func setupTestManagerForQEMU(t *testing.T) (*manager, string) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited MaxMemoryPerInstance: 0, // unlimited - MaxTotalVcpus: 0, // unlimited - MaxTotalMemory: 0, // unlimited } mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, hypervisor.TypeQEMU, nil, nil).(*manager) diff --git a/lib/instances/resource_limits_test.go b/lib/instances/resource_limits_test.go index 767d9e7c..fe100aeb 100644 --- a/lib/instances/resource_limits_test.go +++ b/lib/instances/resource_limits_test.go @@ -2,10 +2,8 @@ package instances import ( "context" - "os" "syscall" "testing" - "time" "github.com/kernel/hypeman/cmd/api/config" "github.com/kernel/hypeman/lib/devices" @@ -171,15 +169,11 @@ func TestResourceLimits_StructValues(t *testing.T) { MaxOverlaySize: 10 * 1024 * 1024 * 1024, // 10GB MaxVcpusPerInstance: 4, MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB - MaxTotalVcpus: 16, - MaxTotalMemory: 32 * 1024 * 1024 * 1024, // 32GB } assert.Equal(t, int64(10*1024*1024*1024), limits.MaxOverlaySize) assert.Equal(t, 4, limits.MaxVcpusPerInstance) assert.Equal(t, int64(8*1024*1024*1024), limits.MaxMemoryPerInstance) - assert.Equal(t, 16, limits.MaxTotalVcpus) - assert.Equal(t, int64(32*1024*1024*1024), limits.MaxTotalMemory) } func TestResourceLimits_ZeroMeansUnlimited(t *testing.T) { @@ -188,8 +182,6 @@ func TestResourceLimits_ZeroMeansUnlimited(t *testing.T) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, MaxVcpusPerInstance: 0, // unlimited MaxMemoryPerInstance: 0, // unlimited - MaxTotalVcpus: 0, // unlimited - MaxTotalMemory: 0, // unlimited } mgr := createTestManager(t, limits) @@ -200,166 +192,8 @@ func TestResourceLimits_ZeroMeansUnlimited(t *testing.T) { assert.Equal(t, int64(0), mgr.limits.MaxMemoryPerInstance) } -func TestAggregateUsage_NoInstances(t *testing.T) { - limits := ResourceLimits{ - MaxOverlaySize: 100 * 1024 * 1024 * 1024, - } - mgr := createTestManager(t, limits) - - usage, err := mgr.calculateAggregateUsage(context.Background()) - require.NoError(t, err) - - assert.Equal(t, 0, usage.TotalVcpus) - 
assert.Equal(t, int64(0), usage.TotalMemory) -} - -func TestAggregateUsage_StructValues(t *testing.T) { - usage := AggregateUsage{ - TotalVcpus: 8, - TotalMemory: 16 * 1024 * 1024 * 1024, - } - - assert.Equal(t, 8, usage.TotalVcpus) - assert.Equal(t, int64(16*1024*1024*1024), usage.TotalMemory) -} - -// TestAggregateLimits_EnforcedAtRuntime is an integration test that verifies -// aggregate resource limits are enforced when creating VMs. -// It creates one VM, then tries to create another that would exceed the total limit. -func TestAggregateLimits_EnforcedAtRuntime(t *testing.T) { - // Skip in short mode - this is an integration test - if testing.Short() { - t.Skip("skipping integration test in short mode") - } - - // Require KVM access - if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { - t.Skip("/dev/kvm not available - skipping VM test") - } - - ctx := context.Background() - tmpDir := t.TempDir() - - cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", - SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", - } - - p := paths.New(tmpDir) - imageManager, err := images.NewManager(p, 1, nil) - require.NoError(t, err) - - systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg, nil) - deviceManager := devices.NewManager(p) - volumeManager := volumes.NewManager(p, 0, nil) - - // Set small aggregate limits: - // - MaxTotalVcpus: 2 (first VM gets 1, second wants 2 -> denied) - // - MaxTotalMemory: 6GB (first VM gets 2.5GB, second wants 4GB -> denied) - limits := ResourceLimits{ - MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB - MaxVcpusPerInstance: 4, // per-instance limit (high) - MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB per-instance (high) - MaxTotalVcpus: 2, // aggregate: only 2 total - MaxTotalMemory: 6 * 1024 * 1024 * 1024, // aggregate: only 6GB total (allows first 2.5GB VM) - } - - mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) - - // Cleanup any orphaned processes on test end - t.Cleanup(func() { - cleanupTestProcesses(t, mgr) - }) - - // Pull a small image - t.Log("Pulling alpine image...") - alpineImage, err := imageManager.CreateImage(ctx, images.CreateImageRequest{ - Name: "docker.io/library/alpine:latest", - }) - require.NoError(t, err) - - // Wait for image to be ready - t.Log("Waiting for image build...") - for i := 0; i < 120; i++ { - img, err := imageManager.GetImage(ctx, alpineImage.Name) - if err == nil && img.Status == images.StatusReady { - break - } - if err == nil && img.Status == images.StatusFailed { - t.Fatalf("Image build failed: %s", *img.Error) - } - time.Sleep(1 * time.Second) - } - t.Log("Image ready") - - // Ensure system files - t.Log("Ensuring system files...") - err = systemManager.EnsureSystemFiles(ctx) - require.NoError(t, err) - - // Check initial aggregate usage (should be 0) - usage, err := mgr.calculateAggregateUsage(ctx) - require.NoError(t, err) - assert.Equal(t, 0, usage.TotalVcpus, "Initial vCPUs should be 0") - assert.Equal(t, int64(0), usage.TotalMemory, "Initial memory should be 0") - - // Create first VM: 1 vCPU, 2GB + 512MB = 2.5GB memory - t.Log("Creating first instance (1 vCPU, 2.5GB memory)...") - inst1, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ - Name: "small-vm-1", - Image: "docker.io/library/alpine:latest", - Vcpus: 1, - Size: 2 * 1024 * 1024 * 1024, // 2GB (needs extra room for initrd with NVIDIA libs) - HotplugSize: 512 * 1024 * 1024, // 512MB - OverlaySize: 1 * 1024 * 1024 * 1024, - 
NetworkEnabled: false, - }) - require.NoError(t, err) - require.NotNil(t, inst1) - t.Logf("First instance created: %s", inst1.Id) - - // Verify aggregate usage increased - usage, err = mgr.calculateAggregateUsage(ctx) - require.NoError(t, err) - assert.Equal(t, 1, usage.TotalVcpus, "Should have 1 vCPU in use") - assert.Equal(t, int64(2*1024*1024*1024+512*1024*1024), usage.TotalMemory, "Should have 2.5GB memory in use") - t.Logf("Aggregate usage after first VM: %d vCPUs, %d bytes memory", usage.TotalVcpus, usage.TotalMemory) - - // Try to create second VM: 2 vCPUs (would exceed MaxTotalVcpus=2) - // Note: 2 vCPUs alone is fine (under MaxVcpusPerInstance=4), but 1+2=3 exceeds MaxTotalVcpus=2 - t.Log("Attempting to create second instance (2 vCPUs) - should be denied...") - _, err = mgr.CreateInstance(ctx, CreateInstanceRequest{ - Name: "big-vm-2", - Image: "docker.io/library/alpine:latest", - Vcpus: 2, // This would make total 3, exceeding limit of 2 - Size: 256 * 1024 * 1024, - HotplugSize: 256 * 1024 * 1024, - OverlaySize: 1 * 1024 * 1024 * 1024, - NetworkEnabled: false, - }) - require.Error(t, err, "Should deny creation due to aggregate vCPU limit") - assert.Contains(t, err.Error(), "exceeds aggregate limit") - t.Logf("Second instance correctly denied: %v", err) - - // Verify aggregate usage didn't change (failed creation shouldn't affect it) - usage, err = mgr.calculateAggregateUsage(ctx) - require.NoError(t, err) - assert.Equal(t, 1, usage.TotalVcpus, "vCPUs should still be 1") - - // Clean up first instance - t.Log("Deleting first instance...") - err = mgr.DeleteInstance(ctx, inst1.Id) - require.NoError(t, err) - - // Verify aggregate usage is back to 0 - usage, err = mgr.calculateAggregateUsage(ctx) - require.NoError(t, err) - assert.Equal(t, 0, usage.TotalVcpus, "vCPUs should be 0 after deletion") - t.Log("Test passed: aggregate limits enforced correctly") -} +// Note: Aggregate resource limits are now handled by ResourceValidator in lib/resources. +// Tests for aggregate limits should be in lib/resources/resource_test.go. // cleanupTestProcesses kills any Cloud Hypervisor processes started during test func cleanupTestProcesses(t *testing.T, mgr *manager) { diff --git a/lib/instances/standby.go b/lib/instances/standby.go index ba19df99..f3b1cfa2 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -47,6 +47,12 @@ func (m *manager) standbyInstance( return nil, fmt.Errorf("%w: cannot standby from state %s", ErrInvalidState, inst.State) } + // 2b. Block standby for vGPU instances (driver limitation - NVIDIA vGPU doesn't support snapshots) + if inst.GPUMdevUUID != "" || inst.GPUProfile != "" { + log.ErrorContext(ctx, "standby not supported for vGPU instances", "instance_id", id, "gpu_profile", inst.GPUProfile) + return nil, fmt.Errorf("%w: standby is not supported for instances with vGPU attached (driver limitation)", ErrInvalidState) + } + // 3. 
Get network allocation BEFORE killing VMM (while we can still query it) // This is needed to delete the TAP device after VMM shuts down var networkAlloc *network.Allocation diff --git a/lib/instances/start.go b/lib/instances/start.go index 9f495f57..aaa7cb8c 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -5,6 +5,7 @@ import ( "fmt" "time" + "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/network" "go.opentelemetry.io/otel/trace" @@ -45,6 +46,16 @@ func (m *manager) startInstance( return nil, fmt.Errorf("%w: cannot start from state %s, must be Stopped", ErrInvalidState, inst.State) } + // 2b. Validate aggregate resource limits before allocating resources + if m.resourceValidator != nil { + needsGPU := stored.GPUProfile != "" + totalMemory := stored.Size + stored.HotplugSize + if err := m.resourceValidator.ValidateAllocation(ctx, stored.Vcpus, totalMemory, stored.NetworkBandwidthDownload, stored.NetworkBandwidthUpload, stored.DiskIOBps, needsGPU); err != nil { + log.ErrorContext(ctx, "resource validation failed for start", "instance_id", id, "error", err) + return nil, fmt.Errorf("%w: %v", ErrInsufficientResources, err) + } + } + // 3. Get image info (needed for buildHypervisorConfig) log.DebugContext(ctx, "getting image info", "instance_id", id, "image", stored.Image) imageInfo, err := m.imageManager.GetImage(ctx, stored.Image) @@ -81,6 +92,26 @@ func (m *manager) startInstance( }) } + // 4b. Recreate vGPU mdev if this instance had a GPU profile + // Note: GPU availability was already validated in step 2b + if stored.GPUProfile != "" { + log.InfoContext(ctx, "creating vGPU mdev for start", "instance_id", id, "profile", stored.GPUProfile) + mdev, err := devices.CreateMdev(ctx, stored.GPUProfile, id) + if err != nil { + log.ErrorContext(ctx, "failed to create mdev", "instance_id", id, "profile", stored.GPUProfile, "error", err) + return nil, fmt.Errorf("create vGPU mdev for profile %s: %w", stored.GPUProfile, err) + } + stored.GPUMdevUUID = mdev.UUID + log.InfoContext(ctx, "created vGPU mdev", "instance_id", id, "profile", stored.GPUProfile, "uuid", mdev.UUID) + // Add mdev cleanup to stack + cu.Add(func() { + log.DebugContext(ctx, "destroying mdev on cleanup", "instance_id", id, "uuid", mdev.UUID) + if err := devices.DestroyMdev(ctx, mdev.UUID); err != nil { + log.WarnContext(ctx, "failed to destroy mdev on cleanup", "instance_id", id, "uuid", mdev.UUID, "error", err) + } + }) + } + // 5. Regenerate config disk with new network configuration instForConfig := &Instance{StoredMetadata: *stored} log.DebugContext(ctx, "regenerating config disk", "instance_id", id) diff --git a/lib/instances/stop.go b/lib/instances/stop.go index 20888e8a..973f7641 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -5,6 +5,7 @@ import ( "fmt" "time" + "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/network" "go.opentelemetry.io/otel/trace" @@ -71,10 +72,20 @@ func (m *manager) stopInstance( } } - // 6. Update metadata (clear PID, set StoppedAt) + // 6. 
Destroy vGPU mdev device if present (frees vGPU slot for other VMs) + if inst.GPUMdevUUID != "" { + log.InfoContext(ctx, "destroying vGPU mdev on stop", "instance_id", id, "uuid", inst.GPUMdevUUID) + if err := devices.DestroyMdev(ctx, inst.GPUMdevUUID); err != nil { + // Log error but continue - mdev cleanup is best-effort + log.WarnContext(ctx, "failed to destroy mdev on stop", "instance_id", id, "uuid", inst.GPUMdevUUID, "error", err) + } + } + + // 7. Update metadata (clear PID, mdev UUID, set StoppedAt) now := time.Now() stored.StoppedAt = &now stored.HypervisorPID = nil + stored.GPUMdevUUID = "" // Clear mdev UUID since we destroyed it meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { diff --git a/lib/oapi/oapi.go b/lib/oapi/oapi.go index be37cc74..8ca161fa 100644 --- a/lib/oapi/oapi.go +++ b/lib/oapi/oapi.go @@ -3645,6 +3645,7 @@ type CreateInstanceResponse struct { JSON201 *Instance JSON400 *Error JSON401 *Error + JSON409 *Error JSON500 *Error } @@ -5304,6 +5305,13 @@ func ParseCreateInstanceResponse(rsp *http.Response) (*CreateInstanceResponse, e } response.JSON401 = &dest + case strings.Contains(rsp.Header.Get("Content-Type"), "json") && rsp.StatusCode == 409: + var dest Error + if err := json.Unmarshal(bodyBytes, &dest); err != nil { + return nil, err + } + response.JSON409 = &dest + case strings.Contains(rsp.Header.Get("Content-Type"), "json") && rsp.StatusCode == 500: var dest Error if err := json.Unmarshal(bodyBytes, &dest); err != nil { @@ -8452,6 +8460,15 @@ func (response CreateInstance401JSONResponse) VisitCreateInstanceResponse(w http return json.NewEncoder(w).Encode(response) } +type CreateInstance409JSONResponse Error + +func (response CreateInstance409JSONResponse) VisitCreateInstanceResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(409) + + return json.NewEncoder(w).Encode(response) +} + type CreateInstance500JSONResponse Error func (response CreateInstance500JSONResponse) VisitCreateInstanceResponse(w http.ResponseWriter) error { @@ -10384,31 +10401,32 @@ var swaggerSpec = []string{ "rPm+tdcHjHxaeaVVoAFrMlmS12l7eat7UfhsOvpNVL58gj+0vjZaXxlcKxW/vDLAV1T9bML1b3NOkCOb", "D9rwysWf/cFUvvt1PVmMLNXQq/jibY4QLook57YC18MLkKM5xpX5b0sfakGQK7UDh7qnJ12bv95knc8D", "xO/Jo+rmce9aoh33/t2pR8mYTjOeyXJAO5QrILKo7VphwA9Nfy3Ec6MG+x1j6eA+Rce9K6g/8P4rqc71", - "DTXM25aDXaM8u1b3ozwXRzXttWc3wx/acyvtuQSu1dpznhL1a6rPZpBvpj87fPMB3F5h/iNq0A/t2gaz", - "Pu7SYW+Fx7VWUIs0wKtlv8WNb3HQnw9+/3qpSwf2MMNPuQk4j5wmWMiaZlXwe8OHwf3yvvtXAR8yir0o", - "V17zK1vm7kXMp+tvXuQ9uWsGnqsX18yVaXtvrkO+RzmiIsWRJDEJFbqd0XAG1zD0b9C/uaWB0/R9fu9y", - "a4heQHhn+SYoDN6RRFAcQ25vHpu09O/nSfJ+uJyS4ersDD4yNzBM8oX3Q+TSMOQ0JnWr8rUKvYoYS4Ve", - "2csiHb3hgsexySP8XsOztL4te+GiuKJ6zXyXLxi5tR3SCXpfuofxvuEihkPCl3qXvhHld5vTuJu1KI4E", - "AM5cCScsariEoaHmv4KxM/BmFmp5HcRM4yvfBlmazEs+za/vV1AZp2lb9LXTBCyeJ8kKHEadUup6qSKe", - "qb9IFRFhKqta7G5CbtTBoflD4RtTB7RSCM0US/CByl5t9oIqMNWOXY0F89c8SQJTlS3BvpoJn3+tpt7h", - "sj2md6Z0d+aHzNjkVkyV2ZeuxdQkhy3WAck8vMbbG9PgD6+5uKom3xgN7/8oojQLCsVOWDRewN4W5WIe", - "1p0A2MhiZSDv7Lq8NOLeNdKIrTLzh6eRAj/+4FQScgGlqaUrFfdwgrdKFkeJ3DtQm6qo+dR1Vu/V2dlW", - "E9GY6saNJCN+mMM2jvIPL1OgXNfDoxZTqRLnC1jlLNQEoRptdGezVkr5jXmme1/KTgolLORCKpIYg32S", - "xXCxDaLWbX4AXC7R0UVUSUhW3QWXVak8wzUbk4mWhykRemz9OWQ/K2wPn1l7oXBOvueGBr8PuxYSloIp", - "h1UT1Gp1MNLU5Sr12U55etVPntJzMFSrJUIk6sT0xtS9Q3OJYv2wtdLSNfVDvnT2g0+nrLxCju9Wq8HZ", - "HJn/CBzutMbWXAXIB8fWXpAysTj+AxvtZ2tyLV8TG5ZQdLArlVLsX7MzooRugwVBIY9jyNpv9PftVPBw", - 
"G8q7hSmNTJ03mBwwvObXCYx4fH4J7UyG9e41038sFxirT9TVKTvdfr3G92dKS/4P1nPMAleRhX/Df7h1", - "Nj8KaKQh2UCiPF2lifP0hyJuK8b+MFsfpNkKZ7H5ajpTgUNQiqWtCew3UW0hre0P5uF03Ym+wuHsyhVj", - "+D60XZu7fd0wboEPgijtmiJibt3fP03yPL3+A71ZpQHnlgBKTDk2wS8FTNmOPxp2f/kwtDIcNwpCu1fa", - "chktvhvaum/JZ+fgblSU4fFQyNxgmlsJ5Bcve59EuQzYStvMlXeCmnS5aumqk3XLRfJMAs3ch1SUZcnr", - "cfWvWV6AzCXw1NZV15lWKKLyxvRgrac+8teJM3aeLRZ3zRRHIY5Dk9Y9L5hmihzKBuvrTamI4Fejt2IQ", - "z0bnleJkXhHsIZkcfpyA3SuXHAOMs+rUyvDvK9vmPoK/rTDbIPTbreBHlGyLwO8SsNoUODHN++giS1Mu", - "lETqlkPFYAnhNpAHdcyjxRDl3zFkisxZFmerg+UlLaBCk/72rFL1pNSB+zIVpOcqQkTmVp2FsVGPluup", - "NJRMyfWjrxfBXlcduptWYSnNpbof1TWivMSJLdegYWvh5bpoVZTBV1IqL/MSZlLxxPV7eoI6OFO8NyVM", - "A7eoqJIKPqdRvTDnd1JN7wzf0SRL8lLUL55CYV9horGgxDvEAjqcInchIZGE4KytDSvvLRfds3vxaWUp", - "vhwTc9y0Uaf8htcaivyheou1jumQXHGOYiymZOsPc3nY0lpxd/j0pHZz+AFeyJg77Cv0jJZXMNqZtC0t", - "za9x/SJ3d9zv5Yur78cKK6VYfIA3gOe5mtl06+P7QsHB/YmE+77tcfWAvXba2prXwGY60D36EOYlD3GM", - "IjInMU+htqxpG3SDTMS2UuZwe1ubabE25IaHg8NB8PHdx/8fAAD//6qDr0pR3QAA", + "DTXM25aDXaM8u1b3ozwXRzXttWc3wx/acyvtuQSu1dpznhL1a6rPZpBvpj87fPMB3F5h/qFB34cGLbPJ", + "hIaUMFWkElqKarHZwB7gvRJmnfCl0+gKE26tQRd5ilcrJxZ5v0UkQj74/SvOLl/Zw4yP5SYiPnKqaiEM", + "m3XV7w0fBvfLnO9fR33IKPaiXBrOrw2ayyExn66/GpL35O5BeO6GXDNXR+69YervUY6oSHEkSUxChW5n", + "NJzBPRH9G/RvrpHgNH2fXwzdGqIXEH9avqoKg3ckERTHkHycxyZv/vt5krwfLueMuDo7g4/MFRGTHeL9", + "ELk8ETmNSd2qfO9DryLGUqFX9jZLR2+44HFsEh2/1/AsrW/L3ggp7tBeM9/tEEZubYd0gt6XLoq8b7gp", + "4pDwpd6lb0T53eY882YtiiMBgDN31gmLGm6JaKj574jsDLypj1reVzHT+MrXVZYm85JP8/wCFVTGadoW", + "fe00AYvnSbICh1GnlFtfqohn6i9SRUSY0q8Wu5uQG3VwaP5Q+MYUKq1UajPVHHygsnevvaAKTDlmVwTC", + "/DVPksCUjUuwr6jD59/7qXe4bDDqnSld7vkhMza5tlNl9qV7OzXJYauJQLYRr3X5xjT4w2suruzKN0bD", + "b2DpFbOgUI2FReMF7G1Rz+ZhXVqAjSxWBvLOrstLI+5dI43YMjh/eBop8OMPTiUhF1A7W7padg8nuqxk", + "cZTIvQPFs4qiVF1n9V6dnW01EY0pv9xIMuKHOWwDPf/wMgXqiT08ajGlNHG+gFXOQk0QqtFGdzZrpdbg", + "mGe696X0qVBjQy6kIokx2CdZDDfvIKzeJjDA5RoiXUSVhGzaXXBZlepHXLMxmWh5mBKhx9afQ3q2wvbw", + "mbUXCufke25o8PuwayGjKphyWDVBrVaoI01dMlWf7ZTnf/3kKT0HQ7Vaw0SiTkxvTGE+NJco1g9bKy1d", + "U+DkS6dn+HTKykv4+K7dGpzNkfmPwOFOa2zNlah8cGztBSkTi+M/sNF+tibX8jWxYY1HB7tSrcf+NTsj", + "Sug2WBAU8jiGsgJGf99OBQ+3of5cmNLIFKKDyQHDa36dwIjH55fQzqSA714z/cdyBbT6RF0htdPt12t8", + "f6b25f9gPccscBVZ+Df8h1tn86OARhqSDSTK01WaOE9/KOK2pO0Ps/VBmq1wFpuvpjMVOASlWNqixX4T", + "1Vb62v5gHk7XnegrHM6uXLWI70Pbtcnl1w3jFvggiNKuKSImLcD90yTP8/8/0KtfGnBuCaDElGMT/FLA", + "1BX5o2H3l4+TK8Nxoyi5e6Utl3Lju6Gt+5Z8dg4uUK0Mj4dC5gbT3EogAXrZ+yTKdcpW2mau/hQUzctV", + "S1c+rVuu4mcyfOY+pKJuTF4wrH/N8gppLsOotq66zrRCEZU3pgdrPfWRv5CdsfNsNbtrpjgKcRyavPN5", + "RTdThVE2WF9vSlUOvxq9FYN4NjovZSfzkmUPyeTw4wTsXrkmGmCcVadWxqdf2Tb3EZ1uhdkGseluBT8i", + "01tEppeA1aYCi2neRxdZmnKhJFK3HEoaSwi3gUStYx4thij/jiFTBc+yOFu+LK+5ASWk9LdnlbIspQ7c", + "l6kgPVeyIjLX/iyMjXq0XPCloaZLrh99vRD7uurQ3bRMTGku1f2orhHlNVhsPQkNWwsv10WrqhG+mld5", + "HZowk4onrt/TE9TBmeK9KWEauEXJl1TwOY3qlUO/k3J/Z/iOJlmS18p+8RQqDwsTjQU16CEW0OEUuQsJ", + "iSQEZ21tWBpwuSqg3YtPq5vx5ZiY46aNOuU3vHdRJDjVW6x1TIfkinMUYzElW3+YuxmW1orLzacnD/9C", + "xtxhX6FntLyC0c6kbWlpfo3rF7m7434vX1x9P1ZYKQfkA7yiPM/VzKZbH98XCg7uTyTc922PqwfstdPW", + "1rwGNtOB7tGHMC95iGMUkTmJeQrFb03boBtkIralPIfb29pMi7UhNzwcHA6Cj+8+/v8AAAD//3a75jLy", + "3QAA", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 2c88ea00..3ea7cf8b 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -105,22 +105,12 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima maxMemoryPerInstance = 
int64(memSize) } - // Parse max total memory (empty or "0" means unlimited) - var maxTotalMemory int64 - if cfg.MaxTotalMemory != "" && cfg.MaxTotalMemory != "0" { - var memSize datasize.ByteSize - if err := memSize.UnmarshalText([]byte(cfg.MaxTotalMemory)); err != nil { - return nil, fmt.Errorf("failed to parse MAX_TOTAL_MEMORY '%s': %w", cfg.MaxTotalMemory, err) - } - maxTotalMemory = int64(memSize) - } - + // Note: Aggregate CPU/memory limits are now handled via oversubscription ratios + // in the ResourceManager, wired up via SetResourceValidator after initialization. limits := instances.ResourceLimits{ MaxOverlaySize: int64(maxOverlaySize), MaxVcpusPerInstance: cfg.MaxVcpusPerInstance, MaxMemoryPerInstance: maxMemoryPerInstance, - MaxTotalVcpus: cfg.MaxTotalVcpus, - MaxTotalMemory: maxTotalMemory, } meter := otel.GetMeterProvider().Meter("hypeman") diff --git a/lib/resources/resource.go b/lib/resources/resource.go index 39e7e555..09d51840 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -7,6 +7,7 @@ import ( "fmt" "sync" + "github.com/c2h5oh/datasize" "github.com/kernel/hypeman/cmd/api/config" "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/paths" @@ -360,6 +361,67 @@ func (m *Manager) CanAllocate(ctx context.Context, rt ResourceType, amount int64 return amount <= status.Available, nil } +// ValidateAllocation checks if the requested resources can be allocated. +// Returns nil if allocation is allowed, or a detailed error describing +// which resource is insufficient and the current capacity/usage. +// Parameters match instances.AllocationRequest to implement instances.ResourceValidator. +func (m *Manager) ValidateAllocation(ctx context.Context, vcpus int, memoryBytes int64, networkDownloadBps int64, networkUploadBps int64, diskIOBps int64, needsGPU bool) error { + // Check CPU + if vcpus > 0 { + status, err := m.GetStatus(ctx, ResourceCPU) + if err != nil { + return fmt.Errorf("check CPU capacity: %w", err) + } + if int64(vcpus) > status.Available { + return fmt.Errorf("insufficient CPU: requested %d vCPUs, but only %d available (currently allocated: %d, effective limit: %d with %.1fx oversubscription)", + vcpus, status.Available, status.Allocated, status.EffectiveLimit, status.OversubRatio) + } + } + + // Check Memory + if memoryBytes > 0 { + status, err := m.GetStatus(ctx, ResourceMemory) + if err != nil { + return fmt.Errorf("check memory capacity: %w", err) + } + if memoryBytes > status.Available { + return fmt.Errorf("insufficient memory: requested %s, but only %s available (currently allocated: %s, effective limit: %s with %.1fx oversubscription)", + datasize.ByteSize(memoryBytes).HR(), datasize.ByteSize(status.Available).HR(), datasize.ByteSize(status.Allocated).HR(), datasize.ByteSize(status.EffectiveLimit).HR(), status.OversubRatio) + } + } + + // Check Network (use max of download/upload since they share physical link) + netBandwidth := networkDownloadBps + if networkUploadBps > netBandwidth { + netBandwidth = networkUploadBps + } + if netBandwidth > 0 { + status, err := m.GetStatus(ctx, ResourceNetwork) + if err != nil { + return fmt.Errorf("check network capacity: %w", err) + } + if netBandwidth > status.Available { + return fmt.Errorf("insufficient network bandwidth: requested %s/s, but only %s/s available (currently allocated: %s/s, effective limit: %s/s with %.1fx oversubscription)", + datasize.ByteSize(netBandwidth).HR(), datasize.ByteSize(status.Available).HR(), datasize.ByteSize(status.Allocated).HR(), 
datasize.ByteSize(status.EffectiveLimit).HR(), status.OversubRatio) + } + } + + // Check GPU if needed + if needsGPU { + gpuStatus := GetGPUStatus() + if gpuStatus == nil { + return fmt.Errorf("insufficient GPU: no GPU available on this host") + } + availableSlots := gpuStatus.TotalSlots - gpuStatus.UsedSlots + if availableSlots <= 0 { + return fmt.Errorf("insufficient GPU: all %d %s slots are in use", + gpuStatus.TotalSlots, gpuStatus.Mode) + } + } + + return nil +} + // CPUCapacity returns the raw CPU capacity (number of vCPUs). func (m *Manager) CPUCapacity() int64 { m.mu.RLock() diff --git a/openapi.yaml b/openapi.yaml index bd85c463..219c553a 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1304,6 +1304,12 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + 409: + description: Conflict - insufficient resources or name already exists + content: + application/json: + schema: + $ref: "#/components/schemas/Error" 500: description: Internal server error content: From 2915dc1e4fbe920781411b3d25c6864e5f5ffc9b Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 30 Jan 2026 12:58:49 -0500 Subject: [PATCH 2/5] Use oversubscription manager in tests --- lib/builds/manager_test.go | 6 +++++- lib/instances/create.go | 2 +- lib/instances/manager_test.go | 22 ++++++++++++++++++---- lib/instances/qemu_test.go | 10 ++++++++++ lib/instances/resource_limits_test.go | 8 +++++++- lib/instances/start.go | 2 +- lib/resources/resource.go | 15 +++++++++++---- 7 files changed, 53 insertions(+), 12 deletions(-) diff --git a/lib/builds/manager_test.go b/lib/builds/manager_test.go index 7357ac8f..5a9e82cc 100644 --- a/lib/builds/manager_test.go +++ b/lib/builds/manager_test.go @@ -126,6 +126,10 @@ func (m *mockInstanceManager) ListRunningInstancesInfo(ctx context.Context) ([]r return nil, nil } +func (m *mockInstanceManager) SetResourceValidator(v instances.ResourceValidator) { + // no-op for mock +} + // mockVolumeManager implements volumes.Manager for testing type mockVolumeManager struct { volumes map[string]*volumes.Volume @@ -611,7 +615,7 @@ func TestBuildQueue_ConcurrencyLimit(t *testing.T) { queue := NewBuildQueue(2) // Max 2 concurrent started := make(chan string, 5) - + // Enqueue 5 builds with blocking start functions for i := 0; i < 5; i++ { id := string(rune('A' + i)) diff --git a/lib/instances/create.go b/lib/instances/create.go index 35088d49..a28d0e26 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -149,7 +149,7 @@ func (m *manager) createInstance( return nil, fmt.Errorf("total memory %d (size + hotplug_size) exceeds maximum allowed %d per instance", totalMemory, m.limits.MaxMemoryPerInstance) } - // Validate aggregate resource limits via ResourceValidator + // Validate aggregate resource limits via ResourceValidator (if configured) if m.resourceValidator != nil { needsGPU := req.GPU != nil && req.GPU.Profile != "" if err := m.resourceValidator.ValidateAllocation(ctx, vcpus, totalMemory, req.NetworkBandwidthDownload, req.NetworkBandwidthUpload, req.DiskIOBps, needsGPU); err != nil { diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index e06e659d..4bfb9b4f 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -24,6 +24,7 @@ import ( "github.com/kernel/hypeman/lib/ingress" "github.com/kernel/hypeman/lib/network" "github.com/kernel/hypeman/lib/paths" + "github.com/kernel/hypeman/lib/resources" "github.com/kernel/hypeman/lib/system" "github.com/kernel/hypeman/lib/vmm" 
"github.com/kernel/hypeman/lib/volumes" @@ -57,6 +58,15 @@ func setupTestManager(t *testing.T) (*manager, string) { } mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) + // Set up resource validation using the real ResourceManager + resourceMgr := resources.NewManager(cfg, p) + resourceMgr.SetInstanceLister(mgr) + resourceMgr.SetImageLister(imageManager) + resourceMgr.SetVolumeLister(volumeManager) + err = resourceMgr.Initialize(context.Background()) + require.NoError(t, err) + mgr.SetResourceValidator(resourceMgr) + // Register cleanup to kill any orphaned Cloud Hypervisor processes t.Cleanup(func() { cleanupOrphanedProcesses(t, mgr) @@ -920,10 +930,14 @@ func TestStorageOperations(t *testing.T) { tmpDir := t.TempDir() cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", - SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", + DataDir: tmpDir, + BridgeName: "vmbr0", + SubnetCIDR: "10.100.0.0/16", + DNSServer: "1.1.1.1", + OversubCPU: 1.0, + OversubMemory: 1.0, + OversubDisk: 1.0, + OversubNetwork: 1.0, } p := paths.New(tmpDir) diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index e276676c..4f34384d 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -23,6 +23,7 @@ import ( "github.com/kernel/hypeman/lib/ingress" "github.com/kernel/hypeman/lib/network" "github.com/kernel/hypeman/lib/paths" + "github.com/kernel/hypeman/lib/resources" "github.com/kernel/hypeman/lib/system" "github.com/kernel/hypeman/lib/volumes" "github.com/stretchr/testify/assert" @@ -55,6 +56,15 @@ func setupTestManagerForQEMU(t *testing.T) (*manager, string) { } mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, hypervisor.TypeQEMU, nil, nil).(*manager) + // Set up resource validation using the real ResourceManager + resourceMgr := resources.NewManager(cfg, p) + resourceMgr.SetInstanceLister(mgr) + resourceMgr.SetImageLister(imageManager) + resourceMgr.SetVolumeLister(volumeManager) + err = resourceMgr.Initialize(context.Background()) + require.NoError(t, err) + mgr.SetResourceValidator(resourceMgr) + // Register cleanup to kill any orphaned QEMU processes t.Cleanup(func() { cleanupOrphanedQEMUProcesses(t, mgr) diff --git a/lib/instances/resource_limits_test.go b/lib/instances/resource_limits_test.go index fe100aeb..5b50f1af 100644 --- a/lib/instances/resource_limits_test.go +++ b/lib/instances/resource_limits_test.go @@ -150,7 +150,13 @@ func TestValidateVolumeAttachments_OverlayCountsAsTwoDevices(t *testing.T) { func createTestManager(t *testing.T, limits ResourceLimits) *manager { t.Helper() tmpDir := t.TempDir() - cfg := &config.Config{DataDir: tmpDir} + cfg := &config.Config{ + DataDir: tmpDir, + OversubCPU: 1.0, + OversubMemory: 1.0, + OversubDisk: 1.0, + OversubNetwork: 1.0, + } p := paths.New(cfg.DataDir) imageMgr, err := images.NewManager(p, 1, nil) diff --git a/lib/instances/start.go b/lib/instances/start.go index aaa7cb8c..9eaf60ed 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -46,7 +46,7 @@ func (m *manager) startInstance( return nil, fmt.Errorf("%w: cannot start from state %s, must be Stopped", ErrInvalidState, inst.State) } - // 2b. Validate aggregate resource limits before allocating resources + // 2b. 
Validate aggregate resource limits before allocating resources (if configured) if m.resourceValidator != nil { needsGPU := stored.GPUProfile != "" totalMemory := stored.Size + stored.HotplugSize diff --git a/lib/resources/resource.go b/lib/resources/resource.go index 09d51840..b107c17d 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -215,19 +215,26 @@ func (m *Manager) Initialize(ctx context.Context) error { } // GetOversubRatio returns the oversubscription ratio for a resource type. +// Returns 1.0 (no oversubscription) if the config value is not set or <= 0. func (m *Manager) GetOversubRatio(rt ResourceType) float64 { + var ratio float64 switch rt { case ResourceCPU: - return m.cfg.OversubCPU + ratio = m.cfg.OversubCPU case ResourceMemory: - return m.cfg.OversubMemory + ratio = m.cfg.OversubMemory case ResourceDisk: - return m.cfg.OversubDisk + ratio = m.cfg.OversubDisk case ResourceNetwork: - return m.cfg.OversubNetwork + ratio = m.cfg.OversubNetwork default: return 1.0 } + // Default to 1.0 (no oversubscription) if not configured + if ratio <= 0 { + return 1.0 + } + return ratio } // GetStatus returns the current status of a specific resource type. From ea8ca34f08d7870edbba88eeee04788da855cb6f Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 30 Jan 2026 14:28:57 -0500 Subject: [PATCH 3/5] Handle disk io --- lib/instances/manager.go | 1 + lib/resources/disk.go | 39 +++++++++++++++++++++++++++++++++++++++ lib/resources/resource.go | 22 ++++++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 23860d28..8411d193 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -340,6 +340,7 @@ func (m *manager) ListInstanceAllocations(ctx context.Context) ([]resources.Inst VolumeOverlayBytes: volumeOverlayBytes, NetworkDownloadBps: inst.NetworkBandwidthDownload, NetworkUploadBps: inst.NetworkBandwidthUpload, + DiskIOBps: inst.DiskIOBps, State: string(inst.State), VolumeBytes: volumeBytes, }) diff --git a/lib/resources/disk.go b/lib/resources/disk.go index 6ea0bdd8..2b6bf76d 100644 --- a/lib/resources/disk.go +++ b/lib/resources/disk.go @@ -126,3 +126,42 @@ func parseDiskIOLimit(limit string) (int64, error) { return int64(ds.Bytes()), nil } + +// DiskIOResource implements Resource for disk I/O bandwidth tracking. +type DiskIOResource struct { + capacity int64 // bytes per second + instanceLister InstanceLister +} + +// NewDiskIOResource creates a disk I/O resource with the given capacity. +func NewDiskIOResource(capacity int64, instLister InstanceLister) *DiskIOResource { + return &DiskIOResource{capacity: capacity, instanceLister: instLister} +} + +// Type returns the resource type. +func (d *DiskIOResource) Type() ResourceType { + return ResourceDiskIO +} + +// Capacity returns the total disk I/O capacity in bytes per second. +func (d *DiskIOResource) Capacity() int64 { + return d.capacity +} + +// Allocated returns total disk I/O allocated across all active instances. 
+func (d *DiskIOResource) Allocated(ctx context.Context) (int64, error) { + if d.instanceLister == nil { + return 0, nil + } + instances, err := d.instanceLister.ListInstanceAllocations(ctx) + if err != nil { + return 0, err + } + var total int64 + for _, inst := range instances { + if isActiveState(inst.State) { + total += inst.DiskIOBps + } + } + return total, nil +} diff --git a/lib/resources/resource.go b/lib/resources/resource.go index b107c17d..dcd90ab1 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -21,6 +21,7 @@ const ( ResourceMemory ResourceType = "memory" ResourceDisk ResourceType = "disk" ResourceNetwork ResourceType = "network" + ResourceDiskIO ResourceType = "disk_io" ) // SourceType identifies how a resource capacity was determined. @@ -100,6 +101,7 @@ type InstanceAllocation struct { VolumeOverlayBytes int64 // Sum of volume overlay sizes NetworkDownloadBps int64 // Download rate limit (external→VM) NetworkUploadBps int64 // Upload rate limit (VM→external) + DiskIOBps int64 // Disk I/O rate limit (bytes/sec) State string // Only count running/paused/created instances VolumeBytes int64 // Sum of attached volume base sizes (for per-instance reporting) } @@ -211,6 +213,10 @@ func (m *Manager) Initialize(ctx context.Context) error { } m.resources[ResourceNetwork] = net + // Discover disk I/O (reuses existing DiskIOCapacity method) + diskIO := NewDiskIOResource(m.DiskIOCapacity(), m.instanceLister) + m.resources[ResourceDiskIO] = diskIO + return nil } @@ -227,6 +233,8 @@ func (m *Manager) GetOversubRatio(rt ResourceType) float64 { ratio = m.cfg.OversubDisk case ResourceNetwork: ratio = m.cfg.OversubNetwork + case ResourceDiskIO: + ratio = m.cfg.OversubDiskIO default: return 1.0 } @@ -413,6 +421,20 @@ func (m *Manager) ValidateAllocation(ctx context.Context, vcpus int, memoryBytes } } + // Check Disk I/O + if diskIOBps > 0 { + status, err := m.GetStatus(ctx, ResourceDiskIO) + if err != nil { + return fmt.Errorf("check disk I/O capacity: %w", err) + } + if diskIOBps > status.Available { + return fmt.Errorf("insufficient disk I/O: requested %s/s, but only %s/s available (currently allocated: %s/s, effective limit: %s/s with %.1fx oversubscription)", + datasize.ByteSize(diskIOBps).HR(), datasize.ByteSize(status.Available).HR(), + datasize.ByteSize(status.Allocated).HR(), datasize.ByteSize(status.EffectiveLimit).HR(), + status.OversubRatio) + } + } + // Check GPU if needed if needsGPU { gpuStatus := GetGPUStatus() From 481cedb33e858d86d05032e0c50b79466c48e62c Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 30 Jan 2026 14:44:37 -0500 Subject: [PATCH 4/5] Add disk IO in resources returned by API --- cmd/api/api/resources.go | 3 + lib/oapi/oapi.go | 138 ++++++++++++++++++++------------------ lib/resources/resource.go | 9 +++ openapi.yaml | 7 ++ 4 files changed, 90 insertions(+), 67 deletions(-) diff --git a/cmd/api/api/resources.go b/cmd/api/api/resources.go index 6df93288..ebb6ad95 100644 --- a/cmd/api/api/resources.go +++ b/cmd/api/api/resources.go @@ -25,11 +25,13 @@ func (s *ApiService) GetResources(ctx context.Context, _ oapi.GetResourcesReques } // Convert to API response + diskIO := convertResourceStatus(status.DiskIO) resp := oapi.Resources{ Cpu: convertResourceStatus(status.CPU), Memory: convertResourceStatus(status.Memory), Disk: convertResourceStatus(status.Disk), Network: convertResourceStatus(status.Network), + DiskIo: &diskIO, Allocations: make([]oapi.ResourceAllocation, 0, len(status.Allocations)), } @@ -53,6 +55,7 @@ func (s 
*ApiService) GetResources(ctx context.Context, _ oapi.GetResourcesReques DiskBytes: &alloc.DiskBytes, NetworkDownloadBps: &alloc.NetworkDownloadBps, NetworkUploadBps: &alloc.NetworkUploadBps, + DiskIoBps: &alloc.DiskIOBps, }) } diff --git a/lib/oapi/oapi.go b/lib/oapi/oapi.go index 8ca161fa..64630c42 100644 --- a/lib/oapi/oapi.go +++ b/lib/oapi/oapi.go @@ -712,6 +712,9 @@ type ResourceAllocation struct { // DiskBytes Disk allocated in bytes (overlay + volumes) DiskBytes *int64 `json:"disk_bytes,omitempty"` + // DiskIoBps Disk I/O bandwidth limit in bytes/sec + DiskIoBps *int64 `json:"disk_io_bps,omitempty"` + // InstanceId Instance identifier InstanceId *string `json:"instance_id,omitempty"` @@ -758,6 +761,7 @@ type Resources struct { Cpu ResourceStatus `json:"cpu"` Disk ResourceStatus `json:"disk"` DiskBreakdown *DiskBreakdown `json:"disk_breakdown,omitempty"` + DiskIo *ResourceStatus `json:"disk_io,omitempty"` // Gpu GPU resource status. Null if no GPUs available. Gpu *GPUResourceStatus `json:"gpu"` @@ -10360,73 +10364,73 @@ var swaggerSpec = []string{ "FT6kbFoNHF4asI0ZZuaw+t4fjGsbtvEYSX+AxluRAayMQ1ciXIRqtPJOUznyWxXLHQsyzWIsUD0WecWU", "5SKJKbtp07tcJGMe0xDpD+rm/ITHMb8d6Vfyr7CWrVar0x+MilPJmnluJmfPpM2G1MYtlvBXvcqtWpQL", "SP5t8/025BVu44DzRus+18abCde9ZPSuhOjVSzT7u4OmoKaGTivhTMuh3pvydouyPop3UdhHebIJz5GY", - "Oa2pWbBVPbiyXt9q4VRrVQjXsiaAOs6n5y4pVeFauizUShB/lrq7IvvpZyi382SFUtsArbM2mtPho8Mn", - "T/b2Hz3Z3UhHcYcNDYePTQcObgbbkoS1vC41venRAP630aTMcYN/Sg1HDtUJVXK0fPKEPq4gn+ISQ4Pl", - "uyrzeLGTztSuKsHt1MwVGstRRe0pperqkMmEgPNmZODWKyZTC6ppNYcQpzikauGxwvAtxBmgvEktGL9F", - "77XJekBq+0Z4orS1PidCZuPiNlrHDY7+01hXNVw4bH3xUmbjJkvudX1UY8eZwJyo5iVoYaQbjPCdQt/m", - "wES3WFY86/o5VCTqllKx1Y9gTIv2mWYdrufJZovDZd+FEn9i2fL217azpPlXFNU6xFeJsWYS1FIZon7a", - "OLk9UtFzSyVcH8VQ4w9WDn7aV6Nx+Ur0yjvnlfvTrXPYLQ9rBNHm0y0dhm/yYf2SJ6CVnYOFXNF3t7Kz", - "PqQwRxVNGUQSV1mjdgeUmlzl9qIPKjVGHZKkauGC2Z2Rt7XZ0clR3qEXp75w+NHgyZcIgL5cGfH8PyQn", - "Tfm0yg2y9pxqaU8bwwz9WudJPRLEmFf2Tn41cqF201iqFSn5V5V/MXVYwHayIb7TrH4naYOSL03WckE5", - "Lte+q/myzghc6Zoqraw0k+a9MUeVn1kfh0pXGOcTQWYtmfUxs+a4R9uSvXrSBnPlU1AwjSyADGA1CHJr", - "d9mkXh1BcYbv8hHA8MQS1dLcmXWUUsa+eArXuN+4y/t04rqAadQTFj79vMJBDquWN2NVJSF3GO4lPMt/", - "VnC0JtqqIWcxRnd1sSLNukiYCaoWF1og2DgvggURR5lBQ5AUsAj4uRgc4sY/fgRrc+JROl8QRgQN0dH5", - "KWBJghme6i27OkMxnZBwEcbEhv0uHZPCrfXXx6c9c18hzysHdQAUAMQldDo6P4VcMjYDfzDo7/YhbS5P", - "CcMpDYbBXn8HsuVoMMASt+E6GDxan46mQ5Bkp5GVuE9NEw1amXImDXB2B4NaRQdc5OvY/kUaZ4URr611", - "O1MyZzl0YSma1WkCdvofu8H+YGej+axNseEb9pLhTM24oL8RmOajDYHwSYOeMmMcu6S+xDYscDYY/lzF", - "1p/ffXzXDWSWJFiriAZcBaxSLptUGCIRRozc2nuCv/BxH10Y0wLybRTFyIzlTyLNkjBSWPSnvyEswhmd", - "k2tmObFJl4IFXIpIkObAJiS9imZmaLP7hoSJVE95tKhBN+9uW3cH2kgVwBuXushz/6UNNS983NGkGJIh", - "9+ZWIgwzVWSsMbmFbgicB07onTesHEJl/Y7jk/ydK45S5e1a3aUsjLOoEIDVohTe68qmuILNlnRDPPrC", - "C2hh51+OKnaShvGImAjRdKFmnJnnbJwxlZnnseC3kggtj+ztBgsWbf3mRa1MKjqawA0Dcx9Sj7ltprj9", - "4YYsPvav2VGUuPurNiMqjiW3aaTMWT+VKM/Le828GrQcYd3PaOyqc9UUVQJdXQdaVF4H+nkqsFbJMjlD", - "OISzff1jGTgdg81cgLjbqs81xAylPM1irTzA9pg8U5U+4KIYjmOkAH/ct1qIAkwa1iNJKIjPVvrbxetX", - "CPgnVCmBZkWgNqyBMi398nyresD+NXuGwxkyghHyEF4HNLoOimoUWyDEMkmMbOr1QLL+Fcr0mGG6NPpr", - "v6+7MkJ7iH7+YHoZaqxJk5HiN4RdBx+7qPRiStUsG+fv3jUsuMHlclFBedQxDGnLXa3VKyzxZsPMMIsQ", - "twwgXiCMClorm2RjyrBYNJV24ZlqDh0xN49ts+Ja3MFgsLX+aMAu1aOuVBpqTP24JJ13v5hgskJ5WTCV", - "yrhpMcDstfLIiON7kIxPceRuO/1QAdaoANZ2KQl3+N4qgNsfaPTRoG9MTKhiTUJDtR8noVMscEIU5Hv+", - "2Y/zEKVJ9d/uIA98DcaSryJvtwSeukL/bgmx9xvLKOUFiQAX9u8B/2DcItkXjPvkvsbFsUk1m5d2fFDo", - 
"CJvlELHrtz5eEPU9YNzgvlipy0n4DfH3oeDPC2JVpAJoNW62DUney6Zt/TaBIDiRthfTWNsyFzCn3gVh", - "CkEBP9m3/zo1GwK138d8+n6IDAhjW75Q2ixzuQ9YC0ULS/jIJOHIv7O5acIZZlMiUcfIz9//9W9Xgu33", - "f/3blmD7/V//BnLftgVFobu8eOD7Ifo7IWkPx3RO3GIg+JDMiVigvYEtawGvPJlu5DW7Zm+IygSTeeiO", - "XhfAxHQIKjuD9VCWEYkkgBByUE9sTIlxMXlMPEfLBpT3StHdJUvXrqC0AC0VHQ7AASVlVFEcI54pky4S", - "5gH3W4qJmDUH5cHr3rIl/+l6/qLInTLY2zMT3JDBmOKbHroz9ShNn6hzcfFsq49A3TdYAXFDYDcU3VhL", - "oP+DJ63nSYajVBkKQNnwplKSw0Zf24ltcx/OtqYEiM3eNgHZ2om2Xd1ifqjdLTxvfrg5L5zPFXbiknI3", - "+8I+fb2+2pytbMovt88O95ZhbjPOFyD7FtYk6thkwXlOkEpa+2+F9PfCgEvVEHIujLjJRHJvFs4xZ5OY", - "hgr13Fxsvcbc6qkiyENhB2/srBF266oHu5dFxXYlbqxRaOQhZPcpPWqDbiJGioD8Atd+SJJ1qHNCZcj1", - "tyVs6YU4BUBaIBZ0Wsaidb6dE/g9FzkrFfO8gqojyPvz8tihM1aXDffAFE9qDPEbMsJaxozSFZaHhM2X", - "+S66AiQrnEDfF2oO7k8Lum+HkA/NH5JHKKqBTXPBWZ6juwm9bBbvr7jRdgTPwi+IcFRtJmoyNRTLMp+i", - "cEbCG7MgWydnlUZw6krpfH09wKQi30D62+n/EPctDMcCVquMxVObvuPr2Yowwkam4pc7frQI5gEyRGmM", - "nSPVZMbAcsHCrT/UCeS9SIZ6XZsHREnnWRw7R/ycCFUkZC/z0+0PWj9ooSc7alupi1y+edkjLOQQk2NA", - "16iQuPzLX1ZbNhtmlvIDTdrYVwAqhxjNyuhn7L8JnUJ5YsQ/7T63qRH/tPvcJEf8096RSY+49dWQZXBf", - "rPm+tdcHjHxaeaVVoAFrMlmS12l7eat7UfhsOvpNVL58gj+0vjZaXxlcKxW/vDLAV1T9bML1b3NOkCOb", - "D9rwysWf/cFUvvt1PVmMLNXQq/jibY4QLook57YC18MLkKM5xpX5b0sfakGQK7UDh7qnJ12bv95knc8D", - "xO/Jo+rmce9aoh33/t2pR8mYTjOeyXJAO5QrILKo7VphwA9Nfy3Ec6MG+x1j6eA+Rce9K6g/8P4rqc71", - "DTXM25aDXaM8u1b3ozwXRzXttWc3wx/acyvtuQSu1dpznhL1a6rPZpBvpj87fPMB3F5h/qFB34cGLbPJ", - "hIaUMFWkElqKarHZwB7gvRJmnfCl0+gKE26tQRd5ilcrJxZ5v0UkQj74/SvOLl/Zw4yP5SYiPnKqaiEM", - "m3XV7w0fBvfLnO9fR33IKPaiXBrOrw2ayyExn66/GpL35O5BeO6GXDNXR+69YervUY6oSHEkSUxChW5n", - "NJzBPRH9G/RvrpHgNH2fXwzdGqIXEH9avqoKg3ckERTHkHycxyZv/vt5krwfLueMuDo7g4/MFRGTHeL9", - "ELk8ETmNSd2qfO9DryLGUqFX9jZLR2+44HFsEh2/1/AsrW/L3ggp7tBeM9/tEEZubYd0gt6XLoq8b7gp", - "4pDwpd6lb0T53eY882YtiiMBgDN31gmLGm6JaKj574jsDLypj1reVzHT+MrXVZYm85JP8/wCFVTGadoW", - "fe00AYvnSbICh1GnlFtfqohn6i9SRUSY0q8Wu5uQG3VwaP5Q+MYUKq1UajPVHHygsnevvaAKTDlmVwTC", - "/DVPksCUjUuwr6jD59/7qXe4bDDqnSld7vkhMza5tlNl9qV7OzXJYauJQLYRr3X5xjT4w2suruzKN0bD", - "b2DpFbOgUI2FReMF7G1Rz+ZhXVqAjSxWBvLOrstLI+5dI43YMjh/eBop8OMPTiUhF1A7W7padg8nuqxk", - "cZTIvQPFs4qiVF1n9V6dnW01EY0pv9xIMuKHOWwDPf/wMgXqiT08ajGlNHG+gFXOQk0QqtFGdzZrpdbg", - "mGe696X0qVBjQy6kIokx2CdZDDfvIKzeJjDA5RoiXUSVhGzaXXBZlepHXLMxmWh5mBKhx9afQ3q2wvbw", - "mbUXCufke25o8PuwayGjKphyWDVBrVaoI01dMlWf7ZTnf/3kKT0HQ7Vaw0SiTkxvTGE+NJco1g9bKy1d", - "U+DkS6dn+HTKykv4+K7dGpzNkfmPwOFOa2zNlah8cGztBSkTi+M/sNF+tibX8jWxYY1HB7tSrcf+NTsj", - "Sug2WBAU8jiGsgJGf99OBQ+3of5cmNLIFKKDyQHDa36dwIjH55fQzqSA714z/cdyBbT6RF0htdPt12t8", - "f6b25f9gPccscBVZ+Df8h1tn86OARhqSDSTK01WaOE9/KOK2pO0Ps/VBmq1wFpuvpjMVOASlWNqixX4T", - "1Vb62v5gHk7XnegrHM6uXLWI70Pbtcnl1w3jFvggiNKuKSImLcD90yTP8/8/0KtfGnBuCaDElGMT/FLA", - "1BX5o2H3l4+TK8Nxoyi5e6Utl3Lju6Gt+5Z8dg4uUK0Mj4dC5gbT3EogAXrZ+yTKdcpW2mau/hQUzctV", - "S1c+rVuu4mcyfOY+pKJuTF4wrH/N8gppLsOotq66zrRCEZU3pgdrPfWRv5CdsfNsNbtrpjgKcRyavPN5", - "RTdThVE2WF9vSlUOvxq9FYN4NjovZSfzkmUPyeTw4wTsXrkmGmCcVadWxqdf2Tb3EZ1uhdkGseluBT8i", - "01tEppeA1aYCi2neRxdZmnKhJFK3HEoaSwi3gUStYx4thij/jiFTBc+yOFu+LK+5ASWk9LdnlbIspQ7c", - "l6kgPVeyIjLX/iyMjXq0XPCloaZLrh99vRD7uurQ3bRMTGku1f2orhHlNVhsPQkNWwsv10WrqhG+mld5", - "HZowk4onrt/TE9TBmeK9KWEauEXJl1TwOY3qlUO/k3J/Z/iOJlmS18p+8RQqDwsTjQU16CEW0OEUuQsJ", - "iSQEZ21tWBpwuSqg3YtPq5vx5ZiY46aNOuU3vHdRJDjVW6x1TIfkinMUYzElW3+YuxmW1orLzacnD/9C", - "xtxhX6FntLyC0c6kbWlpfo3rF7m7434vX1x9P1ZYKQfkA7yiPM/VzKZbH98XCg7uTyTc922PqwfstdPW", - "1rwGNtOB7tGHMC95iGMUkTmJeQrFb03boBtkIralPIfb29pMi7UhNzwcHA6Cj+8+/v8AAAD//3a75jLy", - "3QAA", + 
"Oa2pWbBVPbiyXt9q4VRrVQjXsiaAOs6n5y4pVeFauizUShC3O1ire6/dbLYlCauj7x8+enzQ8rbWZ6na", + "KzKvfoZiPU9WKNQNO3XWRms7fHT45Mne/qMnuxvpR+6go2F/mg47yvtTyylT09keDeB/G03KHHX4p9Rw", + "3FGdUCU/zCdP6OMK0i0uUDRY3auynhc76cz8qgLeTsVdoS0dVVSuUpqwDplMCDiORgZuvWIytYCeVnMI", + "cYpDqhYeCxDfQowDypvULgK06L02WQ9Ibd8ITxQRcBohs3FxE67jBkf/aSy7Gi4ctr70KbNxkxX5uj6q", + "sSFNUFBU81C0cBAYjPCdgN/mwES3WFa8+vo5VCTqltLA1Y9/TIv2WW4drueJbouDbd9lFn9S2/L217az", + "ZHVUlOQ6xFeJ0GYS1BoBRBy1cbB7JLLnhky4PoKixh+sAPy0r0bj8nXslffdK3e3C6m7+bjtEu8tf2ck", + "2ObjlU7wN/mwfjMV8NHOwYK86LtbQQkfNpnzlaa0J4krB1K7uEpNgnV7OwmVGqMOSVK1cBH4zjLd2uy8", + "5yjv0IuMXzhmavDkS0RtX64M0/4fkkinfMTmBll7uLa0p42xkX519aQevmJsQptIoBpuUbseLdWKOgKr", + "ataY4jFg8Nm45GlWv0i1QZ2aJhO/oBxXIMAVqllnua70p5VWVppJ896Y89XPLOpDpavm84kgs+bX+kBf", + "c0alDeBePdOEuacqKNhzFkAGsBoEuYm+7AdYHfZxhu/yEcBaxhLVcvOZdZTy3L54CnfP37iMA3TiuoBp", + "1LMsPv28akcOq5Y3Y1X5I3eC7yU8y39WcLQm2qohZzFGd3WFJc26SJgJqhYXWiDY4DSCBRFHmUFDkBSw", + "CPi5GByC3T9+BDN14tFWXxBGBA3R0fkpYEmCGZ7qLbs6QzGdkHARxsTGKi+d7cJV+9fHpz1zySJPhgfF", + "CxQAxGWhOjo/hQQ4tmxAMOjv9iHXL08JwykNhsFefwdS/GgwwBK34Q4bPFpHlKZDkGSnkZW4T00TDVqZ", + "ciYNcHYHg1oZClwkGdn+RRoPixGvrZVCU+dnOd5iKQTXaQJ2+h+7wf5gZ6P5rM0L4hv2kuFMzbigvxGY", + "5qMNgfBJg54yY1W7TMTENixwNhj+XMXWn999fNcNZJYkWKuIBlwFrFIum1QYIhFGjNzay42/8HEfXRib", + "BJKEFBXUjMuARJolYaSw6E9/Q1iEMzon18xyYpPjBQu4yZEgzYFNHH0VzczQZvcNCROpnvJoUYNu3t22", + "7g60kSqAN67PkScsTBsKdfi4o8mLJEPuTQhFGGaqSLNjEiLdEDjEnNA7byw8xPf6vd0n+TtX0aXK27W6", + "S1kYZ1EhAKuVNLx3rE1FCJvi6YZ49IUX0MLOvxwK7SQN4xExYa3pQs04M8/ZOGMqM89jwW8lEVoe2SsZ", + "FizabM4rcZn8eTSBaxHmEqcec9tMcfvDDVl87F+zoyhxl25tGlccS25zX5kABSpRnkz4mnk1aDnCup/R", + "2JUUqymqBLq6DrSovA7081RgrZJlcoZwCAEJ+scycDoGm7kAcbdVn2uIGUp5msVaeYDtMcmxKn3A7TYc", + "x0gB/rhvtRAFmDSsR5JQEJ+t9LeL168Q8E8orQLNiuhyWANlWvrlSWL1gP1r9gyHM2QEIyRPvA5odB0U", + "JTS2QIhlkhjZ1OuBZP0r1BYyw3Rp9Nd+X3dlhPYQ/fzB9DLUWJMmI8VvCLsOPnZR6cWUqlk2zt+9a1hw", + "g6/mooLyqGMY0pa7D6xXWOLNhplhFiFuGUC8QBgVtFY2ycaUYbFoqkfDM9Uc72KuS9tmxV2+g8Fga/15", + "hl2qR12pNNSY+nFJOu9+McFkhfKyYCrVntNigNm78JERx/cgGZ/iyF3R+qECrFEBrO1SEu7wvVUAtz/Q", + "6KNB35iY+MqahIYSRU5Cp1jghChIUv2zH+chtJTqv93pI/gajCVfRd5uCTx1hf7dEmLvN9Z+yqsoAS7s", + "3wP+wbhFhjIY98l9jYtjkx83r0f5oNARNsshYtdvfbwg6nvAuMF9sVKXSPEb4u9DwZ8XxKpIBdBq3Gwb", + "MtOXTdv6FQhBcCJtL6axtmUuYE69C8IUgqqDsm//dWo2RJe/j/n0/RAZEMa25qK0qfFyH7AWihaW8JHJ", + "HJJ/ZxPqhDPMpkSijpGfv//r365u3O//+retG/f7v/4N5L5tq6BCd3nFw/dD9HdC0h6O6Zy4xUDEJJkT", + "sUB7A1uLA1550vPIa3bN3hCVCSbzeCO9LoCJ6RBUdgbroSwjEkkAISTOnthAGONi8ph4jpYNKO+VortL", + "lq5dQWkBWio6HICTTcqoojhGPFMmxyXMAy7lFBMxaw7Kg9e9ZUv+0/X8RZE7ZbC3Zya4IYMxFUM9dGeK", + "aJo+Uefi4tlWH4G6b7ACgp3Abii6sZZA/wdPWs+TDEepMhSAsuFNpcyMjb62E9vmPpxtTVkbm71tAlLM", + "E227usX8ULtbeN78cHNeOJ8r7MRlEm/2hX36en0FRVvZlF9unx3uLcPcpskvQPYtrEnUsRmO80QmlVz8", + "3wrp74UBl0o45FwYcZM+5d4snGPOJjENFeq5udgik7nVU0WQh8IO3thZI+zWVY/QL4uK7UrAWaPQyGPP", + "7lN61AbdRIwUtwgKXPshSdahzgmVIdfflrClF+IUAGmBWNBpGYvW+XZO4Pdc5KxUzPOyr44g78/LY4fO", + "WF023ANTPKkxxG/ICGtpPkr3bh4SNl/mu+iqpqxwAn1fqDm4Py3ovh1CPjR/SB6hqAY2zQVneWLxJvSy", + "qce/4kbbETwLvyDCUbWZqEkvUSzLfIrCGQlvzIJscZ9VGsGpq//z9fUAkz99A+lvp/9D3LcwHAtYrTIW", + "T23Oka9nK8IIG5mKX+740SKYB8gQpTF2jlSTzgPLBQu3/lAnkPciGerFeB4QJZ1ncewc8XMiVJFFvsxP", + "tz9o/aCFnuyobaUucvnmZY+wkENMjgFdo0LikkZ/WW3ZbJhZyg80aWNfAagcYjQro5+x/yZ0CuXZHP+0", + "+9zmc/zT7nOT0fFPe0cmp+PWV0OWwX2x5vvWXh8w8mnllVaBBqzJpHZep+3lre5F4bM59DdR+fIJ/tD6", + "2mh9ZXCtVPzycgZfUfWzWeK/zTlBjmw+aMMrF3/2B1P57tf1ZDGyVPiv4ou3iU24KDKz27JhDy9AjuYY", + "V+a/LX2oBUGu1A4c6p6edG3SfZMqPw8QvyePqpvHvWuJdtz7d6ceJWM6zXgmywHtUGOByKIgbYUBPzT9", + 
"tRDPjRrsd4ylg/sUHfeuoP7A+6+kOtc31DBvW8N2jfLsWt2P8lwc1bTXnt0Mf2jPrbTnErhWa895Htev", + "qT6bQb6Z/uzwzQdwe4X5hwZ9Hxq0zCYTGlLCVJGDaCmqxaYwe4D3Sph1wpdOoytMuLUGXSRXXq2cWOT9", + "FpEI+eD3rzi7RGcPMz6Wm4j4yKmqhTBs1lW/N3wY3C9zvn8d9SGj2ItyPTu/Nmguh8R8uv5qSN6Tuwfh", + "uRtyzVzxu/eGqb9HOaIixZEkMQkVup3RcAb3RPRv0L+5RoLT9H1+MXRriF5A/Gn5qioM3pFEUBxDxnQe", + "m2T/7+dJ8n64nDPi6uwMPjJXREx2iPdD5PJE5DQmdavyvQ+9ihhLhV7Z2ywdveGCx7HJzvxew7O0vi17", + "I6S4Q3vNfLdDGLm1HdIJel+6KPK+4aaIQ8KXepe+EeV3m5Pjm7UojgQAztxZJyxquCWioea/I7Iz8KY+", + "anlfxUzjK19XWZrMSz7N8wtUUBmnaVv0tdMELJ4nyQocRp1SQQCpIp6pv0gVEWHq1VrsbkJu1MGh+UPh", + "G1NdtVJezpSg8IHK3r32giowNaRd5Qrz1zxJAlPrLsG+ShSff++n3uGywah3pnS554fM2OTaTpXZl+7t", + "1CSHLYEC2Ua81uUb0+APr7m4WjHfGA2/gaVXzIJCCRkWjRewt0URnod1aQE2slgZyDu7Li+NuHeNNGJr", + "9/zhaaTAjz84lYRcQMFv6QrwPZzospLFUSL3DlT8KippdZ3Ve3V2ttVENKZmdCPJiB/msA30/MPLFCiC", + "9vCoxdT/xPkCVjkLNUGoRhvd2ayVAoljnunel9KnQmEQuZCKJMZgn2Qx3LyDsHqbwACXC590EVUS0nB3", + "wWVVKnpxzcZkouVhSoQeW38O6dkK28Nn1l4onJPvuaHB78OuhYyqYMph1QS1WnWRNHXJVH22U57/9ZOn", + "9BwM1WrhFYk6Mb0x1QTRXKJYP2yttHRNVZYvnZ7h0ykrrzvku3ZrcDZH5j8ChzutsTVXV/PBsbUXpEws", + "jv/ARvvZmlzL18SGhSkd7EoFKvvX7IwoodtgQVDI4xjqERj9fTsVPNyGonlhSiNTPQ8mBwyv+XUCIx6f", + "X0I7kwK+e830H8tl2+oTddXfTrdfr/H9mYKd/4P1HLPAVWTh3/Afbp3NjwIaaUg2kChPV2niPP2hiNs6", + "vD/M1gdptsJZbL6azlTgEJRiaSst+01UW55s+4N5OF13oq9wOLty1SK+D23XJpdfN4xb4IMgSrumiJi0", + "APdPkzzP//9Ar35pwLklgBJTjk3wSwFTV+SPht1fPk6uDMeNouTulbZcyo3vhrbuW/LZObhAtTI8HgqZ", + "G0xzK4EE6GXvkygXOFtpm7n6U1BtL1ctXd21brn8n8nwmfuQiroxeaWx/jXLS6u5DKPauuo60wpFVN6Y", + "Hqz11Ef+CnjGzrNl8K6Z4ijEcWjyzuel4Ez5Rtlgfb0plUf8avRWDOLZ6LwGnsxLlj0kk8OPE7B75Zpo", + "gHFWnVoZn35l29xHdLoVZhvEprsV/IhMbxGZXgJWmwospnkfXWRpyoWSSN1yqMMsIdwGErWOebQYovw7", + "hkwVPMvibPmyvOYGlJDS355VyrKUOnBfpoL0XMmKyFz7szA26tFywZeGmi65fvT1QuzrqkN30zIxpblU", + "96O6RpTXYLH1JDRsLbxcF62qRvhqXuV1aMJMKp64fk9PUAdnivemhGngFiVfUsHnNKqXHP1Oyv2d4Tua", + "ZEle4PvFUyhZLEw0FhTOh1hAh1PkLiQkkhCctbVhacDlqoB2Lz6tbsaXY2KOmzbqlN/w3kWR4FRvsdYx", + "HZIrzlGMxZRs/WHuZlhaKy43n548/AsZc4d9hZ7R8gpGO5O2paX5Na5f5O6O+718cfX9WGGlHJAP8Iry", + "PFczm259fF8oOLg/kXDftz2uHrDXTltb8xrYTAe6Rx/CvOQhjlFE5iTmKRS/NW2DbpCJ2JbyHG5vazMt", + "1obc8HBwOAg+vvv4/wMAAP//ZcQlB6feAAA=", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/lib/resources/resource.go b/lib/resources/resource.go index dcd90ab1..939fdcbb 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -64,6 +64,7 @@ type AllocationBreakdown struct { DiskBytes int64 `json:"disk_bytes"` NetworkDownloadBps int64 `json:"network_download_bps"` // External→VM NetworkUploadBps int64 `json:"network_upload_bps"` // VM→External + DiskIOBps int64 `json:"disk_io_bps"` // Disk I/O bandwidth } // DiskBreakdown shows disk usage by category. 
@@ -80,6 +81,7 @@ type FullResourceStatus struct {
 	Memory      ResourceStatus        `json:"memory"`
 	Disk        ResourceStatus        `json:"disk"`
 	Network     ResourceStatus        `json:"network"`
+	DiskIO      ResourceStatus        `json:"disk_io"`
 	DiskDetail  *DiskBreakdown        `json:"disk_breakdown,omitempty"`
 	GPU         *GPUResourceStatus    `json:"gpu,omitempty"` // nil if no GPU available
 	Allocations []AllocationBreakdown `json:"allocations"`
@@ -312,6 +314,11 @@ func (m *Manager) GetFullStatus(ctx context.Context) (*FullResourceStatus, error
 		return nil, err
 	}

+	diskIOStatus, err := m.GetStatus(ctx, ResourceDiskIO)
+	if err != nil {
+		return nil, err
+	}
+
 	// Get disk breakdown
 	var diskBreakdown *DiskBreakdown
 	m.mu.RLock()
@@ -347,6 +354,7 @@ func (m *Manager) GetFullStatus(ctx context.Context) (*FullResourceStatus, error
 				DiskBytes:          inst.OverlayBytes + inst.VolumeBytes,
 				NetworkDownloadBps: inst.NetworkDownloadBps,
 				NetworkUploadBps:   inst.NetworkUploadBps,
+				DiskIOBps:          inst.DiskIOBps,
 			})
 		}
 	}
@@ -361,6 +369,7 @@ func (m *Manager) GetFullStatus(ctx context.Context) (*FullResourceStatus, error
 		Memory:      *memStatus,
 		Disk:        *diskStatus,
 		Network:     *netStatus,
+		DiskIO:      *diskIOStatus,
 		DiskDetail:  diskBreakdown,
 		GPU:         gpuStatus,
 		Allocations: allocations,
diff --git a/openapi.yaml b/openapi.yaml
index 219c553a..ec88bfdd 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -1058,6 +1058,11 @@ components:
           format: int64
           description: Upload bandwidth limit in bytes/sec (VM→external)
           example: 125000000
+        disk_io_bps:
+          type: integer
+          format: int64
+          description: Disk I/O bandwidth limit in bytes/sec
+          example: 104857600

     Resources:
       type: object
@@ -1071,6 +1076,8 @@ components:
           $ref: "#/components/schemas/ResourceStatus"
         network:
           $ref: "#/components/schemas/ResourceStatus"
+        disk_io:
+          $ref: "#/components/schemas/ResourceStatus"
         disk_breakdown:
           $ref: "#/components/schemas/DiskBreakdown"
         gpu:

From ca5e720a89022a93fb1ce2d5738c834965e3ad99 Mon Sep 17 00:00:00 2001
From: Steven Miller
Date: Fri, 30 Jan 2026 14:48:44 -0500
Subject: [PATCH 5/5] Also check resources on restore

---
 lib/instances/restore.go | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/instances/restore.go b/lib/instances/restore.go
index 8acc6466..ed81fbfd 100644
--- a/lib/instances/restore.go
+++ b/lib/instances/restore.go
@@ -51,6 +51,16 @@ func (m *manager) restoreInstance(
 		return nil, fmt.Errorf("no snapshot available for instance %s", id)
 	}

+	// 2b. Validate aggregate resource limits before allocating resources (if configured)
+	if m.resourceValidator != nil {
+		needsGPU := stored.GPUProfile != ""
+		totalMemory := stored.Size + stored.HotplugSize
+		if err := m.resourceValidator.ValidateAllocation(ctx, stored.Vcpus, totalMemory, stored.NetworkBandwidthDownload, stored.NetworkBandwidthUpload, stored.DiskIOBps, needsGPU); err != nil {
+			log.ErrorContext(ctx, "resource validation failed for restore", "instance_id", id, "error", err)
+			return nil, fmt.Errorf("%w: %v", ErrInsufficientResources, err)
+		}
+	}
+
 	// 3. Get snapshot directory
 	snapshotDir := m.paths.InstanceSnapshotLatest(id)