From 86d5c635a3a1643160241272ef9e95368ff83bc2 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 30 Jan 2026 11:43:43 -0500 Subject: [PATCH 1/5] Use resources module for input validation --- cmd/api/api/instances.go | 28 +++-- cmd/api/config/config.go | 8 +- cmd/api/main.go | 6 + integration/systemd_test.go | 2 - integration/vgpu_test.go | 3 - lib/instances/create.go | 45 ++----- lib/instances/errors.go | 3 + lib/instances/manager.go | 41 +++++-- lib/instances/manager_test.go | 4 - lib/instances/qemu_test.go | 2 - lib/instances/resource_limits_test.go | 170 +------------------------- lib/instances/standby.go | 6 + lib/instances/start.go | 31 +++++ lib/instances/stop.go | 13 +- lib/oapi/oapi.go | 68 +++++++---- lib/providers/providers.go | 14 +-- lib/resources/resource.go | 62 ++++++++++ openapi.yaml | 6 + 18 files changed, 230 insertions(+), 282 deletions(-) diff --git a/cmd/api/api/instances.go b/cmd/api/api/instances.go index e95536d4..62621dec 100644 --- a/cmd/api/api/instances.go +++ b/cmd/api/api/instances.go @@ -252,6 +252,11 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst Code: "name_conflict", Message: err.Error(), }, nil + case errors.Is(err, instances.ErrInsufficientResources): + return oapi.CreateInstance409JSONResponse{ + Code: "insufficient_resources", + Message: err.Error(), + }, nil default: log.ErrorContext(ctx, "failed to create instance", "error", err, "image", request.Body.Image) return oapi.CreateInstance500JSONResponse{ @@ -309,15 +314,15 @@ func (s *ApiService) GetInstanceStats(ctx context.Context, request oapi.GetInsta // vmStatsToOAPI converts vm_metrics.VMStats to oapi.InstanceStats func vmStatsToOAPI(s *vm_metrics.VMStats) oapi.InstanceStats { stats := oapi.InstanceStats{ - InstanceId: s.InstanceID, - InstanceName: s.InstanceName, - CpuSeconds: s.CPUSeconds(), - MemoryRssBytes: int64(s.MemoryRSSBytes), - MemoryVmsBytes: int64(s.MemoryVMSBytes), - NetworkRxBytes: int64(s.NetRxBytes), - NetworkTxBytes: int64(s.NetTxBytes), - AllocatedVcpus: s.AllocatedVcpus, - AllocatedMemoryBytes: s.AllocatedMemoryBytes, + InstanceId: s.InstanceID, + InstanceName: s.InstanceName, + CpuSeconds: s.CPUSeconds(), + MemoryRssBytes: int64(s.MemoryRSSBytes), + MemoryVmsBytes: int64(s.MemoryVMSBytes), + NetworkRxBytes: int64(s.NetRxBytes), + NetworkTxBytes: int64(s.NetTxBytes), + AllocatedVcpus: s.AllocatedVcpus, + AllocatedMemoryBytes: s.AllocatedMemoryBytes, MemoryUtilizationRatio: s.MemoryUtilizationRatio(), } return stats @@ -464,6 +469,11 @@ func (s *ApiService) StartInstance(ctx context.Context, request oapi.StartInstan Code: "invalid_state", Message: err.Error(), }, nil + case errors.Is(err, instances.ErrInsufficientResources): + return oapi.StartInstance409JSONResponse{ + Code: "insufficient_resources", + Message: err.Error(), + }, nil default: log.ErrorContext(ctx, "failed to start instance", "error", err) return oapi.StartInstance500JSONResponse{ diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 0fd01f16..79dcd0e5 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -69,8 +69,7 @@ type Config struct { MaxMemoryPerInstance string // Max memory for a single VM (0 = unlimited) // Resource limits - aggregate - MaxTotalVcpus int // Aggregate vCPU limit across all instances (0 = unlimited) - MaxTotalMemory string // Aggregate memory limit across all instances (0 = unlimited) + // Note: CPU/memory aggregate limits are now handled via oversubscription ratios (OVERSUB_CPU, OVERSUB_MEMORY) 
MaxTotalVolumeStorage string // Total volume storage limit (0 = unlimited) // OpenTelemetry configuration @@ -166,9 +165,8 @@ func Load() *Config { MaxVcpusPerInstance: getEnvInt("MAX_VCPUS_PER_INSTANCE", 16), MaxMemoryPerInstance: getEnv("MAX_MEMORY_PER_INSTANCE", "32GB"), - // Resource limits - aggregate (0 or empty = unlimited) - MaxTotalVcpus: getEnvInt("MAX_TOTAL_VCPUS", 0), - MaxTotalMemory: getEnv("MAX_TOTAL_MEMORY", ""), + // Resource limits - aggregate + // Note: CPU/memory aggregate limits are now handled via oversubscription ratios MaxTotalVolumeStorage: getEnv("MAX_TOTAL_VOLUME_STORAGE", ""), // OpenTelemetry configuration diff --git a/cmd/api/main.go b/cmd/api/main.go index fef9f322..7f5e4265 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -225,6 +225,12 @@ func run() error { logger.Warn("failed to reconcile mdev devices", "error", err) } + // Wire up resource validator for aggregate limit checking + // This enables the instance manager to validate CPU, memory, network, and GPU + // availability before creating or starting instances. + app.InstanceManager.SetResourceValidator(app.ResourceManager) + logger.Info("Resource validator configured") + // Initialize ingress manager (starts Caddy daemon and DNS server for dynamic upstreams) logger.Info("Initializing ingress manager...") if err := app.IngressManager.Initialize(app.Ctx); err != nil { diff --git a/integration/systemd_test.go b/integration/systemd_test.go index bba12df0..79973d7c 100644 --- a/integration/systemd_test.go +++ b/integration/systemd_test.go @@ -67,8 +67,6 @@ func TestSystemdMode(t *testing.T) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, MaxVcpusPerInstance: 0, MaxMemoryPerInstance: 0, - MaxTotalVcpus: 0, - MaxTotalMemory: 0, } instanceManager := instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil) diff --git a/integration/vgpu_test.go b/integration/vgpu_test.go index ea0d668f..8c02eabc 100644 --- a/integration/vgpu_test.go +++ b/integration/vgpu_test.go @@ -76,8 +76,6 @@ func TestVGPU(t *testing.T) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, MaxVcpusPerInstance: 0, MaxMemoryPerInstance: 0, - MaxTotalVcpus: 0, - MaxTotalMemory: 0, } instanceManager := instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil) @@ -272,4 +270,3 @@ func checkVGPUTestPrerequisites() (string, string) { return "vGPU test requires at least one available VF (all VFs are in use)", "" } - diff --git a/lib/instances/create.go b/lib/instances/create.go index 904e589a..35088d49 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -8,7 +8,6 @@ import ( "strings" "time" - "github.com/nrednav/cuid2" "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" @@ -16,6 +15,7 @@ import ( "github.com/kernel/hypeman/lib/network" "github.com/kernel/hypeman/lib/system" "github.com/kernel/hypeman/lib/volumes" + "github.com/nrednav/cuid2" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" "gvisor.dev/gvisor/pkg/cleanup" @@ -48,31 +48,6 @@ var systemDirectories = []string{ "/var", } -// AggregateUsage represents total resource usage across all instances -type AggregateUsage struct { - TotalVcpus int - TotalMemory int64 // in bytes -} - -// calculateAggregateUsage calculates total resource usage across all running instances -func (m *manager) calculateAggregateUsage(ctx context.Context) (AggregateUsage, error) { - instances, 
err := m.listInstances(ctx) - if err != nil { - return AggregateUsage{}, err - } - - var usage AggregateUsage - for _, inst := range instances { - // Only count running/paused instances (those consuming resources) - if inst.State == StateRunning || inst.State == StatePaused || inst.State == StateCreated { - usage.TotalVcpus += inst.Vcpus - usage.TotalMemory += inst.Size + inst.HotplugSize - } - } - - return usage, nil -} - // generateVsockCID converts first 8 chars of instance ID to a unique CID // CIDs 0-2 are reserved (hypervisor, loopback, host) // Returns value in range 3 to 4294967295 @@ -174,18 +149,12 @@ func (m *manager) createInstance( return nil, fmt.Errorf("total memory %d (size + hotplug_size) exceeds maximum allowed %d per instance", totalMemory, m.limits.MaxMemoryPerInstance) } - // Validate aggregate resource limits - if m.limits.MaxTotalVcpus > 0 || m.limits.MaxTotalMemory > 0 { - usage, err := m.calculateAggregateUsage(ctx) - if err != nil { - log.WarnContext(ctx, "failed to calculate aggregate usage, skipping limit check", "error", err) - } else { - if m.limits.MaxTotalVcpus > 0 && usage.TotalVcpus+vcpus > m.limits.MaxTotalVcpus { - return nil, fmt.Errorf("total vcpus would be %d, exceeds aggregate limit of %d", usage.TotalVcpus+vcpus, m.limits.MaxTotalVcpus) - } - if m.limits.MaxTotalMemory > 0 && usage.TotalMemory+totalMemory > m.limits.MaxTotalMemory { - return nil, fmt.Errorf("total memory would be %d, exceeds aggregate limit of %d", usage.TotalMemory+totalMemory, m.limits.MaxTotalMemory) - } + // Validate aggregate resource limits via ResourceValidator + if m.resourceValidator != nil { + needsGPU := req.GPU != nil && req.GPU.Profile != "" + if err := m.resourceValidator.ValidateAllocation(ctx, vcpus, totalMemory, req.NetworkBandwidthDownload, req.NetworkBandwidthUpload, req.DiskIOBps, needsGPU); err != nil { + log.ErrorContext(ctx, "resource validation failed", "error", err) + return nil, fmt.Errorf("%w: %v", ErrInsufficientResources, err) } } diff --git a/lib/instances/errors.go b/lib/instances/errors.go index 1090104e..9925bb02 100644 --- a/lib/instances/errors.go +++ b/lib/instances/errors.go @@ -17,4 +17,7 @@ var ( // ErrAmbiguousName is returned when multiple instances have the same name ErrAmbiguousName = errors.New("multiple instances with the same name") + + // ErrInsufficientResources is returned when resources (CPU, memory, network, GPU) are not available + ErrInsufficientResources = errors.New("insufficient resources") ) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 09c9616e..23860d28 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -41,6 +41,9 @@ type Manager interface { // ListRunningInstancesInfo returns info needed for utilization metrics collection. // Used by the resource manager for VM utilization tracking. ListRunningInstancesInfo(ctx context.Context) ([]resources.InstanceUtilizationInfo, error) + // SetResourceValidator sets the validator for aggregate resource limit checking. + // Called after initialization to avoid circular dependencies. 
+ SetResourceValidator(v ResourceValidator) } // ResourceLimits contains configurable resource limits for instances @@ -48,21 +51,28 @@ type ResourceLimits struct { MaxOverlaySize int64 // Maximum overlay disk size in bytes per instance MaxVcpusPerInstance int // Maximum vCPUs per instance (0 = unlimited) MaxMemoryPerInstance int64 // Maximum memory in bytes per instance (0 = unlimited) - MaxTotalVcpus int // Maximum total vCPUs across all instances (0 = unlimited) - MaxTotalMemory int64 // Maximum total memory in bytes across all instances (0 = unlimited) +} + +// ResourceValidator validates if resources can be allocated +type ResourceValidator interface { + // ValidateAllocation checks if the requested resources are available. + // Returns nil if allocation is allowed, or a detailed error describing + // which resource is insufficient and the current capacity/usage. + ValidateAllocation(ctx context.Context, vcpus int, memoryBytes int64, networkDownloadBps int64, networkUploadBps int64, diskIOBps int64, needsGPU bool) error } type manager struct { - paths *paths.Paths - imageManager images.Manager - systemManager system.Manager - networkManager network.Manager - deviceManager devices.Manager - volumeManager volumes.Manager - limits ResourceLimits - instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks - hostTopology *HostTopology // Cached host CPU topology - metrics *Metrics + paths *paths.Paths + imageManager images.Manager + systemManager system.Manager + networkManager network.Manager + deviceManager devices.Manager + volumeManager volumes.Manager + limits ResourceLimits + resourceValidator ResourceValidator // Optional validator for aggregate resource limits + instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks + hostTopology *HostTopology // Cached host CPU topology + metrics *Metrics // Hypervisor support vmStarters map[hypervisor.Type]hypervisor.VMStarter @@ -106,6 +116,12 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste return m } +// SetResourceValidator sets the resource validator for aggregate limit checking. +// This is called after initialization to avoid circular dependencies. +func (m *manager) SetResourceValidator(v ResourceValidator) { + m.resourceValidator = v +} + // getHypervisor creates a hypervisor client for the given socket and type. // Used for connecting to already-running VMs (e.g., for state queries). 
func (m *manager) getHypervisor(socketPath string, hvType hypervisor.Type) (hypervisor.Hypervisor, error) { @@ -366,4 +382,3 @@ func (m *manager) ListRunningInstancesInfo(ctx context.Context) ([]resources.Ins return infos, nil } - diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 839f2b90..e06e659d 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -54,8 +54,6 @@ func setupTestManager(t *testing.T) (*manager, string) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited MaxMemoryPerInstance: 0, // unlimited - MaxTotalVcpus: 0, // unlimited - MaxTotalMemory: 0, // unlimited } mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) @@ -938,8 +936,6 @@ func TestStorageOperations(t *testing.T) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited MaxMemoryPerInstance: 0, // unlimited - MaxTotalVcpus: 0, // unlimited - MaxTotalMemory: 0, // unlimited } manager := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index 6ff8118e..e276676c 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -52,8 +52,6 @@ func setupTestManagerForQEMU(t *testing.T) (*manager, string) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB MaxVcpusPerInstance: 0, // unlimited MaxMemoryPerInstance: 0, // unlimited - MaxTotalVcpus: 0, // unlimited - MaxTotalMemory: 0, // unlimited } mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, hypervisor.TypeQEMU, nil, nil).(*manager) diff --git a/lib/instances/resource_limits_test.go b/lib/instances/resource_limits_test.go index 767d9e7c..fe100aeb 100644 --- a/lib/instances/resource_limits_test.go +++ b/lib/instances/resource_limits_test.go @@ -2,10 +2,8 @@ package instances import ( "context" - "os" "syscall" "testing" - "time" "github.com/kernel/hypeman/cmd/api/config" "github.com/kernel/hypeman/lib/devices" @@ -171,15 +169,11 @@ func TestResourceLimits_StructValues(t *testing.T) { MaxOverlaySize: 10 * 1024 * 1024 * 1024, // 10GB MaxVcpusPerInstance: 4, MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB - MaxTotalVcpus: 16, - MaxTotalMemory: 32 * 1024 * 1024 * 1024, // 32GB } assert.Equal(t, int64(10*1024*1024*1024), limits.MaxOverlaySize) assert.Equal(t, 4, limits.MaxVcpusPerInstance) assert.Equal(t, int64(8*1024*1024*1024), limits.MaxMemoryPerInstance) - assert.Equal(t, 16, limits.MaxTotalVcpus) - assert.Equal(t, int64(32*1024*1024*1024), limits.MaxTotalMemory) } func TestResourceLimits_ZeroMeansUnlimited(t *testing.T) { @@ -188,8 +182,6 @@ func TestResourceLimits_ZeroMeansUnlimited(t *testing.T) { MaxOverlaySize: 100 * 1024 * 1024 * 1024, MaxVcpusPerInstance: 0, // unlimited MaxMemoryPerInstance: 0, // unlimited - MaxTotalVcpus: 0, // unlimited - MaxTotalMemory: 0, // unlimited } mgr := createTestManager(t, limits) @@ -200,166 +192,8 @@ func TestResourceLimits_ZeroMeansUnlimited(t *testing.T) { assert.Equal(t, int64(0), mgr.limits.MaxMemoryPerInstance) } -func TestAggregateUsage_NoInstances(t *testing.T) { - limits := ResourceLimits{ - MaxOverlaySize: 100 * 1024 * 1024 * 1024, - } - mgr := createTestManager(t, limits) - - usage, err := mgr.calculateAggregateUsage(context.Background()) - require.NoError(t, err) - - assert.Equal(t, 0, usage.TotalVcpus) - 
assert.Equal(t, int64(0), usage.TotalMemory) -} - -func TestAggregateUsage_StructValues(t *testing.T) { - usage := AggregateUsage{ - TotalVcpus: 8, - TotalMemory: 16 * 1024 * 1024 * 1024, - } - - assert.Equal(t, 8, usage.TotalVcpus) - assert.Equal(t, int64(16*1024*1024*1024), usage.TotalMemory) -} - -// TestAggregateLimits_EnforcedAtRuntime is an integration test that verifies -// aggregate resource limits are enforced when creating VMs. -// It creates one VM, then tries to create another that would exceed the total limit. -func TestAggregateLimits_EnforcedAtRuntime(t *testing.T) { - // Skip in short mode - this is an integration test - if testing.Short() { - t.Skip("skipping integration test in short mode") - } - - // Require KVM access - if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { - t.Skip("/dev/kvm not available - skipping VM test") - } - - ctx := context.Background() - tmpDir := t.TempDir() - - cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", - SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", - } - - p := paths.New(tmpDir) - imageManager, err := images.NewManager(p, 1, nil) - require.NoError(t, err) - - systemManager := system.NewManager(p) - networkManager := network.NewManager(p, cfg, nil) - deviceManager := devices.NewManager(p) - volumeManager := volumes.NewManager(p, 0, nil) - - // Set small aggregate limits: - // - MaxTotalVcpus: 2 (first VM gets 1, second wants 2 -> denied) - // - MaxTotalMemory: 6GB (first VM gets 2.5GB, second wants 4GB -> denied) - limits := ResourceLimits{ - MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB - MaxVcpusPerInstance: 4, // per-instance limit (high) - MaxMemoryPerInstance: 8 * 1024 * 1024 * 1024, // 8GB per-instance (high) - MaxTotalVcpus: 2, // aggregate: only 2 total - MaxTotalMemory: 6 * 1024 * 1024 * 1024, // aggregate: only 6GB total (allows first 2.5GB VM) - } - - mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) - - // Cleanup any orphaned processes on test end - t.Cleanup(func() { - cleanupTestProcesses(t, mgr) - }) - - // Pull a small image - t.Log("Pulling alpine image...") - alpineImage, err := imageManager.CreateImage(ctx, images.CreateImageRequest{ - Name: "docker.io/library/alpine:latest", - }) - require.NoError(t, err) - - // Wait for image to be ready - t.Log("Waiting for image build...") - for i := 0; i < 120; i++ { - img, err := imageManager.GetImage(ctx, alpineImage.Name) - if err == nil && img.Status == images.StatusReady { - break - } - if err == nil && img.Status == images.StatusFailed { - t.Fatalf("Image build failed: %s", *img.Error) - } - time.Sleep(1 * time.Second) - } - t.Log("Image ready") - - // Ensure system files - t.Log("Ensuring system files...") - err = systemManager.EnsureSystemFiles(ctx) - require.NoError(t, err) - - // Check initial aggregate usage (should be 0) - usage, err := mgr.calculateAggregateUsage(ctx) - require.NoError(t, err) - assert.Equal(t, 0, usage.TotalVcpus, "Initial vCPUs should be 0") - assert.Equal(t, int64(0), usage.TotalMemory, "Initial memory should be 0") - - // Create first VM: 1 vCPU, 2GB + 512MB = 2.5GB memory - t.Log("Creating first instance (1 vCPU, 2.5GB memory)...") - inst1, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ - Name: "small-vm-1", - Image: "docker.io/library/alpine:latest", - Vcpus: 1, - Size: 2 * 1024 * 1024 * 1024, // 2GB (needs extra room for initrd with NVIDIA libs) - HotplugSize: 512 * 1024 * 1024, // 512MB - OverlaySize: 1 * 1024 * 1024 * 1024, - 
NetworkEnabled: false, - }) - require.NoError(t, err) - require.NotNil(t, inst1) - t.Logf("First instance created: %s", inst1.Id) - - // Verify aggregate usage increased - usage, err = mgr.calculateAggregateUsage(ctx) - require.NoError(t, err) - assert.Equal(t, 1, usage.TotalVcpus, "Should have 1 vCPU in use") - assert.Equal(t, int64(2*1024*1024*1024+512*1024*1024), usage.TotalMemory, "Should have 2.5GB memory in use") - t.Logf("Aggregate usage after first VM: %d vCPUs, %d bytes memory", usage.TotalVcpus, usage.TotalMemory) - - // Try to create second VM: 2 vCPUs (would exceed MaxTotalVcpus=2) - // Note: 2 vCPUs alone is fine (under MaxVcpusPerInstance=4), but 1+2=3 exceeds MaxTotalVcpus=2 - t.Log("Attempting to create second instance (2 vCPUs) - should be denied...") - _, err = mgr.CreateInstance(ctx, CreateInstanceRequest{ - Name: "big-vm-2", - Image: "docker.io/library/alpine:latest", - Vcpus: 2, // This would make total 3, exceeding limit of 2 - Size: 256 * 1024 * 1024, - HotplugSize: 256 * 1024 * 1024, - OverlaySize: 1 * 1024 * 1024 * 1024, - NetworkEnabled: false, - }) - require.Error(t, err, "Should deny creation due to aggregate vCPU limit") - assert.Contains(t, err.Error(), "exceeds aggregate limit") - t.Logf("Second instance correctly denied: %v", err) - - // Verify aggregate usage didn't change (failed creation shouldn't affect it) - usage, err = mgr.calculateAggregateUsage(ctx) - require.NoError(t, err) - assert.Equal(t, 1, usage.TotalVcpus, "vCPUs should still be 1") - - // Clean up first instance - t.Log("Deleting first instance...") - err = mgr.DeleteInstance(ctx, inst1.Id) - require.NoError(t, err) - - // Verify aggregate usage is back to 0 - usage, err = mgr.calculateAggregateUsage(ctx) - require.NoError(t, err) - assert.Equal(t, 0, usage.TotalVcpus, "vCPUs should be 0 after deletion") - t.Log("Test passed: aggregate limits enforced correctly") -} +// Note: Aggregate resource limits are now handled by ResourceValidator in lib/resources. +// Tests for aggregate limits should be in lib/resources/resource_test.go. // cleanupTestProcesses kills any Cloud Hypervisor processes started during test func cleanupTestProcesses(t *testing.T, mgr *manager) { diff --git a/lib/instances/standby.go b/lib/instances/standby.go index ba19df99..f3b1cfa2 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -47,6 +47,12 @@ func (m *manager) standbyInstance( return nil, fmt.Errorf("%w: cannot standby from state %s", ErrInvalidState, inst.State) } + // 2b. Block standby for vGPU instances (driver limitation - NVIDIA vGPU doesn't support snapshots) + if inst.GPUMdevUUID != "" || inst.GPUProfile != "" { + log.ErrorContext(ctx, "standby not supported for vGPU instances", "instance_id", id, "gpu_profile", inst.GPUProfile) + return nil, fmt.Errorf("%w: standby is not supported for instances with vGPU attached (driver limitation)", ErrInvalidState) + } + // 3. 
Get network allocation BEFORE killing VMM (while we can still query it) // This is needed to delete the TAP device after VMM shuts down var networkAlloc *network.Allocation diff --git a/lib/instances/start.go b/lib/instances/start.go index 9f495f57..aaa7cb8c 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -5,6 +5,7 @@ import ( "fmt" "time" + "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/network" "go.opentelemetry.io/otel/trace" @@ -45,6 +46,16 @@ func (m *manager) startInstance( return nil, fmt.Errorf("%w: cannot start from state %s, must be Stopped", ErrInvalidState, inst.State) } + // 2b. Validate aggregate resource limits before allocating resources + if m.resourceValidator != nil { + needsGPU := stored.GPUProfile != "" + totalMemory := stored.Size + stored.HotplugSize + if err := m.resourceValidator.ValidateAllocation(ctx, stored.Vcpus, totalMemory, stored.NetworkBandwidthDownload, stored.NetworkBandwidthUpload, stored.DiskIOBps, needsGPU); err != nil { + log.ErrorContext(ctx, "resource validation failed for start", "instance_id", id, "error", err) + return nil, fmt.Errorf("%w: %v", ErrInsufficientResources, err) + } + } + // 3. Get image info (needed for buildHypervisorConfig) log.DebugContext(ctx, "getting image info", "instance_id", id, "image", stored.Image) imageInfo, err := m.imageManager.GetImage(ctx, stored.Image) @@ -81,6 +92,26 @@ func (m *manager) startInstance( }) } + // 4b. Recreate vGPU mdev if this instance had a GPU profile + // Note: GPU availability was already validated in step 2b + if stored.GPUProfile != "" { + log.InfoContext(ctx, "creating vGPU mdev for start", "instance_id", id, "profile", stored.GPUProfile) + mdev, err := devices.CreateMdev(ctx, stored.GPUProfile, id) + if err != nil { + log.ErrorContext(ctx, "failed to create mdev", "instance_id", id, "profile", stored.GPUProfile, "error", err) + return nil, fmt.Errorf("create vGPU mdev for profile %s: %w", stored.GPUProfile, err) + } + stored.GPUMdevUUID = mdev.UUID + log.InfoContext(ctx, "created vGPU mdev", "instance_id", id, "profile", stored.GPUProfile, "uuid", mdev.UUID) + // Add mdev cleanup to stack + cu.Add(func() { + log.DebugContext(ctx, "destroying mdev on cleanup", "instance_id", id, "uuid", mdev.UUID) + if err := devices.DestroyMdev(ctx, mdev.UUID); err != nil { + log.WarnContext(ctx, "failed to destroy mdev on cleanup", "instance_id", id, "uuid", mdev.UUID, "error", err) + } + }) + } + // 5. Regenerate config disk with new network configuration instForConfig := &Instance{StoredMetadata: *stored} log.DebugContext(ctx, "regenerating config disk", "instance_id", id) diff --git a/lib/instances/stop.go b/lib/instances/stop.go index 20888e8a..973f7641 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -5,6 +5,7 @@ import ( "fmt" "time" + "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/network" "go.opentelemetry.io/otel/trace" @@ -71,10 +72,20 @@ func (m *manager) stopInstance( } } - // 6. Update metadata (clear PID, set StoppedAt) + // 6. 
Destroy vGPU mdev device if present (frees vGPU slot for other VMs) + if inst.GPUMdevUUID != "" { + log.InfoContext(ctx, "destroying vGPU mdev on stop", "instance_id", id, "uuid", inst.GPUMdevUUID) + if err := devices.DestroyMdev(ctx, inst.GPUMdevUUID); err != nil { + // Log error but continue - mdev cleanup is best-effort + log.WarnContext(ctx, "failed to destroy mdev on stop", "instance_id", id, "uuid", inst.GPUMdevUUID, "error", err) + } + } + + // 7. Update metadata (clear PID, mdev UUID, set StoppedAt) now := time.Now() stored.StoppedAt = &now stored.HypervisorPID = nil + stored.GPUMdevUUID = "" // Clear mdev UUID since we destroyed it meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { diff --git a/lib/oapi/oapi.go b/lib/oapi/oapi.go index be37cc74..8ca161fa 100644 --- a/lib/oapi/oapi.go +++ b/lib/oapi/oapi.go @@ -3645,6 +3645,7 @@ type CreateInstanceResponse struct { JSON201 *Instance JSON400 *Error JSON401 *Error + JSON409 *Error JSON500 *Error } @@ -5304,6 +5305,13 @@ func ParseCreateInstanceResponse(rsp *http.Response) (*CreateInstanceResponse, e } response.JSON401 = &dest + case strings.Contains(rsp.Header.Get("Content-Type"), "json") && rsp.StatusCode == 409: + var dest Error + if err := json.Unmarshal(bodyBytes, &dest); err != nil { + return nil, err + } + response.JSON409 = &dest + case strings.Contains(rsp.Header.Get("Content-Type"), "json") && rsp.StatusCode == 500: var dest Error if err := json.Unmarshal(bodyBytes, &dest); err != nil { @@ -8452,6 +8460,15 @@ func (response CreateInstance401JSONResponse) VisitCreateInstanceResponse(w http return json.NewEncoder(w).Encode(response) } +type CreateInstance409JSONResponse Error + +func (response CreateInstance409JSONResponse) VisitCreateInstanceResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(409) + + return json.NewEncoder(w).Encode(response) +} + type CreateInstance500JSONResponse Error func (response CreateInstance500JSONResponse) VisitCreateInstanceResponse(w http.ResponseWriter) error { @@ -10384,31 +10401,32 @@ var swaggerSpec = []string{ "rPm+tdcHjHxaeaVVoAFrMlmS12l7eat7UfhsOvpNVL58gj+0vjZaXxlcKxW/vDLAV1T9bML1b3NOkCOb", "D9rwysWf/cFUvvt1PVmMLNXQq/jibY4QLook57YC18MLkKM5xpX5b0sfakGQK7UDh7qnJ12bv95knc8D", "xO/Jo+rmce9aoh33/t2pR8mYTjOeyXJAO5QrILKo7VphwA9Nfy3Ec6MG+x1j6eA+Rce9K6g/8P4rqc71", - "DTXM25aDXaM8u1b3ozwXRzXttWc3wx/acyvtuQSu1dpznhL1a6rPZpBvpj87fPMB3F5h/iNq0A/t2gaz", - "Pu7SYW+Fx7VWUIs0wKtlv8WNb3HQnw9+/3qpSwf2MMNPuQk4j5wmWMiaZlXwe8OHwf3yvvtXAR8yir0o", - "V17zK1vm7kXMp+tvXuQ9uWsGnqsX18yVaXtvrkO+RzmiIsWRJDEJFbqd0XAG1zD0b9C/uaWB0/R9fu9y", - "a4heQHhn+SYoDN6RRFAcQ25vHpu09O/nSfJ+uJyS4ersDD4yNzBM8oX3Q+TSMOQ0JnWr8rUKvYoYS4Ve", - "2csiHb3hgsexySP8XsOztL4te+GiuKJ6zXyXLxi5tR3SCXpfuofxvuEihkPCl3qXvhHld5vTuJu1KI4E", - "AM5cCScsariEoaHmv4KxM/BmFmp5HcRM4yvfBlmazEs+za/vV1AZp2lb9LXTBCyeJ8kKHEadUup6qSKe", - "qb9IFRFhKqta7G5CbtTBoflD4RtTB7RSCM0US/CByl5t9oIqMNWOXY0F89c8SQJTlS3BvpoJn3+tpt7h", - "sj2md6Z0d+aHzNjkVkyV2ZeuxdQkhy3WAck8vMbbG9PgD6+5uKom3xgN7/8oojQLCsVOWDRewN4W5WIe", - "1p0A2MhiZSDv7Lq8NOLeNdKIrTLzh6eRAj/+4FQScgGlqaUrFfdwgrdKFkeJ3DtQm6qo+dR1Vu/V2dlW", - "E9GY6saNJCN+mMM2jvIPL1OgXNfDoxZTqRLnC1jlLNQEoRptdGezVkr5jXmme1/KTgolLORCKpIYg32S", - "xXCxDaLWbX4AXC7R0UVUSUhW3QWXVak8wzUbk4mWhykRemz9OWQ/K2wPn1l7oXBOvueGBr8PuxYSloIp", - "h1UT1Gp1MNLU5Sr12U55etVPntJzMFSrJUIk6sT0xtS9Q3OJYv2wtdLSNfVDvnT2g0+nrLxCju9Wq8HZ", - "HJn/CBzutMbWXAXIB8fWXpAysTj+AxvtZ2tyLV8TG5ZQdLArlVLsX7MzooRugwVBIY9jyNpv9PftVPBw", - 
"G8q7hSmNTJ03mBwwvObXCYx4fH4J7UyG9e41038sFxirT9TVKTvdfr3G92dKS/4P1nPMAleRhX/Df7h1", - "Nj8KaKQh2UCiPF2lifP0hyJuK8b+MFsfpNkKZ7H5ajpTgUNQiqWtCew3UW0hre0P5uF03Ym+wuHsyhVj", - "+D60XZu7fd0wboEPgijtmiJibt3fP03yPL3+A71ZpQHnlgBKTDk2wS8FTNmOPxp2f/kwtDIcNwpCu1fa", - "chktvhvaum/JZ+fgblSU4fFQyNxgmlsJ5Bcve59EuQzYStvMlXeCmnS5aumqk3XLRfJMAs3ch1SUZcnr", - "cfWvWV6AzCXw1NZV15lWKKLyxvRgrac+8teJM3aeLRZ3zRRHIY5Dk9Y9L5hmihzKBuvrTamI4Fejt2IQ", - "z0bnleJkXhHsIZkcfpyA3SuXHAOMs+rUyvDvK9vmPoK/rTDbIPTbreBHlGyLwO8SsNoUODHN++giS1Mu", - "lETqlkPFYAnhNpAHdcyjxRDl3zFkisxZFmerg+UlLaBCk/72rFL1pNSB+zIVpOcqQkTmVp2FsVGPluup", - "NJRMyfWjrxfBXlcduptWYSnNpbof1TWivMSJLdegYWvh5bpoVZTBV1IqL/MSZlLxxPV7eoI6OFO8NyVM", - "A7eoqJIKPqdRvTDnd1JN7wzf0SRL8lLUL55CYV9horGgxDvEAjqcInchIZGE4KytDSvvLRfds3vxaWUp", - "vhwTc9y0Uaf8htcaivyheou1jumQXHGOYiymZOsPc3nY0lpxd/j0pHZz+AFeyJg77Cv0jJZXMNqZtC0t", - "za9x/SJ3d9zv5Yur78cKK6VYfIA3gOe5mtl06+P7QsHB/YmE+77tcfWAvXba2prXwGY60D36EOYlD3GM", - "IjInMU+htqxpG3SDTMS2UuZwe1ubabE25IaHg8NB8PHdx/8fAAD//6qDr0pR3QAA", + "DTXM25aDXaM8u1b3ozwXRzXttWc3wx/acyvtuQSu1dpznhL1a6rPZpBvpj87fPMB3F5h/qFB34cGLbPJ", + "hIaUMFWkElqKarHZwB7gvRJmnfCl0+gKE26tQRd5ilcrJxZ5v0UkQj74/SvOLl/Zw4yP5SYiPnKqaiEM", + "m3XV7w0fBvfLnO9fR33IKPaiXBrOrw2ayyExn66/GpL35O5BeO6GXDNXR+69YervUY6oSHEkSUxChW5n", + "NJzBPRH9G/RvrpHgNH2fXwzdGqIXEH9avqoKg3ckERTHkHycxyZv/vt5krwfLueMuDo7g4/MFRGTHeL9", + "ELk8ETmNSd2qfO9DryLGUqFX9jZLR2+44HFsEh2/1/AsrW/L3ggp7tBeM9/tEEZubYd0gt6XLoq8b7gp", + "4pDwpd6lb0T53eY882YtiiMBgDN31gmLGm6JaKj574jsDLypj1reVzHT+MrXVZYm85JP8/wCFVTGadoW", + "fe00AYvnSbICh1GnlFtfqohn6i9SRUSY0q8Wu5uQG3VwaP5Q+MYUKq1UajPVHHygsnevvaAKTDlmVwTC", + "/DVPksCUjUuwr6jD59/7qXe4bDDqnSld7vkhMza5tlNl9qV7OzXJYauJQLYRr3X5xjT4w2suruzKN0bD", + "b2DpFbOgUI2FReMF7G1Rz+ZhXVqAjSxWBvLOrstLI+5dI43YMjh/eBop8OMPTiUhF1A7W7padg8nuqxk", + "cZTIvQPFs4qiVF1n9V6dnW01EY0pv9xIMuKHOWwDPf/wMgXqiT08ajGlNHG+gFXOQk0QqtFGdzZrpdbg", + "mGe696X0qVBjQy6kIokx2CdZDDfvIKzeJjDA5RoiXUSVhGzaXXBZlepHXLMxmWh5mBKhx9afQ3q2wvbw", + "mbUXCufke25o8PuwayGjKphyWDVBrVaoI01dMlWf7ZTnf/3kKT0HQ7Vaw0SiTkxvTGE+NJco1g9bKy1d", + "U+DkS6dn+HTKykv4+K7dGpzNkfmPwOFOa2zNlah8cGztBSkTi+M/sNF+tibX8jWxYY1HB7tSrcf+NTsj", + "Sug2WBAU8jiGsgJGf99OBQ+3of5cmNLIFKKDyQHDa36dwIjH55fQzqSA714z/cdyBbT6RF0htdPt12t8", + "f6b25f9gPccscBVZ+Df8h1tn86OARhqSDSTK01WaOE9/KOK2pO0Ps/VBmq1wFpuvpjMVOASlWNqixX4T", + "1Vb62v5gHk7XnegrHM6uXLWI70Pbtcnl1w3jFvggiNKuKSImLcD90yTP8/8/0KtfGnBuCaDElGMT/FLA", + "1BX5o2H3l4+TK8Nxoyi5e6Utl3Lju6Gt+5Z8dg4uUK0Mj4dC5gbT3EogAXrZ+yTKdcpW2mau/hQUzctV", + "S1c+rVuu4mcyfOY+pKJuTF4wrH/N8gppLsOotq66zrRCEZU3pgdrPfWRv5CdsfNsNbtrpjgKcRyavPN5", + "RTdThVE2WF9vSlUOvxq9FYN4NjovZSfzkmUPyeTw4wTsXrkmGmCcVadWxqdf2Tb3EZ1uhdkGseluBT8i", + "01tEppeA1aYCi2neRxdZmnKhJFK3HEoaSwi3gUStYx4thij/jiFTBc+yOFu+LK+5ASWk9LdnlbIspQ7c", + "l6kgPVeyIjLX/iyMjXq0XPCloaZLrh99vRD7uurQ3bRMTGku1f2orhHlNVhsPQkNWwsv10WrqhG+mld5", + "HZowk4onrt/TE9TBmeK9KWEauEXJl1TwOY3qlUO/k3J/Z/iOJlmS18p+8RQqDwsTjQU16CEW0OEUuQsJ", + "iSQEZ21tWBpwuSqg3YtPq5vx5ZiY46aNOuU3vHdRJDjVW6x1TIfkinMUYzElW3+YuxmW1orLzacnD/9C", + "xtxhX6FntLyC0c6kbWlpfo3rF7m7434vX1x9P1ZYKQfkA7yiPM/VzKZbH98XCg7uTyTc922PqwfstdPW", + "1rwGNtOB7tGHMC95iGMUkTmJeQrFb03boBtkIralPIfb29pMi7UhNzwcHA6Cj+8+/v8AAAD//3a75jLy", + "3QAA", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 2c88ea00..3ea7cf8b 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -105,22 +105,12 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima maxMemoryPerInstance = 
int64(memSize) } - // Parse max total memory (empty or "0" means unlimited) - var maxTotalMemory int64 - if cfg.MaxTotalMemory != "" && cfg.MaxTotalMemory != "0" { - var memSize datasize.ByteSize - if err := memSize.UnmarshalText([]byte(cfg.MaxTotalMemory)); err != nil { - return nil, fmt.Errorf("failed to parse MAX_TOTAL_MEMORY '%s': %w", cfg.MaxTotalMemory, err) - } - maxTotalMemory = int64(memSize) - } - + // Note: Aggregate CPU/memory limits are now handled via oversubscription ratios + // in the ResourceManager, wired up via SetResourceValidator after initialization. limits := instances.ResourceLimits{ MaxOverlaySize: int64(maxOverlaySize), MaxVcpusPerInstance: cfg.MaxVcpusPerInstance, MaxMemoryPerInstance: maxMemoryPerInstance, - MaxTotalVcpus: cfg.MaxTotalVcpus, - MaxTotalMemory: maxTotalMemory, } meter := otel.GetMeterProvider().Meter("hypeman") diff --git a/lib/resources/resource.go b/lib/resources/resource.go index 39e7e555..09d51840 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -7,6 +7,7 @@ import ( "fmt" "sync" + "github.com/c2h5oh/datasize" "github.com/kernel/hypeman/cmd/api/config" "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/paths" @@ -360,6 +361,67 @@ func (m *Manager) CanAllocate(ctx context.Context, rt ResourceType, amount int64 return amount <= status.Available, nil } +// ValidateAllocation checks if the requested resources can be allocated. +// Returns nil if allocation is allowed, or a detailed error describing +// which resource is insufficient and the current capacity/usage. +// Parameters match instances.AllocationRequest to implement instances.ResourceValidator. +func (m *Manager) ValidateAllocation(ctx context.Context, vcpus int, memoryBytes int64, networkDownloadBps int64, networkUploadBps int64, diskIOBps int64, needsGPU bool) error { + // Check CPU + if vcpus > 0 { + status, err := m.GetStatus(ctx, ResourceCPU) + if err != nil { + return fmt.Errorf("check CPU capacity: %w", err) + } + if int64(vcpus) > status.Available { + return fmt.Errorf("insufficient CPU: requested %d vCPUs, but only %d available (currently allocated: %d, effective limit: %d with %.1fx oversubscription)", + vcpus, status.Available, status.Allocated, status.EffectiveLimit, status.OversubRatio) + } + } + + // Check Memory + if memoryBytes > 0 { + status, err := m.GetStatus(ctx, ResourceMemory) + if err != nil { + return fmt.Errorf("check memory capacity: %w", err) + } + if memoryBytes > status.Available { + return fmt.Errorf("insufficient memory: requested %s, but only %s available (currently allocated: %s, effective limit: %s with %.1fx oversubscription)", + datasize.ByteSize(memoryBytes).HR(), datasize.ByteSize(status.Available).HR(), datasize.ByteSize(status.Allocated).HR(), datasize.ByteSize(status.EffectiveLimit).HR(), status.OversubRatio) + } + } + + // Check Network (use max of download/upload since they share physical link) + netBandwidth := networkDownloadBps + if networkUploadBps > netBandwidth { + netBandwidth = networkUploadBps + } + if netBandwidth > 0 { + status, err := m.GetStatus(ctx, ResourceNetwork) + if err != nil { + return fmt.Errorf("check network capacity: %w", err) + } + if netBandwidth > status.Available { + return fmt.Errorf("insufficient network bandwidth: requested %s/s, but only %s/s available (currently allocated: %s/s, effective limit: %s/s with %.1fx oversubscription)", + datasize.ByteSize(netBandwidth).HR(), datasize.ByteSize(status.Available).HR(), datasize.ByteSize(status.Allocated).HR(), 
datasize.ByteSize(status.EffectiveLimit).HR(), status.OversubRatio) + } + } + + // Check GPU if needed + if needsGPU { + gpuStatus := GetGPUStatus() + if gpuStatus == nil { + return fmt.Errorf("insufficient GPU: no GPU available on this host") + } + availableSlots := gpuStatus.TotalSlots - gpuStatus.UsedSlots + if availableSlots <= 0 { + return fmt.Errorf("insufficient GPU: all %d %s slots are in use", + gpuStatus.TotalSlots, gpuStatus.Mode) + } + } + + return nil +} + // CPUCapacity returns the raw CPU capacity (number of vCPUs). func (m *Manager) CPUCapacity() int64 { m.mu.RLock() diff --git a/openapi.yaml b/openapi.yaml index bd85c463..219c553a 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1304,6 +1304,12 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + 409: + description: Conflict - insufficient resources or name already exists + content: + application/json: + schema: + $ref: "#/components/schemas/Error" 500: description: Internal server error content: From 2915dc1e4fbe920781411b3d25c6864e5f5ffc9b Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 30 Jan 2026 12:58:49 -0500 Subject: [PATCH 2/5] Use oversubscription manager in tests --- lib/builds/manager_test.go | 6 +++++- lib/instances/create.go | 2 +- lib/instances/manager_test.go | 22 ++++++++++++++++++---- lib/instances/qemu_test.go | 10 ++++++++++ lib/instances/resource_limits_test.go | 8 +++++++- lib/instances/start.go | 2 +- lib/resources/resource.go | 15 +++++++++++---- 7 files changed, 53 insertions(+), 12 deletions(-) diff --git a/lib/builds/manager_test.go b/lib/builds/manager_test.go index 7357ac8f..5a9e82cc 100644 --- a/lib/builds/manager_test.go +++ b/lib/builds/manager_test.go @@ -126,6 +126,10 @@ func (m *mockInstanceManager) ListRunningInstancesInfo(ctx context.Context) ([]r return nil, nil } +func (m *mockInstanceManager) SetResourceValidator(v instances.ResourceValidator) { + // no-op for mock +} + // mockVolumeManager implements volumes.Manager for testing type mockVolumeManager struct { volumes map[string]*volumes.Volume @@ -611,7 +615,7 @@ func TestBuildQueue_ConcurrencyLimit(t *testing.T) { queue := NewBuildQueue(2) // Max 2 concurrent started := make(chan string, 5) - + // Enqueue 5 builds with blocking start functions for i := 0; i < 5; i++ { id := string(rune('A' + i)) diff --git a/lib/instances/create.go b/lib/instances/create.go index 35088d49..a28d0e26 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -149,7 +149,7 @@ func (m *manager) createInstance( return nil, fmt.Errorf("total memory %d (size + hotplug_size) exceeds maximum allowed %d per instance", totalMemory, m.limits.MaxMemoryPerInstance) } - // Validate aggregate resource limits via ResourceValidator + // Validate aggregate resource limits via ResourceValidator (if configured) if m.resourceValidator != nil { needsGPU := req.GPU != nil && req.GPU.Profile != "" if err := m.resourceValidator.ValidateAllocation(ctx, vcpus, totalMemory, req.NetworkBandwidthDownload, req.NetworkBandwidthUpload, req.DiskIOBps, needsGPU); err != nil { diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index e06e659d..4bfb9b4f 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -24,6 +24,7 @@ import ( "github.com/kernel/hypeman/lib/ingress" "github.com/kernel/hypeman/lib/network" "github.com/kernel/hypeman/lib/paths" + "github.com/kernel/hypeman/lib/resources" "github.com/kernel/hypeman/lib/system" "github.com/kernel/hypeman/lib/vmm" 
"github.com/kernel/hypeman/lib/volumes" @@ -57,6 +58,15 @@ func setupTestManager(t *testing.T) (*manager, string) { } mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) + // Set up resource validation using the real ResourceManager + resourceMgr := resources.NewManager(cfg, p) + resourceMgr.SetInstanceLister(mgr) + resourceMgr.SetImageLister(imageManager) + resourceMgr.SetVolumeLister(volumeManager) + err = resourceMgr.Initialize(context.Background()) + require.NoError(t, err) + mgr.SetResourceValidator(resourceMgr) + // Register cleanup to kill any orphaned Cloud Hypervisor processes t.Cleanup(func() { cleanupOrphanedProcesses(t, mgr) @@ -920,10 +930,14 @@ func TestStorageOperations(t *testing.T) { tmpDir := t.TempDir() cfg := &config.Config{ - DataDir: tmpDir, - BridgeName: "vmbr0", - SubnetCIDR: "10.100.0.0/16", - DNSServer: "1.1.1.1", + DataDir: tmpDir, + BridgeName: "vmbr0", + SubnetCIDR: "10.100.0.0/16", + DNSServer: "1.1.1.1", + OversubCPU: 1.0, + OversubMemory: 1.0, + OversubDisk: 1.0, + OversubNetwork: 1.0, } p := paths.New(tmpDir) diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index e276676c..4f34384d 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -23,6 +23,7 @@ import ( "github.com/kernel/hypeman/lib/ingress" "github.com/kernel/hypeman/lib/network" "github.com/kernel/hypeman/lib/paths" + "github.com/kernel/hypeman/lib/resources" "github.com/kernel/hypeman/lib/system" "github.com/kernel/hypeman/lib/volumes" "github.com/stretchr/testify/assert" @@ -55,6 +56,15 @@ func setupTestManagerForQEMU(t *testing.T) (*manager, string) { } mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, hypervisor.TypeQEMU, nil, nil).(*manager) + // Set up resource validation using the real ResourceManager + resourceMgr := resources.NewManager(cfg, p) + resourceMgr.SetInstanceLister(mgr) + resourceMgr.SetImageLister(imageManager) + resourceMgr.SetVolumeLister(volumeManager) + err = resourceMgr.Initialize(context.Background()) + require.NoError(t, err) + mgr.SetResourceValidator(resourceMgr) + // Register cleanup to kill any orphaned QEMU processes t.Cleanup(func() { cleanupOrphanedQEMUProcesses(t, mgr) diff --git a/lib/instances/resource_limits_test.go b/lib/instances/resource_limits_test.go index fe100aeb..5b50f1af 100644 --- a/lib/instances/resource_limits_test.go +++ b/lib/instances/resource_limits_test.go @@ -150,7 +150,13 @@ func TestValidateVolumeAttachments_OverlayCountsAsTwoDevices(t *testing.T) { func createTestManager(t *testing.T, limits ResourceLimits) *manager { t.Helper() tmpDir := t.TempDir() - cfg := &config.Config{DataDir: tmpDir} + cfg := &config.Config{ + DataDir: tmpDir, + OversubCPU: 1.0, + OversubMemory: 1.0, + OversubDisk: 1.0, + OversubNetwork: 1.0, + } p := paths.New(cfg.DataDir) imageMgr, err := images.NewManager(p, 1, nil) diff --git a/lib/instances/start.go b/lib/instances/start.go index aaa7cb8c..9eaf60ed 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -46,7 +46,7 @@ func (m *manager) startInstance( return nil, fmt.Errorf("%w: cannot start from state %s, must be Stopped", ErrInvalidState, inst.State) } - // 2b. Validate aggregate resource limits before allocating resources + // 2b. 
Validate aggregate resource limits before allocating resources (if configured) if m.resourceValidator != nil { needsGPU := stored.GPUProfile != "" totalMemory := stored.Size + stored.HotplugSize diff --git a/lib/resources/resource.go b/lib/resources/resource.go index 09d51840..b107c17d 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -215,19 +215,26 @@ func (m *Manager) Initialize(ctx context.Context) error { } // GetOversubRatio returns the oversubscription ratio for a resource type. +// Returns 1.0 (no oversubscription) if the config value is not set or <= 0. func (m *Manager) GetOversubRatio(rt ResourceType) float64 { + var ratio float64 switch rt { case ResourceCPU: - return m.cfg.OversubCPU + ratio = m.cfg.OversubCPU case ResourceMemory: - return m.cfg.OversubMemory + ratio = m.cfg.OversubMemory case ResourceDisk: - return m.cfg.OversubDisk + ratio = m.cfg.OversubDisk case ResourceNetwork: - return m.cfg.OversubNetwork + ratio = m.cfg.OversubNetwork default: return 1.0 } + // Default to 1.0 (no oversubscription) if not configured + if ratio <= 0 { + return 1.0 + } + return ratio } // GetStatus returns the current status of a specific resource type. From ea8ca34f08d7870edbba88eeee04788da855cb6f Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 30 Jan 2026 14:28:57 -0500 Subject: [PATCH 3/5] Handle disk io --- lib/instances/manager.go | 1 + lib/resources/disk.go | 39 +++++++++++++++++++++++++++++++++++++++ lib/resources/resource.go | 22 ++++++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 23860d28..8411d193 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -340,6 +340,7 @@ func (m *manager) ListInstanceAllocations(ctx context.Context) ([]resources.Inst VolumeOverlayBytes: volumeOverlayBytes, NetworkDownloadBps: inst.NetworkBandwidthDownload, NetworkUploadBps: inst.NetworkBandwidthUpload, + DiskIOBps: inst.DiskIOBps, State: string(inst.State), VolumeBytes: volumeBytes, }) diff --git a/lib/resources/disk.go b/lib/resources/disk.go index 6ea0bdd8..2b6bf76d 100644 --- a/lib/resources/disk.go +++ b/lib/resources/disk.go @@ -126,3 +126,42 @@ func parseDiskIOLimit(limit string) (int64, error) { return int64(ds.Bytes()), nil } + +// DiskIOResource implements Resource for disk I/O bandwidth tracking. +type DiskIOResource struct { + capacity int64 // bytes per second + instanceLister InstanceLister +} + +// NewDiskIOResource creates a disk I/O resource with the given capacity. +func NewDiskIOResource(capacity int64, instLister InstanceLister) *DiskIOResource { + return &DiskIOResource{capacity: capacity, instanceLister: instLister} +} + +// Type returns the resource type. +func (d *DiskIOResource) Type() ResourceType { + return ResourceDiskIO +} + +// Capacity returns the total disk I/O capacity in bytes per second. +func (d *DiskIOResource) Capacity() int64 { + return d.capacity +} + +// Allocated returns total disk I/O allocated across all active instances. 
+func (d *DiskIOResource) Allocated(ctx context.Context) (int64, error) { + if d.instanceLister == nil { + return 0, nil + } + instances, err := d.instanceLister.ListInstanceAllocations(ctx) + if err != nil { + return 0, err + } + var total int64 + for _, inst := range instances { + if isActiveState(inst.State) { + total += inst.DiskIOBps + } + } + return total, nil +} diff --git a/lib/resources/resource.go b/lib/resources/resource.go index b107c17d..dcd90ab1 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -21,6 +21,7 @@ const ( ResourceMemory ResourceType = "memory" ResourceDisk ResourceType = "disk" ResourceNetwork ResourceType = "network" + ResourceDiskIO ResourceType = "disk_io" ) // SourceType identifies how a resource capacity was determined. @@ -100,6 +101,7 @@ type InstanceAllocation struct { VolumeOverlayBytes int64 // Sum of volume overlay sizes NetworkDownloadBps int64 // Download rate limit (external→VM) NetworkUploadBps int64 // Upload rate limit (VM→external) + DiskIOBps int64 // Disk I/O rate limit (bytes/sec) State string // Only count running/paused/created instances VolumeBytes int64 // Sum of attached volume base sizes (for per-instance reporting) } @@ -211,6 +213,10 @@ func (m *Manager) Initialize(ctx context.Context) error { } m.resources[ResourceNetwork] = net + // Discover disk I/O (reuses existing DiskIOCapacity method) + diskIO := NewDiskIOResource(m.DiskIOCapacity(), m.instanceLister) + m.resources[ResourceDiskIO] = diskIO + return nil } @@ -227,6 +233,8 @@ func (m *Manager) GetOversubRatio(rt ResourceType) float64 { ratio = m.cfg.OversubDisk case ResourceNetwork: ratio = m.cfg.OversubNetwork + case ResourceDiskIO: + ratio = m.cfg.OversubDiskIO default: return 1.0 } @@ -413,6 +421,20 @@ func (m *Manager) ValidateAllocation(ctx context.Context, vcpus int, memoryBytes } } + // Check Disk I/O + if diskIOBps > 0 { + status, err := m.GetStatus(ctx, ResourceDiskIO) + if err != nil { + return fmt.Errorf("check disk I/O capacity: %w", err) + } + if diskIOBps > status.Available { + return fmt.Errorf("insufficient disk I/O: requested %s/s, but only %s/s available (currently allocated: %s/s, effective limit: %s/s with %.1fx oversubscription)", + datasize.ByteSize(diskIOBps).HR(), datasize.ByteSize(status.Available).HR(), + datasize.ByteSize(status.Allocated).HR(), datasize.ByteSize(status.EffectiveLimit).HR(), + status.OversubRatio) + } + } + // Check GPU if needed if needsGPU { gpuStatus := GetGPUStatus() From 481cedb33e858d86d05032e0c50b79466c48e62c Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 30 Jan 2026 14:44:37 -0500 Subject: [PATCH 4/5] Add disk IO in resources returned by API --- cmd/api/api/resources.go | 3 + lib/oapi/oapi.go | 138 ++++++++++++++++++++------------------ lib/resources/resource.go | 9 +++ openapi.yaml | 7 ++ 4 files changed, 90 insertions(+), 67 deletions(-) diff --git a/cmd/api/api/resources.go b/cmd/api/api/resources.go index 6df93288..ebb6ad95 100644 --- a/cmd/api/api/resources.go +++ b/cmd/api/api/resources.go @@ -25,11 +25,13 @@ func (s *ApiService) GetResources(ctx context.Context, _ oapi.GetResourcesReques } // Convert to API response + diskIO := convertResourceStatus(status.DiskIO) resp := oapi.Resources{ Cpu: convertResourceStatus(status.CPU), Memory: convertResourceStatus(status.Memory), Disk: convertResourceStatus(status.Disk), Network: convertResourceStatus(status.Network), + DiskIo: &diskIO, Allocations: make([]oapi.ResourceAllocation, 0, len(status.Allocations)), } @@ -53,6 +55,7 @@ func (s 
*ApiService) GetResources(ctx context.Context, _ oapi.GetResourcesReques DiskBytes: &alloc.DiskBytes, NetworkDownloadBps: &alloc.NetworkDownloadBps, NetworkUploadBps: &alloc.NetworkUploadBps, + DiskIoBps: &alloc.DiskIOBps, }) } diff --git a/lib/oapi/oapi.go b/lib/oapi/oapi.go index 8ca161fa..64630c42 100644 --- a/lib/oapi/oapi.go +++ b/lib/oapi/oapi.go @@ -712,6 +712,9 @@ type ResourceAllocation struct { // DiskBytes Disk allocated in bytes (overlay + volumes) DiskBytes *int64 `json:"disk_bytes,omitempty"` + // DiskIoBps Disk I/O bandwidth limit in bytes/sec + DiskIoBps *int64 `json:"disk_io_bps,omitempty"` + // InstanceId Instance identifier InstanceId *string `json:"instance_id,omitempty"` @@ -758,6 +761,7 @@ type Resources struct { Cpu ResourceStatus `json:"cpu"` Disk ResourceStatus `json:"disk"` DiskBreakdown *DiskBreakdown `json:"disk_breakdown,omitempty"` + DiskIo *ResourceStatus `json:"disk_io,omitempty"` // Gpu GPU resource status. Null if no GPUs available. Gpu *GPUResourceStatus `json:"gpu"` @@ -10360,73 +10364,73 @@ var swaggerSpec = []string{ "FT6kbFoNHF4asI0ZZuaw+t4fjGsbtvEYSX+AxluRAayMQ1ciXIRqtPJOUznyWxXLHQsyzWIsUD0WecWU", "5SKJKbtp07tcJGMe0xDpD+rm/ITHMb8d6Vfyr7CWrVar0x+MilPJmnluJmfPpM2G1MYtlvBXvcqtWpQL", "SP5t8/025BVu44DzRus+18abCde9ZPSuhOjVSzT7u4OmoKaGTivhTMuh3pvydouyPop3UdhHebIJz5GY", - "Oa2pWbBVPbiyXt9q4VRrVQjXsiaAOs6n5y4pVeFauizUShB/lrq7IvvpZyi382SFUtsArbM2mtPho8Mn", - "T/b2Hz3Z3UhHcYcNDYePTQcObgbbkoS1vC41venRAP630aTMcYN/Sg1HDtUJVXK0fPKEPq4gn+ISQ4Pl", - "uyrzeLGTztSuKsHt1MwVGstRRe0pperqkMmEgPNmZODWKyZTC6ppNYcQpzikauGxwvAtxBmgvEktGL9F", - "77XJekBq+0Z4orS1PidCZuPiNlrHDY7+01hXNVw4bH3xUmbjJkvudX1UY8eZwJyo5iVoYaQbjPCdQt/m", - "wES3WFY86/o5VCTqllKx1Y9gTIv2mWYdrufJZovDZd+FEn9i2fL217azpPlXFNU6xFeJsWYS1FIZon7a", - "OLk9UtFzSyVcH8VQ4w9WDn7aV6Nx+Ur0yjvnlfvTrXPYLQ9rBNHm0y0dhm/yYf2SJ6CVnYOFXNF3t7Kz", - "PqQwRxVNGUQSV1mjdgeUmlzl9qIPKjVGHZKkauGC2Z2Rt7XZ0clR3qEXp75w+NHgyZcIgL5cGfH8PyQn", - "Tfm0yg2y9pxqaU8bwwz9WudJPRLEmFf2Tn41cqF201iqFSn5V5V/MXVYwHayIb7TrH4naYOSL03WckE5", - "Lte+q/myzghc6Zoqraw0k+a9MUeVn1kfh0pXGOcTQWYtmfUxs+a4R9uSvXrSBnPlU1AwjSyADGA1CHJr", - "d9mkXh1BcYbv8hHA8MQS1dLcmXWUUsa+eArXuN+4y/t04rqAadQTFj79vMJBDquWN2NVJSF3GO4lPMt/", - "VnC0JtqqIWcxRnd1sSLNukiYCaoWF1og2DgvggURR5lBQ5AUsAj4uRgc4sY/fgRrc+JROl8QRgQN0dH5", - "KWBJghme6i27OkMxnZBwEcbEhv0uHZPCrfXXx6c9c18hzysHdQAUAMQldDo6P4VcMjYDfzDo7/YhbS5P", - "CcMpDYbBXn8HsuVoMMASt+E6GDxan46mQ5Bkp5GVuE9NEw1amXImDXB2B4NaRQdc5OvY/kUaZ4URr611", - "O1MyZzl0YSma1WkCdvofu8H+YGej+axNseEb9pLhTM24oL8RmOajDYHwSYOeMmMcu6S+xDYscDYY/lzF", - "1p/ffXzXDWSWJFiriAZcBaxSLptUGCIRRozc2nuCv/BxH10Y0wLybRTFyIzlTyLNkjBSWPSnvyEswhmd", - "k2tmObFJl4IFXIpIkObAJiS9imZmaLP7hoSJVE95tKhBN+9uW3cH2kgVwBuXushz/6UNNS983NGkGJIh", - "9+ZWIgwzVWSsMbmFbgicB07onTesHEJl/Y7jk/ydK45S5e1a3aUsjLOoEIDVohTe68qmuILNlnRDPPrC", - "C2hh51+OKnaShvGImAjRdKFmnJnnbJwxlZnnseC3kggtj+ztBgsWbf3mRa1MKjqawA0Dcx9Sj7ltprj9", - "4YYsPvav2VGUuPurNiMqjiW3aaTMWT+VKM/Le828GrQcYd3PaOyqc9UUVQJdXQdaVF4H+nkqsFbJMjlD", - "OISzff1jGTgdg81cgLjbqs81xAylPM1irTzA9pg8U5U+4KIYjmOkAH/ct1qIAkwa1iNJKIjPVvrbxetX", - "CPgnVCmBZkWgNqyBMi398nyresD+NXuGwxkyghHyEF4HNLoOimoUWyDEMkmMbOr1QLL+Fcr0mGG6NPpr", - "v6+7MkJ7iH7+YHoZaqxJk5HiN4RdBx+7qPRiStUsG+fv3jUsuMHlclFBedQxDGnLXa3VKyzxZsPMMIsQ", - "twwgXiCMClorm2RjyrBYNJV24ZlqDh0xN49ts+Ja3MFgsLX+aMAu1aOuVBpqTP24JJ13v5hgskJ5WTCV", - "yrhpMcDstfLIiON7kIxPceRuO/1QAdaoANZ2KQl3+N4qgNsfaPTRoG9MTKhiTUJDtR8noVMscEIU5Hv+", - "2Y/zEKVJ9d/uIA98DcaSryJvtwSeukL/bgmx9xvLKOUFiQAX9u8B/2DcItkXjPvkvsbFsUk1m5d2fFDo", - 
"CJvlELHrtz5eEPU9YNzgvlipy0n4DfH3oeDPC2JVpAJoNW62DUney6Zt/TaBIDiRthfTWNsyFzCn3gVh", - "CkEBP9m3/zo1GwK138d8+n6IDAhjW75Q2ixzuQ9YC0ULS/jIJOHIv7O5acIZZlMiUcfIz9//9W9Xgu33", - "f/3blmD7/V//BnLftgVFobu8eOD7Ifo7IWkPx3RO3GIg+JDMiVigvYEtawGvPJlu5DW7Zm+IygSTeeiO", - "XhfAxHQIKjuD9VCWEYkkgBByUE9sTIlxMXlMPEfLBpT3StHdJUvXrqC0AC0VHQ7AASVlVFEcI54pky4S", - "5gH3W4qJmDUH5cHr3rIl/+l6/qLInTLY2zMT3JDBmOKbHroz9ShNn6hzcfFsq49A3TdYAXFDYDcU3VhL", - "oP+DJ63nSYajVBkKQNnwplKSw0Zf24ltcx/OtqYEiM3eNgHZ2om2Xd1ifqjdLTxvfrg5L5zPFXbiknI3", - "+8I+fb2+2pytbMovt88O95ZhbjPOFyD7FtYk6thkwXlOkEpa+2+F9PfCgEvVEHIujLjJRHJvFs4xZ5OY", - "hgr13Fxsvcbc6qkiyENhB2/srBF266oHu5dFxXYlbqxRaOQhZPcpPWqDbiJGioD8Atd+SJJ1qHNCZcj1", - "tyVs6YU4BUBaIBZ0Wsaidb6dE/g9FzkrFfO8gqojyPvz8tihM1aXDffAFE9qDPEbMsJaxozSFZaHhM2X", - "+S66AiQrnEDfF2oO7k8Lum+HkA/NH5JHKKqBTXPBWZ6juwm9bBbvr7jRdgTPwi+IcFRtJmoyNRTLMp+i", - "cEbCG7MgWydnlUZw6krpfH09wKQi30D62+n/EPctDMcCVquMxVObvuPr2Yowwkam4pc7frQI5gEyRGmM", - "nSPVZMbAcsHCrT/UCeS9SIZ6XZsHREnnWRw7R/ycCFUkZC/z0+0PWj9ooSc7alupi1y+edkjLOQQk2NA", - "16iQuPzLX1ZbNhtmlvIDTdrYVwAqhxjNyuhn7L8JnUJ5YsQ/7T63qRH/tPvcJEf8096RSY+49dWQZXBf", - "rPm+tdcHjHxaeaVVoAFrMlmS12l7eat7UfhsOvpNVL58gj+0vjZaXxlcKxW/vDLAV1T9bML1b3NOkCOb", - "D9rwysWf/cFUvvt1PVmMLNXQq/jibY4QLook57YC18MLkKM5xpX5b0sfakGQK7UDh7qnJ12bv95knc8D", - "xO/Jo+rmce9aoh33/t2pR8mYTjOeyXJAO5QrILKo7VphwA9Nfy3Ec6MG+x1j6eA+Rce9K6g/8P4rqc71", - "DTXM25aDXaM8u1b3ozwXRzXttWc3wx/acyvtuQSu1dpznhL1a6rPZpBvpj87fPMB3F5h/qFB34cGLbPJ", - "hIaUMFWkElqKarHZwB7gvRJmnfCl0+gKE26tQRd5ilcrJxZ5v0UkQj74/SvOLl/Zw4yP5SYiPnKqaiEM", - "m3XV7w0fBvfLnO9fR33IKPaiXBrOrw2ayyExn66/GpL35O5BeO6GXDNXR+69YervUY6oSHEkSUxChW5n", - "NJzBPRH9G/RvrpHgNH2fXwzdGqIXEH9avqoKg3ckERTHkHycxyZv/vt5krwfLueMuDo7g4/MFRGTHeL9", - "ELk8ETmNSd2qfO9DryLGUqFX9jZLR2+44HFsEh2/1/AsrW/L3ggp7tBeM9/tEEZubYd0gt6XLoq8b7gp", - "4pDwpd6lb0T53eY882YtiiMBgDN31gmLGm6JaKj574jsDLypj1reVzHT+MrXVZYm85JP8/wCFVTGadoW", - "fe00AYvnSbICh1GnlFtfqohn6i9SRUSY0q8Wu5uQG3VwaP5Q+MYUKq1UajPVHHygsnevvaAKTDlmVwTC", - "/DVPksCUjUuwr6jD59/7qXe4bDDqnSld7vkhMza5tlNl9qV7OzXJYauJQLYRr3X5xjT4w2suruzKN0bD", - "b2DpFbOgUI2FReMF7G1Rz+ZhXVqAjSxWBvLOrstLI+5dI43YMjh/eBop8OMPTiUhF1A7W7padg8nuqxk", - "cZTIvQPFs4qiVF1n9V6dnW01EY0pv9xIMuKHOWwDPf/wMgXqiT08ajGlNHG+gFXOQk0QqtFGdzZrpdbg", - "mGe696X0qVBjQy6kIokx2CdZDDfvIKzeJjDA5RoiXUSVhGzaXXBZlepHXLMxmWh5mBKhx9afQ3q2wvbw", - "mbUXCufke25o8PuwayGjKphyWDVBrVaoI01dMlWf7ZTnf/3kKT0HQ7Vaw0SiTkxvTGE+NJco1g9bKy1d", - "U+DkS6dn+HTKykv4+K7dGpzNkfmPwOFOa2zNlah8cGztBSkTi+M/sNF+tibX8jWxYY1HB7tSrcf+NTsj", - "Sug2WBAU8jiGsgJGf99OBQ+3of5cmNLIFKKDyQHDa36dwIjH55fQzqSA714z/cdyBbT6RF0htdPt12t8", - "f6b25f9gPccscBVZ+Df8h1tn86OARhqSDSTK01WaOE9/KOK2pO0Ps/VBmq1wFpuvpjMVOASlWNqixX4T", - "1Vb62v5gHk7XnegrHM6uXLWI70Pbtcnl1w3jFvggiNKuKSImLcD90yTP8/8/0KtfGnBuCaDElGMT/FLA", - "1BX5o2H3l4+TK8Nxoyi5e6Utl3Lju6Gt+5Z8dg4uUK0Mj4dC5gbT3EogAXrZ+yTKdcpW2mau/hQUzctV", - "S1c+rVuu4mcyfOY+pKJuTF4wrH/N8gppLsOotq66zrRCEZU3pgdrPfWRv5CdsfNsNbtrpjgKcRyavPN5", - "RTdThVE2WF9vSlUOvxq9FYN4NjovZSfzkmUPyeTw4wTsXrkmGmCcVadWxqdf2Tb3EZ1uhdkGseluBT8i", - "01tEppeA1aYCi2neRxdZmnKhJFK3HEoaSwi3gUStYx4thij/jiFTBc+yOFu+LK+5ASWk9LdnlbIspQ7c", - "l6kgPVeyIjLX/iyMjXq0XPCloaZLrh99vRD7uurQ3bRMTGku1f2orhHlNVhsPQkNWwsv10WrqhG+mld5", - "HZowk4onrt/TE9TBmeK9KWEauEXJl1TwOY3qlUO/k3J/Z/iOJlmS18p+8RQqDwsTjQU16CEW0OEUuQsJ", - "iSQEZ21tWBpwuSqg3YtPq5vx5ZiY46aNOuU3vHdRJDjVW6x1TIfkinMUYzElW3+YuxmW1orLzacnD/9C", - "xtxhX6FntLyC0c6kbWlpfo3rF7m7434vX1x9P1ZYKQfkA7yiPM/VzKZbH98XCg7uTyTc922PqwfstdPW", - "1rwGNtOB7tGHMC95iGMUkTmJeQrFb03boBtkIralPIfb29pMi7UhNzwcHA6Cj+8+/v8AAAD//3a75jLy", - "3QAA", + 
"Oa2pWbBVPbiyXt9q4VRrVQjXsiaAOs6n5y4pVeFauizUShC3O1ire6/dbLYlCauj7x8+enzQ8rbWZ6na", + "KzKvfoZiPU9WKNQNO3XWRms7fHT45Mne/qMnuxvpR+6go2F/mg47yvtTyylT09keDeB/G03KHHX4p9Rw", + "3FGdUCU/zCdP6OMK0i0uUDRY3auynhc76cz8qgLeTsVdoS0dVVSuUpqwDplMCDiORgZuvWIytYCeVnMI", + "cYpDqhYeCxDfQowDypvULgK06L02WQ9Ibd8ITxQRcBohs3FxE67jBkf/aSy7Gi4ctr70KbNxkxX5uj6q", + "sSFNUFBU81C0cBAYjPCdgN/mwES3WFa8+vo5VCTqltLA1Y9/TIv2WW4drueJbouDbd9lFn9S2/L217az", + "ZHVUlOQ6xFeJ0GYS1BoBRBy1cbB7JLLnhky4PoKixh+sAPy0r0bj8nXslffdK3e3C6m7+bjtEu8tf2ck", + "2ObjlU7wN/mwfjMV8NHOwYK86LtbQQkfNpnzlaa0J4krB1K7uEpNgnV7OwmVGqMOSVK1cBH4zjLd2uy8", + "5yjv0IuMXzhmavDkS0RtX64M0/4fkkinfMTmBll7uLa0p42xkX519aQevmJsQptIoBpuUbseLdWKOgKr", + "ataY4jFg8Nm45GlWv0i1QZ2aJhO/oBxXIMAVqllnua70p5VWVppJ896Y89XPLOpDpavm84kgs+bX+kBf", + "c0alDeBePdOEuacqKNhzFkAGsBoEuYm+7AdYHfZxhu/yEcBaxhLVcvOZdZTy3L54CnfP37iMA3TiuoBp", + "1LMsPv28akcOq5Y3Y1X5I3eC7yU8y39WcLQm2qohZzFGd3WFJc26SJgJqhYXWiDY4DSCBRFHmUFDkBSw", + "CPi5GByC3T9+BDN14tFWXxBGBA3R0fkpYEmCGZ7qLbs6QzGdkHARxsTGKi+d7cJV+9fHpz1zySJPhgfF", + "CxQAxGWhOjo/hQQ4tmxAMOjv9iHXL08JwykNhsFefwdS/GgwwBK34Q4bPFpHlKZDkGSnkZW4T00TDVqZ", + "ciYNcHYHg1oZClwkGdn+RRoPixGvrZVCU+dnOd5iKQTXaQJ2+h+7wf5gZ6P5rM0L4hv2kuFMzbigvxGY", + "5qMNgfBJg54yY1W7TMTENixwNhj+XMXWn999fNcNZJYkWKuIBlwFrFIum1QYIhFGjNzay42/8HEfXRib", + "BJKEFBXUjMuARJolYaSw6E9/Q1iEMzon18xyYpPjBQu4yZEgzYFNHH0VzczQZvcNCROpnvJoUYNu3t22", + "7g60kSqAN67PkScsTBsKdfi4o8mLJEPuTQhFGGaqSLNjEiLdEDjEnNA7byw8xPf6vd0n+TtX0aXK27W6", + "S1kYZ1EhAKuVNLx3rE1FCJvi6YZ49IUX0MLOvxwK7SQN4xExYa3pQs04M8/ZOGMqM89jwW8lEVoe2SsZ", + "FizabM4rcZn8eTSBaxHmEqcec9tMcfvDDVl87F+zoyhxl25tGlccS25zX5kABSpRnkz4mnk1aDnCup/R", + "2JUUqymqBLq6DrSovA7081RgrZJlcoZwCAEJ+scycDoGm7kAcbdVn2uIGUp5msVaeYDtMcmxKn3A7TYc", + "x0gB/rhvtRAFmDSsR5JQEJ+t9LeL168Q8E8orQLNiuhyWANlWvrlSWL1gP1r9gyHM2QEIyRPvA5odB0U", + "JTS2QIhlkhjZ1OuBZP0r1BYyw3Rp9Nd+X3dlhPYQ/fzB9DLUWJMmI8VvCLsOPnZR6cWUqlk2zt+9a1hw", + "g6/mooLyqGMY0pa7D6xXWOLNhplhFiFuGUC8QBgVtFY2ycaUYbFoqkfDM9Uc72KuS9tmxV2+g8Fga/15", + "hl2qR12pNNSY+nFJOu9+McFkhfKyYCrVntNigNm78JERx/cgGZ/iyF3R+qECrFEBrO1SEu7wvVUAtz/Q", + "6KNB35iY+MqahIYSRU5Cp1jghChIUv2zH+chtJTqv93pI/gajCVfRd5uCTx1hf7dEmLvN9Z+yqsoAS7s", + "3wP+wbhFhjIY98l9jYtjkx83r0f5oNARNsshYtdvfbwg6nvAuMF9sVKXSPEb4u9DwZ8XxKpIBdBq3Gwb", + "MtOXTdv6FQhBcCJtL6axtmUuYE69C8IUgqqDsm//dWo2RJe/j/n0/RAZEMa25qK0qfFyH7AWihaW8JHJ", + "HJJ/ZxPqhDPMpkSijpGfv//r365u3O//+retG/f7v/4N5L5tq6BCd3nFw/dD9HdC0h6O6Zy4xUDEJJkT", + "sUB7A1uLA1550vPIa3bN3hCVCSbzeCO9LoCJ6RBUdgbroSwjEkkAISTOnthAGONi8ph4jpYNKO+VortL", + "lq5dQWkBWio6HICTTcqoojhGPFMmxyXMAy7lFBMxaw7Kg9e9ZUv+0/X8RZE7ZbC3Zya4IYMxFUM9dGeK", + "aJo+Uefi4tlWH4G6b7ACgp3Abii6sZZA/wdPWs+TDEepMhSAsuFNpcyMjb62E9vmPpxtTVkbm71tAlLM", + "E227usX8ULtbeN78cHNeOJ8r7MRlEm/2hX36en0FRVvZlF9unx3uLcPcpskvQPYtrEnUsRmO80QmlVz8", + "3wrp74UBl0o45FwYcZM+5d4snGPOJjENFeq5udgik7nVU0WQh8IO3thZI+zWVY/QL4uK7UrAWaPQyGPP", + "7lN61AbdRIwUtwgKXPshSdahzgmVIdfflrClF+IUAGmBWNBpGYvW+XZO4Pdc5KxUzPOyr44g78/LY4fO", + "WF023ANTPKkxxG/ICGtpPkr3bh4SNl/mu+iqpqxwAn1fqDm4Py3ovh1CPjR/SB6hqAY2zQVneWLxJvSy", + "qce/4kbbETwLvyDCUbWZqEkvUSzLfIrCGQlvzIJscZ9VGsGpq//z9fUAkz99A+lvp/9D3LcwHAtYrTIW", + "T23Oka9nK8IIG5mKX+740SKYB8gQpTF2jlSTzgPLBQu3/lAnkPciGerFeB4QJZ1ncewc8XMiVJFFvsxP", + "tz9o/aCFnuyobaUucvnmZY+wkENMjgFdo0LikkZ/WW3ZbJhZyg80aWNfAagcYjQro5+x/yZ0CuXZHP+0", + "+9zmc/zT7nOT0fFPe0cmp+PWV0OWwX2x5vvWXh8w8mnllVaBBqzJpHZep+3lre5F4bM59DdR+fIJ/tD6", + "2mh9ZXCtVPzycgZfUfWzWeK/zTlBjmw+aMMrF3/2B1P57tf1ZDGyVPiv4ou3iU24KDKz27JhDy9AjuYY", + "V+a/LX2oBUGu1A4c6p6edG3SfZMqPw8QvyePqpvHvWuJdtz7d6ceJWM6zXgmywHtUGOByKIgbYUBPzT9", + 
"tRDPjRrsd4ylg/sUHfeuoP7A+6+kOtc31DBvW8N2jfLsWt2P8lwc1bTXnt0Mf2jPrbTnErhWa895Htev", + "qT6bQb6Z/uzwzQdwe4X5hwZ9Hxq0zCYTGlLCVJGDaCmqxaYwe4D3Sph1wpdOoytMuLUGXSRXXq2cWOT9", + "FpEI+eD3rzi7RGcPMz6Wm4j4yKmqhTBs1lW/N3wY3C9zvn8d9SGj2ItyPTu/Nmguh8R8uv5qSN6Tuwfh", + "uRtyzVzxu/eGqb9HOaIixZEkMQkVup3RcAb3RPRv0L+5RoLT9H1+MXRriF5A/Gn5qioM3pFEUBxDxnQe", + "m2T/7+dJ8n64nDPi6uwMPjJXREx2iPdD5PJE5DQmdavyvQ+9ihhLhV7Z2ywdveGCx7HJzvxew7O0vi17", + "I6S4Q3vNfLdDGLm1HdIJel+6KPK+4aaIQ8KXepe+EeV3m5Pjm7UojgQAztxZJyxquCWioea/I7Iz8KY+", + "anlfxUzjK19XWZrMSz7N8wtUUBmnaVv0tdMELJ4nyQocRp1SQQCpIp6pv0gVEWHq1VrsbkJu1MGh+UPh", + "G1NdtVJezpSg8IHK3r32giowNaRd5Qrz1zxJAlPrLsG+ShSff++n3uGywah3pnS554fM2OTaTpXZl+7t", + "1CSHLYEC2Ua81uUb0+APr7m4WjHfGA2/gaVXzIJCCRkWjRewt0URnod1aQE2slgZyDu7Li+NuHeNNGJr", + "9/zhaaTAjz84lYRcQMFv6QrwPZzospLFUSL3DlT8KippdZ3Ve3V2ttVENKZmdCPJiB/msA30/MPLFCiC", + "9vCoxdT/xPkCVjkLNUGoRhvd2ayVAoljnunel9KnQmEQuZCKJMZgn2Qx3LyDsHqbwACXC590EVUS0nB3", + "wWVVKnpxzcZkouVhSoQeW38O6dkK28Nn1l4onJPvuaHB78OuhYyqYMph1QS1WnWRNHXJVH22U57/9ZOn", + "9BwM1WrhFYk6Mb0x1QTRXKJYP2yttHRNVZYvnZ7h0ykrrzvku3ZrcDZH5j8ChzutsTVXV/PBsbUXpEws", + "jv/ARvvZmlzL18SGhSkd7EoFKvvX7IwoodtgQVDI4xjqERj9fTsVPNyGonlhSiNTPQ8mBwyv+XUCIx6f", + "X0I7kwK+e830H8tl2+oTddXfTrdfr/H9mYKd/4P1HLPAVWTh3/Afbp3NjwIaaUg2kChPV2niPP2hiNs6", + "vD/M1gdptsJZbL6azlTgEJRiaSst+01UW55s+4N5OF13oq9wOLty1SK+D23XJpdfN4xb4IMgSrumiJi0", + "APdPkzzP//9Ar35pwLklgBJTjk3wSwFTV+SPht1fPk6uDMeNouTulbZcyo3vhrbuW/LZObhAtTI8HgqZ", + "G0xzK4EE6GXvkygXOFtpm7n6U1BtL1ctXd21brn8n8nwmfuQiroxeaWx/jXLS6u5DKPauuo60wpFVN6Y", + "Hqz11Ef+CnjGzrNl8K6Z4ijEcWjyzuel4Ez5Rtlgfb0plUf8avRWDOLZ6LwGnsxLlj0kk8OPE7B75Zpo", + "gHFWnVoZn35l29xHdLoVZhvEprsV/IhMbxGZXgJWmwospnkfXWRpyoWSSN1yqMMsIdwGErWOebQYovw7", + "hkwVPMvibPmyvOYGlJDS355VyrKUOnBfpoL0XMmKyFz7szA26tFywZeGmi65fvT1QuzrqkN30zIxpblU", + "96O6RpTXYLH1JDRsLbxcF62qRvhqXuV1aMJMKp64fk9PUAdnivemhGngFiVfUsHnNKqXHP1Oyv2d4Tua", + "ZEle4PvFUyhZLEw0FhTOh1hAh1PkLiQkkhCctbVhacDlqoB2Lz6tbsaXY2KOmzbqlN/w3kWR4FRvsdYx", + "HZIrzlGMxZRs/WHuZlhaKy43n548/AsZc4d9hZ7R8gpGO5O2paX5Na5f5O6O+718cfX9WGGlHJAP8Iry", + "PFczm259fF8oOLg/kXDftz2uHrDXTltb8xrYTAe6Rx/CvOQhjlFE5iTmKRS/NW2DbpCJ2JbyHG5vazMt", + "1obc8HBwOAg+vvv4/wMAAP//ZcQlB6feAAA=", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/lib/resources/resource.go b/lib/resources/resource.go index dcd90ab1..939fdcbb 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -64,6 +64,7 @@ type AllocationBreakdown struct { DiskBytes int64 `json:"disk_bytes"` NetworkDownloadBps int64 `json:"network_download_bps"` // External→VM NetworkUploadBps int64 `json:"network_upload_bps"` // VM→External + DiskIOBps int64 `json:"disk_io_bps"` // Disk I/O bandwidth } // DiskBreakdown shows disk usage by category. 
@@ -80,6 +81,7 @@ type FullResourceStatus struct {
 	Memory      ResourceStatus        `json:"memory"`
 	Disk        ResourceStatus        `json:"disk"`
 	Network     ResourceStatus        `json:"network"`
+	DiskIO      ResourceStatus        `json:"disk_io"`
 	DiskDetail  *DiskBreakdown        `json:"disk_breakdown,omitempty"`
 	GPU         *GPUResourceStatus    `json:"gpu,omitempty"` // nil if no GPU available
 	Allocations []AllocationBreakdown `json:"allocations"`
@@ -312,6 +314,11 @@ func (m *Manager) GetFullStatus(ctx context.Context) (*FullResourceStatus, error
 		return nil, err
 	}

+	diskIOStatus, err := m.GetStatus(ctx, ResourceDiskIO)
+	if err != nil {
+		return nil, err
+	}
+
 	// Get disk breakdown
 	var diskBreakdown *DiskBreakdown
 	m.mu.RLock()
@@ -347,6 +354,7 @@ func (m *Manager) GetFullStatus(ctx context.Context) (*FullResourceStatus, error
 				DiskBytes:          inst.OverlayBytes + inst.VolumeBytes,
 				NetworkDownloadBps: inst.NetworkDownloadBps,
 				NetworkUploadBps:   inst.NetworkUploadBps,
+				DiskIOBps:          inst.DiskIOBps,
 			})
 		}
 	}
@@ -361,6 +369,7 @@ func (m *Manager) GetFullStatus(ctx context.Context) (*FullResourceStatus, error
 		Memory:      *memStatus,
 		Disk:        *diskStatus,
 		Network:     *netStatus,
+		DiskIO:      *diskIOStatus,
 		DiskDetail:  diskBreakdown,
 		GPU:         gpuStatus,
 		Allocations: allocations,
diff --git a/openapi.yaml b/openapi.yaml
index 219c553a..ec88bfdd 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -1058,6 +1058,11 @@ components:
           format: int64
           description: Upload bandwidth limit in bytes/sec (VM→external)
           example: 125000000
+        disk_io_bps:
+          type: integer
+          format: int64
+          description: Disk I/O bandwidth limit in bytes/sec
+          example: 104857600

     Resources:
       type: object
@@ -1071,6 +1076,8 @@ components:
           $ref: "#/components/schemas/ResourceStatus"
         network:
           $ref: "#/components/schemas/ResourceStatus"
+        disk_io:
+          $ref: "#/components/schemas/ResourceStatus"
         disk_breakdown:
           $ref: "#/components/schemas/DiskBreakdown"
         gpu:

From ca5e720a89022a93fb1ce2d5738c834965e3ad99 Mon Sep 17 00:00:00 2001
From: Steven Miller
Date: Fri, 30 Jan 2026 14:48:44 -0500
Subject: [PATCH 5/5] Also check resources on restore

---
 lib/instances/restore.go | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/instances/restore.go b/lib/instances/restore.go
index 8acc6466..ed81fbfd 100644
--- a/lib/instances/restore.go
+++ b/lib/instances/restore.go
@@ -51,6 +51,16 @@ func (m *manager) restoreInstance(
 		return nil, fmt.Errorf("no snapshot available for instance %s", id)
 	}

+	// 2b. Validate aggregate resource limits before allocating resources (if configured)
+	if m.resourceValidator != nil {
+		needsGPU := stored.GPUProfile != ""
+		totalMemory := stored.Size + stored.HotplugSize
+		if err := m.resourceValidator.ValidateAllocation(ctx, stored.Vcpus, totalMemory, stored.NetworkBandwidthDownload, stored.NetworkBandwidthUpload, stored.DiskIOBps, needsGPU); err != nil {
+			log.ErrorContext(ctx, "resource validation failed for restore", "instance_id", id, "error", err)
+			return nil, fmt.Errorf("%w: %v", ErrInsufficientResources, err)
+		}
+	}
+
 	// 3. Get snapshot directory
 	snapshotDir := m.paths.InstanceSnapshotLatest(id)