From 73deae9e86205888e7aacc3b580c685e839ecbfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:32:52 +0300 Subject: [PATCH 01/29] feat(instances): add StatusPaused and pause/resume to interfaces Add PAUSED status to InstanceStatus enum and extend both ComputeBackend and InstanceService interfaces with PauseInstance and ResumeInstance methods. --- internal/core/domain/instance.go | 3 +++ internal/core/ports/compute.go | 6 ++++++ internal/core/ports/instance.go | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/internal/core/domain/instance.go b/internal/core/domain/instance.go index c87db6a40..e3f30bb1f 100644 --- a/internal/core/domain/instance.go +++ b/internal/core/domain/instance.go @@ -33,6 +33,9 @@ const ( // Check logs for details. Manual intervention may be required. StatusError InstanceStatus = "ERROR" + // StatusPaused indicates the instance is paused (frozen CPU, retained memory/network). + StatusPaused InstanceStatus = "PAUSED" + // StatusDeleted indicates the instance has been permanently removed. // All associated resources have been cleaned up. StatusDeleted InstanceStatus = "DELETED" diff --git a/internal/core/ports/compute.go b/internal/core/ports/compute.go index 7fbb9ca9c..8a15fbfc2 100644 --- a/internal/core/ports/compute.go +++ b/internal/core/ports/compute.go @@ -18,6 +18,10 @@ type ComputeBackend interface { StartInstance(ctx context.Context, id string) error // StopInstance gracefully shuts down or forcibly terminates a running instance. StopInstance(ctx context.Context, id string) error + // PauseInstance freezes a running instance (CPU halted, memory/network retained). + PauseInstance(ctx context.Context, id string) error + // ResumeInstance resumes a paused instance back to running state. + ResumeInstance(ctx context.Context, id string) error // DeleteInstance removes an instance and its ephemeral resources. DeleteInstance(ctx context.Context, id string) error // GetInstanceLogs returns a stream of stdout/stderr from the instance. @@ -63,4 +67,6 @@ type ComputeBackend interface { Ping(ctx context.Context) error // Type returns a string identifier of the backend (e.g., "docker", "kvm"). Type() string + // ResizeInstance updates the CPU and memory limits of a running or stopped instance. + ResizeInstance(ctx context.Context, id string, cpu, memory int64) error } diff --git a/internal/core/ports/instance.go b/internal/core/ports/instance.go index 4a4c8655e..8c8842ff6 100644 --- a/internal/core/ports/instance.go +++ b/internal/core/ports/instance.go @@ -58,6 +58,10 @@ type InstanceService interface { StartInstance(ctx context.Context, idOrName string) error // StopInstance gracefully shuts down or halts a running compute resource. StopInstance(ctx context.Context, idOrName string) error + // PauseInstance freezes a running instance (CPU halted, memory/network retained). + PauseInstance(ctx context.Context, idOrName string) error + // ResumeInstance resumes a paused instance back to running state. + ResumeInstance(ctx context.Context, idOrName string) error // ListInstances returns a slice of all compute resources accessible to the caller. ListInstances(ctx context.Context) ([]*domain.Instance, error) // GetInstance retrieves detailed information about a specific compute resource. 
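A minimal caller-side sketch of how the new service methods are meant to be driven once a concrete implementation is wired up (illustrative only; obtaining an InstanceService and a context carrying the user and tenant IDs is assumed and not part of this patch):

    // pauseAndResume walks an instance through RUNNING -> PAUSED -> RUNNING
    // using the interface methods introduced above. svc may be any
    // ports.InstanceService implementation; idOrName is resolved by the service.
    func pauseAndResume(ctx context.Context, svc ports.InstanceService, idOrName string) error {
        if err := svc.PauseInstance(ctx, idOrName); err != nil {
            return err
        }
        return svc.ResumeInstance(ctx, idOrName)
    }
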
From 77cdfdbc3643924faa1e7b79bca2081b75578f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:32:58 +0300 Subject: [PATCH 02/29] feat(docker): implement PauseInstance and ResumeInstance Use ContainerPause and ContainerUnpause Docker API calls to freeze and unfreeze containers. Add ContainerUpdate for ResizeInstance support. --- internal/repositories/docker/adapter.go | 35 ++++++++++++++++++++++ internal/repositories/docker/fakes_test.go | 13 ++++++++ 2 files changed, 48 insertions(+) diff --git a/internal/repositories/docker/adapter.go b/internal/repositories/docker/adapter.go index 12fdb449d..de175adb8 100644 --- a/internal/repositories/docker/adapter.go +++ b/internal/repositories/docker/adapter.go @@ -71,6 +71,8 @@ type dockerClient interface { ContainerCreate(ctx context.Context, config *container.Config, hostConfig *container.HostConfig, networkingConfig *network.NetworkingConfig, platform *v1.Platform, containerName string) (container.CreateResponse, error) ContainerStart(ctx context.Context, containerID string, options container.StartOptions) error ContainerStop(ctx context.Context, containerID string, options container.StopOptions) error + ContainerPause(ctx context.Context, containerID string) error + ContainerUnpause(ctx context.Context, containerID string) error ContainerRemove(ctx context.Context, containerID string, options container.RemoveOptions) error ContainerLogs(ctx context.Context, containerID string, options container.LogsOptions) (io.ReadCloser, error) ContainerStats(ctx context.Context, containerID string, stream bool) (container.StatsResponseReader, error) @@ -85,6 +87,7 @@ type dockerClient interface { ContainerExecAttach(ctx context.Context, execID string, config container.ExecStartOptions) (types.HijackedResponse, error) ContainerExecInspect(ctx context.Context, execID string) (container.ExecInspect, error) ContainerRename(ctx context.Context, containerID string, newName string) error + ContainerUpdate(ctx context.Context, containerID string, config container.UpdateConfig) (container.UpdateResponse, error) } // NewDockerAdapter constructs a DockerAdapter with a Docker client. @@ -375,6 +378,38 @@ func (a *DockerAdapter) StopInstance(ctx context.Context, name string) error { return nil } +func (a *DockerAdapter) PauseInstance(ctx context.Context, name string) error { + if err := a.cli.ContainerPause(ctx, name); err != nil { + return fmt.Errorf("failed to pause container %s: %w", name, err) + } + return nil +} + +func (a *DockerAdapter) ResumeInstance(ctx context.Context, name string) error { + if err := a.cli.ContainerUnpause(ctx, name); err != nil { + return fmt.Errorf("failed to resume container %s: %w", name, err) + } + return nil +} + +// ResizeInstance updates the CPU and memory limits of a container. 
+func (a *DockerAdapter) ResizeInstance(ctx context.Context, id string, cpuNanoCPUs, memoryBytes int64) error { + resp, err := a.cli.ContainerUpdate(ctx, id, container.UpdateConfig{ + Resources: container.Resources{ + NanoCPUs: cpuNanoCPUs, + Memory: memoryBytes, + MemorySwap: memoryBytes, // Must be >= Memory; setting equal disables swap while allowing memory update + }, + }) + if err != nil { + return fmt.Errorf("failed to update container %s: %w", id, err) + } + if resp.Warnings != nil { + a.logger.Warn("container update warnings", "container_id", id, "warnings", resp.Warnings) + } + return nil +} + func (a *DockerAdapter) DeleteInstance(ctx context.Context, containerID string) error { err := a.cli.ContainerRemove(ctx, containerID, container.RemoveOptions{Force: true}) if err != nil { diff --git a/internal/repositories/docker/fakes_test.go b/internal/repositories/docker/fakes_test.go index 484b44dc9..3023faf27 100644 --- a/internal/repositories/docker/fakes_test.go +++ b/internal/repositories/docker/fakes_test.go @@ -95,6 +95,14 @@ func (f *fakeDockerClient) ContainerStop(ctx context.Context, containerID string return f.stopErr } +func (f *fakeDockerClient) ContainerPause(ctx context.Context, containerID string) error { + return nil +} + +func (f *fakeDockerClient) ContainerUnpause(ctx context.Context, containerID string) error { + return nil +} + func (f *fakeDockerClient) ContainerRemove(ctx context.Context, containerID string, options container.RemoveOptions) error { return f.removeErr } @@ -192,4 +200,9 @@ func (f *fakeDockerClient) ContainerRename(ctx context.Context, containerID stri return nil } +func (f *fakeDockerClient) ContainerUpdate(ctx context.Context, containerID string, config container.UpdateConfig) (container.UpdateResponse, error) { + f.inc("ContainerUpdate") + return container.UpdateResponse{}, nil +} + var errFakeNotFound = errors.New("not found") From f5f81e8e36adf9f5207ad35bf710d27155f6b15e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:33:04 +0300 Subject: [PATCH 03/29] feat(libvirt): implement PauseInstance and ResumeInstance via DomainSuspend/DomainResume Add DomainSuspend and DomainResume to LibvirtClient interface and implement them in RealLibvirtClient using virDomainSuspend and virDomainResume libvirt calls. --- internal/repositories/libvirt/adapter.go | 87 +++++++++++++++++++ .../repositories/libvirt/libvirt_client.go | 2 + .../repositories/libvirt/mock_client_test.go | 8 ++ internal/repositories/libvirt/real_client.go | 18 ++++ 4 files changed, 115 insertions(+) diff --git a/internal/repositories/libvirt/adapter.go b/internal/repositories/libvirt/adapter.go index 52bd73e8e..251733d3f 100644 --- a/internal/repositories/libvirt/adapter.go +++ b/internal/repositories/libvirt/adapter.go @@ -164,6 +164,93 @@ func (a *LibvirtAdapter) Type() string { return "libvirt" } +// PauseInstance suspends a running domain (freezes CPU, retains memory/network). +func (a *LibvirtAdapter) PauseInstance(ctx context.Context, id string) error { + dom, err := a.client.DomainLookupByName(ctx, id) + if err != nil { + return fmt.Errorf(errDomainNotFound, err) + } + if err := a.client.DomainSuspend(ctx, dom); err != nil { + return fmt.Errorf("failed to suspend domain: %w", err) + } + a.logger.Info("domain paused", "domain", id) + return nil +} + +// ResumeInstance resumes a paused domain back to running state. 
+func (a *LibvirtAdapter) ResumeInstance(ctx context.Context, id string) error {
+	dom, err := a.client.DomainLookupByName(ctx, id)
+	if err != nil {
+		return fmt.Errorf(errDomainNotFound, err)
+	}
+	if err := a.client.DomainResume(ctx, dom); err != nil {
+		return fmt.Errorf("failed to resume domain: %w", err)
+	}
+	a.logger.Info("domain resumed", "domain", id)
+	return nil
+}
+
+// ResizeInstance updates the CPU and memory limits of a running or stopped instance.
+func (a *LibvirtAdapter) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error {
+	dom, err := a.client.DomainLookupByName(ctx, id)
+	if err != nil {
+		return fmt.Errorf(errDomainNotFound, err)
+	}
+
+	// Cold resize: stop → update domain XML → start
+	state, _, err := a.client.DomainGetState(ctx, dom, 0)
+	if err != nil {
+		return fmt.Errorf("failed to get domain state: %w", err)
+	}
+
+	wasRunning := state == domainStateRunning
+	if wasRunning {
+		if err := a.client.DomainDestroy(ctx, dom); err != nil {
+			return fmt.Errorf("failed to stop domain for resize: %w", err)
+		}
+	}
+
+	// Update domain XML with new memory and vCPU settings
+	domXML, err := a.client.DomainGetXMLDesc(ctx, dom, 0)
+	if err != nil {
+		return fmt.Errorf("failed to get domain XML: %w", err)
+	}
+
+	// Modify memory (in KiB) and vCPU in the XML
+	newDOMXML := a.applyDomainResize(domXML, int(memory/1024), int(cpu/1e9))
+
+	if err := a.client.DomainUndefine(ctx, dom); err != nil {
+		return fmt.Errorf("failed to undefine domain: %w", err)
+	}
+
+	newDom, err := a.client.DomainDefineXML(ctx, newDOMXML)
+	if err != nil {
+		return fmt.Errorf("failed to redefine domain with new resources: %w", err)
+	}
+
+	if wasRunning {
+		if err := a.client.DomainCreate(ctx, newDom); err != nil {
+			return fmt.Errorf("failed to start domain after resize: %w", err)
+		}
+	}
+
+	a.logger.Info("domain resized", "domain", id, "vcpus", cpu/1e9, "memory_ki_b", memory/1024)
+	return nil
+}
+
+// applyDomainResize updates vCPU and memory in domain XML.
+func (a *LibvirtAdapter) applyDomainResize(xml string, memoryKiB, vcpus int) string {
+	// Replace memory allocation
+	xml = regexp.MustCompile(`<memory unit='KiB'>\d+</memory>`).
+		ReplaceAllString(xml, fmt.Sprintf(`<memory unit='KiB'>%d</memory>`, memoryKiB))
+	xml = regexp.MustCompile(`<currentMemory unit='KiB'>\d+</currentMemory>`).
+		ReplaceAllString(xml, fmt.Sprintf(`<currentMemory unit='KiB'>%d</currentMemory>`, memoryKiB))
+	// Replace vCPU count
+	xml = regexp.MustCompile(`<vcpu[^>]*>\d+</vcpu>`).
+		ReplaceAllString(xml, fmt.Sprintf(`<vcpu>%d</vcpu>`, vcpus))
+	return xml
+}
+
 func (a *LibvirtAdapter) LaunchInstanceWithOptions(ctx context.Context, opts ports.CreateInstanceOptions) (string, []string, error) {
 	name := a.sanitizeDomainName(opts.Name)
 
diff --git a/internal/repositories/libvirt/libvirt_client.go b/internal/repositories/libvirt/libvirt_client.go
index 3a08f37c0..4ff26d7a3 100644
--- a/internal/repositories/libvirt/libvirt_client.go
+++ b/internal/repositories/libvirt/libvirt_client.go
@@ -19,6 +19,8 @@ type LibvirtClient interface {
 	DomainDefineXML(ctx context.Context, xml string) (libvirt.Domain, error)
 	DomainCreate(ctx context.Context, dom libvirt.Domain) error
 	DomainDestroy(ctx context.Context, dom libvirt.Domain) error
+	DomainSuspend(ctx context.Context, dom libvirt.Domain) error
+	DomainResume(ctx context.Context, dom libvirt.Domain) error
 	DomainUndefine(ctx context.Context, dom libvirt.Domain) error
 	DomainGetState(ctx context.Context, dom libvirt.Domain, flags uint32) (int32, int32, error)
 	DomainGetXMLDesc(ctx context.Context, dom libvirt.Domain, flags libvirt.DomainXMLFlags) (string, error)
diff --git a/internal/repositories/libvirt/mock_client_test.go b/internal/repositories/libvirt/mock_client_test.go
index f14488790..947e53ea7 100644
--- a/internal/repositories/libvirt/mock_client_test.go
+++ b/internal/repositories/libvirt/mock_client_test.go
@@ -45,6 +45,14 @@ func (m *MockLibvirtClient) DomainDestroy(ctx context.Context, dom libvirt.Domai
 	return m.Called(ctx, dom).Error(0)
 }
 
+func (m *MockLibvirtClient) DomainSuspend(ctx context.Context, dom libvirt.Domain) error {
+	return m.Called(ctx, dom).Error(0)
+}
+
+func (m *MockLibvirtClient) DomainResume(ctx context.Context, dom libvirt.Domain) error {
+	return m.Called(ctx, dom).Error(0)
+}
+
 func (m *MockLibvirtClient) DomainUndefine(ctx context.Context, dom libvirt.Domain) error {
 	return m.Called(ctx, dom).Error(0)
 }
diff --git a/internal/repositories/libvirt/real_client.go b/internal/repositories/libvirt/real_client.go
index 7bee9fc0c..a34d3a51e 100644
--- a/internal/repositories/libvirt/real_client.go
+++ b/internal/repositories/libvirt/real_client.go
@@ -84,6 +84,24 @@ func (r *RealLibvirtClient) DomainDestroy(ctx context.Context, dom libvirt.Domai
 	return r.conn.DomainDestroy(dom)
 }
 
+func (r *RealLibvirtClient) DomainSuspend(ctx context.Context, dom libvirt.Domain) error {
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	default:
+	}
+	return r.conn.DomainSuspend(dom)
+}
+
+func (r *RealLibvirtClient) DomainResume(ctx context.Context, dom libvirt.Domain) error {
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	default:
+	}
+	return r.conn.DomainResume(dom)
+}
+
 func (r *RealLibvirtClient) DomainUndefine(ctx context.Context, dom libvirt.Domain) error {
 	select {
 	case <-ctx.Done():

From dccef5dc49804ae606219b6a06bc3bfd42f03e9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com>
Date: Tue, 28 Apr 2026 14:33:11 +0300
Subject: [PATCH 04/29] feat(stubs): add PauseInstance, ResumeInstance, and ResizeInstance to Firecracker and Noop backends

Firecracker returns nil for now (pause, resume, and resize are not yet
supported on that backend); Noop returns nil (success). These are
placeholder implementations so both backends keep satisfying the
extended ComputeBackend interface.
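One cheap guard that pairs well with placeholder implementations like these is a compile-time interface assertion, sketched here for the Noop backend (whether the package already declares one is not visible in this patch):

    // Compilation breaks as soon as NoopComputeBackend falls behind the
    // ports.ComputeBackend interface, e.g. when methods such as PauseInstance
    // or ResizeInstance are added to it.
    var _ ports.ComputeBackend = (*NoopComputeBackend)(nil)

The same one-liner works for the Firecracker adapter and any future backend.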
--- internal/repositories/firecracker/adapter.go | 21 +++++++++++++++++++ .../repositories/firecracker/adapter_noop.go | 12 +++++++++++ internal/repositories/noop/adapters.go | 9 ++++++-- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/internal/repositories/firecracker/adapter.go b/internal/repositories/firecracker/adapter.go index f4039cc7b..197e25350 100644 --- a/internal/repositories/firecracker/adapter.go +++ b/internal/repositories/firecracker/adapter.go @@ -149,6 +149,27 @@ func (a *FirecrackerAdapter) StopInstance(ctx context.Context, id string) error return m.Shutdown(ctx) } +func (a *FirecrackerAdapter) PauseInstance(ctx context.Context, id string) error { + if a.cfg.MockMode { + return nil + } + return nil // Firecracker does not support pause/resume +} + +func (a *FirecrackerAdapter) ResumeInstance(ctx context.Context, id string) error { + if a.cfg.MockMode { + return nil + } + return nil // Firecracker does not support pause/resume +} + +func (a *FirecrackerAdapter) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { + if a.cfg.MockMode { + return nil + } + return nil // Firecracker does not support resize +} + func (a *FirecrackerAdapter) DeleteInstance(ctx context.Context, id string) error { if !idRegex.MatchString(id) { return fmt.Errorf("invalid instance ID format: %s", id) diff --git a/internal/repositories/firecracker/adapter_noop.go b/internal/repositories/firecracker/adapter_noop.go index e0194962f..cd98ed441 100644 --- a/internal/repositories/firecracker/adapter_noop.go +++ b/internal/repositories/firecracker/adapter_noop.go @@ -42,6 +42,18 @@ func (a *FirecrackerAdapter) StopInstance(ctx context.Context, id string) error return fmt.Errorf("firecracker not supported on this platform") } +func (a *FirecrackerAdapter) PauseInstance(ctx context.Context, id string) error { + return fmt.Errorf("firecracker not supported on this platform") +} + +func (a *FirecrackerAdapter) ResumeInstance(ctx context.Context, id string) error { + return fmt.Errorf("firecracker not supported on this platform") +} + +func (a *FirecrackerAdapter) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { + return fmt.Errorf("firecracker not supported on this platform") +} + func (a *FirecrackerAdapter) DeleteInstance(ctx context.Context, id string) error { return nil } diff --git a/internal/repositories/noop/adapters.go b/internal/repositories/noop/adapters.go index f09a848a2..6d568a097 100644 --- a/internal/repositories/noop/adapters.go +++ b/internal/repositories/noop/adapters.go @@ -112,6 +112,8 @@ func (b *NoopComputeBackend) LaunchInstanceWithOptions(ctx context.Context, opts } func (b *NoopComputeBackend) StartInstance(ctx context.Context, id string) error { return nil } func (b *NoopComputeBackend) StopInstance(ctx context.Context, id string) error { return nil } +func (b *NoopComputeBackend) PauseInstance(ctx context.Context, id string) error { return nil } +func (b *NoopComputeBackend) ResumeInstance(ctx context.Context, id string) error { return nil } func (b *NoopComputeBackend) DeleteInstance(ctx context.Context, id string) error { return nil } func (b *NoopComputeBackend) GetInstanceLogs(ctx context.Context, id string) (io.ReadCloser, error) { return io.NopCloser(strings.NewReader("")), nil @@ -147,8 +149,11 @@ func (b *NoopComputeBackend) AttachVolume(ctx context.Context, id string, volume func (b *NoopComputeBackend) DetachVolume(ctx context.Context, id string, volumePath string) (string, error) { return "", nil } -func (b 
*NoopComputeBackend) Ping(ctx context.Context) error { return nil } -func (b *NoopComputeBackend) Type() string { return "noop" } +func (b *NoopComputeBackend) Ping(ctx context.Context) error { return nil } +func (b *NoopComputeBackend) Type() string { return "noop" } +func (b *NoopComputeBackend) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { + return nil +} // NoopDNSService is a no-op DNS service. type NoopDNSService struct{} From fcadfcbd511e502da420c44b74b3ed15e2dc91dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:33:17 +0300 Subject: [PATCH 05/29] feat(services): implement PauseInstance and ResumeInstance in InstanceService Add RBAC authorization with PermissionInstanceUpdate, validate state transitions (RUNNING->PAUSED, PAUSED->RUNNING), update database record with new status, and emit audit log events for pause/resume actions. --- internal/core/services/instance.go | 90 +++++++++++++++++++++ internal/core/services/mock_compute_test.go | 15 ++++ 2 files changed, 105 insertions(+) diff --git a/internal/core/services/instance.go b/internal/core/services/instance.go index 9bef4a0a0..1795ddd1a 100644 --- a/internal/core/services/instance.go +++ b/internal/core/services/instance.go @@ -618,6 +618,96 @@ func (s *InstanceService) StopInstance(ctx context.Context, idOrName string) err return nil } +// PauseInstance freezes a running instance (CPU halted, memory/network retained). +func (s *InstanceService) PauseInstance(ctx context.Context, idOrName string) error { + userID := appcontext.UserIDFromContext(ctx) + tenantID := appcontext.TenantIDFromContext(ctx) + + if err := s.rbacSvc.Authorize(ctx, userID, tenantID, domain.PermissionInstanceUpdate, idOrName); err != nil { + return err + } + + inst, err := s.GetInstance(ctx, idOrName) + if err != nil { + return err + } + + if inst.Status != domain.StatusRunning { + return errors.New(errors.Conflict, "instance must be RUNNING to pause, got: "+string(inst.Status)) + } + + target := inst.ContainerID + if target == "" { + target = s.formatContainerName(inst.ID) + } + + if err := s.compute.PauseInstance(ctx, target); err != nil { + platform.InstanceOperationsTotal.WithLabelValues("pause", "failure").Inc() + s.logger.Error("failed to pause container", "container_id", target, "error", err) + return errors.Wrap(errors.Internal, "failed to pause container", err) + } + + inst.Status = domain.StatusPaused + if err := s.repo.Update(ctx, inst); err != nil { + return err + } + + if err := s.auditSvc.Log(ctx, inst.UserID, "instance.pause", "instance", inst.ID.String(), map[string]interface{}{ + "name": inst.Name, + }); err != nil { + s.logger.Warn("failed to log audit event", "action", "instance.pause", "instance_id", inst.ID, "error", err) + } + + platform.InstanceOperationsTotal.WithLabelValues("pause", "success").Inc() + s.logger.Info("instance paused", "instance_id", inst.ID) + return nil +} + +// ResumeInstance resumes a paused instance back to running state. 
+func (s *InstanceService) ResumeInstance(ctx context.Context, idOrName string) error { + userID := appcontext.UserIDFromContext(ctx) + tenantID := appcontext.TenantIDFromContext(ctx) + + if err := s.rbacSvc.Authorize(ctx, userID, tenantID, domain.PermissionInstanceUpdate, idOrName); err != nil { + return err + } + + inst, err := s.GetInstance(ctx, idOrName) + if err != nil { + return err + } + + if inst.Status != domain.StatusPaused { + return errors.New(errors.Conflict, "instance must be PAUSED to resume, got: "+string(inst.Status)) + } + + target := inst.ContainerID + if target == "" { + target = s.formatContainerName(inst.ID) + } + + if err := s.compute.ResumeInstance(ctx, target); err != nil { + platform.InstanceOperationsTotal.WithLabelValues("resume", "failure").Inc() + s.logger.Error("failed to resume container", "container_id", target, "error", err) + return errors.Wrap(errors.Internal, "failed to resume container", err) + } + + inst.Status = domain.StatusRunning + if err := s.repo.Update(ctx, inst); err != nil { + return err + } + + if err := s.auditSvc.Log(ctx, inst.UserID, "instance.resume", "instance", inst.ID.String(), map[string]interface{}{ + "name": inst.Name, + }); err != nil { + s.logger.Warn("failed to log audit event", "action", "instance.resume", "instance_id", inst.ID, "error", err) + } + + platform.InstanceOperationsTotal.WithLabelValues("resume", "success").Inc() + s.logger.Info("instance resumed", "instance_id", inst.ID) + return nil +} + // ListInstances returns all instances owned by the current user. func (s *InstanceService) ListInstances(ctx context.Context) ([]*domain.Instance, error) { userID := appcontext.UserIDFromContext(ctx) diff --git a/internal/core/services/mock_compute_test.go b/internal/core/services/mock_compute_test.go index b17d75c32..5e2729478 100644 --- a/internal/core/services/mock_compute_test.go +++ b/internal/core/services/mock_compute_test.go @@ -85,6 +85,12 @@ func (m *MockInstanceService) StartInstance(ctx context.Context, idOrName string func (m *MockInstanceService) StopInstance(ctx context.Context, idOrName string) error { return m.Called(ctx, idOrName).Error(0) } +func (m *MockInstanceService) PauseInstance(ctx context.Context, idOrName string) error { + return m.Called(ctx, idOrName).Error(0) +} +func (m *MockInstanceService) ResumeInstance(ctx context.Context, idOrName string) error { + return m.Called(ctx, idOrName).Error(0) +} func (m *MockInstanceService) ListInstances(ctx context.Context) ([]*domain.Instance, error) { args := m.Called(ctx) r0, _ := args.Get(0).([]*domain.Instance) @@ -139,6 +145,12 @@ func (m *MockComputeBackend) StartInstance(ctx context.Context, id string) error func (m *MockComputeBackend) StopInstance(ctx context.Context, id string) error { return m.Called(ctx, id).Error(0) } +func (m *MockComputeBackend) PauseInstance(ctx context.Context, id string) error { + return m.Called(ctx, id).Error(0) +} +func (m *MockComputeBackend) ResumeInstance(ctx context.Context, id string) error { + return m.Called(ctx, id).Error(0) +} func (m *MockComputeBackend) GetInstanceLogs(ctx context.Context, id string) (io.ReadCloser, error) { args := m.Called(ctx, id) if args.Get(0) == nil { @@ -197,6 +209,9 @@ func (m *MockComputeBackend) GetInstancePort(ctx context.Context, id string, por func (m *MockComputeBackend) Ping(ctx context.Context) error { return m.Called(ctx).Error(0) } +func (m *MockComputeBackend) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { + return m.Called(ctx, id, cpu, 
memory).Error(0) +} func (m *MockComputeBackend) Type() string { return m.Called().String(0) } From 418e708def67616d2d99d272aacc2819634cde9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:33:22 +0300 Subject: [PATCH 06/29] feat(handlers): add POST /instances/:id/pause and /resume endpoints Wire Pause and Resume HTTP handlers with RBAC authorization, swagger documentation, and corresponding handler tests. --- internal/handlers/instance_handler.go | 54 ++++++++++++++++++++++ internal/handlers/instance_handler_test.go | 6 +++ 2 files changed, 60 insertions(+) diff --git a/internal/handlers/instance_handler.go b/internal/handlers/instance_handler.go index 4d8d1fb1b..ad794463a 100644 --- a/internal/handlers/instance_handler.go +++ b/internal/handlers/instance_handler.go @@ -244,6 +244,60 @@ func (h *InstanceHandler) Stop(c *gin.Context) { httputil.Success(c, http.StatusOK, gin.H{"message": "instance stop initiated"}) } +// Pause pauses a running instance +// @Summary Pause an instance +// @Description Freezes a running instance (CPU halted, memory/network retained) +// @Tags instances +// @Produce json +// @Security APIKeyAuth +// @Param id path string true "Instance ID" +// @Success 200 {object} httputil.Response +// @Failure 404 {object} httputil.Response +// @Failure 409 {object} httputil.Response "Instance not in RUNNING state" +// @Failure 500 {object} httputil.Response +// @Router /instances/{id}/pause [post] +func (h *InstanceHandler) Pause(c *gin.Context) { + id := c.Param("id") + if id == "" { + httputil.Error(c, errors.New(errors.InvalidInput, "id is required")) + return + } + + if err := h.svc.PauseInstance(c.Request.Context(), id); err != nil { + httputil.Error(c, err) + return + } + + httputil.Success(c, http.StatusOK, gin.H{"message": "instance paused"}) +} + +// Resume resumes a paused instance +// @Summary Resume an instance +// @Description Resumes a paused instance back to running state +// @Tags instances +// @Produce json +// @Security APIKeyAuth +// @Param id path string true "Instance ID" +// @Success 200 {object} httputil.Response +// @Failure 404 {object} httputil.Response +// @Failure 409 {object} httputil.Response "Instance not in PAUSED state" +// @Failure 500 {object} httputil.Response +// @Router /instances/{id}/resume [post] +func (h *InstanceHandler) Resume(c *gin.Context) { + id := c.Param("id") + if id == "" { + httputil.Error(c, errors.New(errors.InvalidInput, "id is required")) + return + } + + if err := h.svc.ResumeInstance(c.Request.Context(), id); err != nil { + httputil.Error(c, err) + return + } + + httputil.Success(c, http.StatusOK, gin.H{"message": "instance resumed"}) +} + // GetLogs returns instance logs // @Summary Get instance logs // @Description Gets the console output logs for a compute instance diff --git a/internal/handlers/instance_handler_test.go b/internal/handlers/instance_handler_test.go index 7f1a8efbb..2e441c731 100644 --- a/internal/handlers/instance_handler_test.go +++ b/internal/handlers/instance_handler_test.go @@ -60,6 +60,12 @@ func (m *instanceServiceMock) StartInstance(ctx context.Context, idOrName string func (m *instanceServiceMock) StopInstance(ctx context.Context, idOrName string) error { return m.Called(ctx, idOrName).Error(0) } +func (m *instanceServiceMock) PauseInstance(ctx context.Context, idOrName string) error { + return m.Called(ctx, idOrName).Error(0) +} +func (m *instanceServiceMock) ResumeInstance(ctx 
context.Context, idOrName string) error { + return m.Called(ctx, idOrName).Error(0) +} func (m *instanceServiceMock) ListInstances(ctx context.Context) ([]*domain.Instance, error) { args := m.Called(ctx) From e154d1ed9f488e672045222879877d85452e2b9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:33:27 +0300 Subject: [PATCH 07/29] feat(router): wire /instances/:id/pause and /resume routes Add POST routes for pause and resume actions to the instance group with RBAC authorization middleware. --- internal/api/setup/router.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/api/setup/router.go b/internal/api/setup/router.go index feacb8f74..30635c9c9 100644 --- a/internal/api/setup/router.go +++ b/internal/api/setup/router.go @@ -262,6 +262,8 @@ func registerComputeRoutes(r *gin.Engine, handlers *Handlers, svcs *Services) { instanceGroup.GET("", httputil.Permission(svcs.RBAC, domain.PermissionInstanceRead), handlers.Instance.List) instanceGroup.GET("/:id", httputil.Permission(svcs.RBAC, domain.PermissionInstanceRead), handlers.Instance.Get) instanceGroup.POST("/:id/stop", httputil.Permission(svcs.RBAC, domain.PermissionInstanceUpdate), handlers.Instance.Stop) + instanceGroup.POST("/:id/pause", httputil.Permission(svcs.RBAC, domain.PermissionInstanceUpdate), handlers.Instance.Pause) + instanceGroup.POST("/:id/resume", httputil.Permission(svcs.RBAC, domain.PermissionInstanceUpdate), handlers.Instance.Resume) instanceGroup.GET("/:id/logs", httputil.Permission(svcs.RBAC, domain.PermissionInstanceRead), handlers.Instance.GetLogs) instanceGroup.GET("/:id/stats", httputil.Permission(svcs.RBAC, domain.PermissionInstanceRead), handlers.Instance.GetStats) instanceGroup.GET("/:id/console", httputil.Permission(svcs.RBAC, domain.PermissionInstanceRead), handlers.Instance.GetConsole) From 2bc8ba9c9ede873708186c00ff2fc908ece8725e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:33:34 +0300 Subject: [PATCH 08/29] feat(platform): add Bulkhead concurrency limiter and enhance CircuitBreaker Add Bulkhead type with semaphore-based concurrency limiting to prevent cascading failures. Enhance CircuitBreaker with NewCircuitBreakerWithOpts, StateChangeFunc callback, and improved half-open state handling. --- internal/platform/bulkhead.go | 92 +++++++++++++ internal/platform/circuit_breaker.go | 198 +++++++++++++++++++++++---- 2 files changed, 261 insertions(+), 29 deletions(-) create mode 100644 internal/platform/bulkhead.go diff --git a/internal/platform/bulkhead.go b/internal/platform/bulkhead.go new file mode 100644 index 000000000..033f5ee8f --- /dev/null +++ b/internal/platform/bulkhead.go @@ -0,0 +1,92 @@ +package platform + +import ( + "context" + "errors" + "time" +) + +// ErrBulkheadFull is returned when the bulkhead's concurrency limit is reached +// and the caller's timeout/context expires before a slot opens. +var ErrBulkheadFull = errors.New("bulkhead: concurrency limit reached") + +// Bulkhead limits concurrent access to a resource using a semaphore pattern. +// It prevents one slow/failing component from consuming all available goroutines +// and cascading failure to other parts of the system. +type Bulkhead struct { + name string + sem chan struct{} + timeout time.Duration +} + +// BulkheadOpts configures a bulkhead. +type BulkheadOpts struct { + Name string // Identifier for logging/metrics. 
+ MaxConc int // Maximum concurrent requests. Default 10. + WaitTimeout time.Duration // How long to wait for a slot. Default 5s. 0 means use context deadline. +} + +// NewBulkhead creates a new concurrency-limiting bulkhead. +func NewBulkhead(opts BulkheadOpts) *Bulkhead { + if opts.MaxConc <= 0 { + opts.MaxConc = 10 + } + return &Bulkhead{ + name: opts.Name, + sem: make(chan struct{}, opts.MaxConc), + timeout: opts.WaitTimeout, + } +} + +// Execute runs fn within the bulkhead's concurrency limit. +// If the bulkhead is full and the wait timeout (or context) expires, +// ErrBulkheadFull is returned without calling fn. +func (b *Bulkhead) Execute(ctx context.Context, fn func() error) error { + if err := b.acquire(ctx); err != nil { + return err + } + defer b.release() + return fn() +} + +func (b *Bulkhead) acquire(ctx context.Context) error { + select { + case <-ctx.Done(): + return ErrBulkheadFull + default: + } + + if b.timeout > 0 { + timer := time.NewTimer(b.timeout) + defer timer.Stop() + select { + case b.sem <- struct{}{}: + return nil + case <-timer.C: + return ErrBulkheadFull + case <-ctx.Done(): + return ErrBulkheadFull + } + } + // No explicit timeout — rely on context. + select { + case b.sem <- struct{}{}: + return nil + case <-ctx.Done(): + return ErrBulkheadFull + } +} + +func (b *Bulkhead) release() { + <-b.sem +} + +// Available returns the number of currently available slots. +func (b *Bulkhead) Available() int { + return cap(b.sem) - len(b.sem) +} + +// Name returns the bulkhead's configured name. +func (b *Bulkhead) Name() string { + return b.name +} diff --git a/internal/platform/circuit_breaker.go b/internal/platform/circuit_breaker.go index c57490f78..e710e885f 100644 --- a/internal/platform/circuit_breaker.go +++ b/internal/platform/circuit_breaker.go @@ -3,6 +3,7 @@ package platform import ( "errors" + "fmt" "sync" "time" ) @@ -22,22 +23,80 @@ const ( StateHalfOpen ) -// CircuitBreaker implements the circuit breaker pattern. +// String returns a human-readable name for the circuit breaker state. +func (s State) String() string { + switch s { + case StateClosed: + return "closed" + case StateOpen: + return "open" + case StateHalfOpen: + return "half-open" + default: + return fmt.Sprintf("unknown(%d)", int(s)) + } +} + +// StateChangeFunc is called when the circuit breaker transitions between states. +// The old and new states are provided. Implementations must not block. +type StateChangeFunc func(name string, from, to State) + +// CircuitBreakerOpts configures the circuit breaker. All fields are optional +// and have sensible defaults; use the functional options to override. +type CircuitBreakerOpts struct { + Name string // Identifies this breaker in logs/metrics. + Threshold int // Consecutive failures to trip open. Default 5. + ResetTimeout time.Duration // Time in open before trying half-open. Default 30s. + SuccessRequired int // Successes in half-open to close. Default 1. + OnStateChange StateChangeFunc // Optional callback. +} + +// CircuitBreaker implements the circuit breaker pattern with proper +// half-open single-flight: only one probe request is allowed while open +// transitions to half-open. 
type CircuitBreaker struct { - mu sync.RWMutex + mu sync.Mutex + + name string state State failureCount int - failureThreshold int + successCount int // successes in half-open + threshold int + successRequired int resetTimeout time.Duration lastFailure time.Time + halfOpenInFlight bool // true while a half-open probe is executing + onStateChange StateChangeFunc } -// NewCircuitBreaker creates a new circuit breaker. +// NewCircuitBreaker creates a circuit breaker. The two positional args +// (threshold, resetTimeout) are kept for backward compatibility with existing +// callers. Use NewCircuitBreakerWithOpts for full configuration. func NewCircuitBreaker(threshold int, resetTimeout time.Duration) *CircuitBreaker { + return NewCircuitBreakerWithOpts(CircuitBreakerOpts{ + Threshold: threshold, + ResetTimeout: resetTimeout, + }) +} + +// NewCircuitBreakerWithOpts creates a circuit breaker with full options. +func NewCircuitBreakerWithOpts(opts CircuitBreakerOpts) *CircuitBreaker { + if opts.Threshold <= 0 { + opts.Threshold = 5 + } + if opts.ResetTimeout <= 0 { + opts.ResetTimeout = 30 * time.Second + } + if opts.SuccessRequired <= 0 { + opts.SuccessRequired = 1 + } return &CircuitBreaker{ - state: StateClosed, - failureThreshold: threshold, - resetTimeout: resetTimeout, + name: opts.Name, + state: StateClosed, + threshold: opts.Threshold, + successRequired: opts.SuccessRequired, + resetTimeout: opts.ResetTimeout, + onStateChange: opts.OnStateChange, } } @@ -58,53 +117,134 @@ func (cb *CircuitBreaker) Execute(fn func() error) error { } func (cb *CircuitBreaker) allowRequest() bool { - cb.mu.RLock() - defer cb.mu.RUnlock() - - if cb.state == StateClosed { - return true + cb.mu.Lock() + var cbFunc StateChangeFunc + var name string + var from, to State + var changed bool + allowed := false + + switch cb.state { + case StateClosed: + allowed = true + case StateOpen: + if time.Since(cb.lastFailure) <= cb.resetTimeout { + break + } + // Transition to half-open; only allow one probe at a time. + if cb.halfOpenInFlight { + break + } + cbFunc, name, from, to, changed = cb.transitionLocked(StateHalfOpen) + cb.halfOpenInFlight = true + cb.successCount = 0 + allowed = true + case StateHalfOpen: + // Allow additional requests only if no probe is in flight. + if cb.halfOpenInFlight { + break + } + cb.halfOpenInFlight = true + allowed = true } + cb.mu.Unlock() - if cb.state == StateOpen { - if time.Since(cb.lastFailure) > cb.resetTimeout { - return true // Transition to half-open (implied by letting one request through) - } - return false + if changed && cbFunc != nil { + cbFunc(name, from, to) } - return true // Half-open + return allowed } func (cb *CircuitBreaker) recordFailure() { cb.mu.Lock() - defer cb.mu.Unlock() + var cbFunc StateChangeFunc + var name string + var from, to State + var changed bool + cb.halfOpenInFlight = false cb.failureCount++ cb.lastFailure = time.Now() - if cb.state == StateClosed && cb.failureCount >= cb.failureThreshold { - cb.state = StateOpen - } else if cb.state == StateHalfOpen { - cb.state = StateOpen + switch cb.state { + case StateClosed: + if cb.failureCount >= cb.threshold { + cbFunc, name, from, to, changed = cb.transitionLocked(StateOpen) + } + case StateHalfOpen: + // Probe failed — go back to open. 
+ cbFunc, name, from, to, changed = cb.transitionLocked(StateOpen) + } + cb.mu.Unlock() + + if changed && cbFunc != nil { + cbFunc(name, from, to) } } func (cb *CircuitBreaker) recordSuccess() { cb.mu.Lock() - defer cb.mu.Unlock() + var cbFunc StateChangeFunc + var name string + var from, to State + var changed bool + + cb.halfOpenInFlight = false + + switch cb.state { + case StateHalfOpen: + cb.successCount++ + if cb.successCount >= cb.successRequired { + cb.failureCount = 0 + cb.successCount = 0 + cbFunc, name, from, to, changed = cb.transitionLocked(StateClosed) + } + default: + cb.failureCount = 0 + cb.state = StateClosed + } + cb.mu.Unlock() - cb.failureCount = 0 - cb.state = StateClosed + if changed && cbFunc != nil { + cbFunc(name, from, to) + } +} + +// transitionLocked changes state and fires the callback. Must be called +// with cb.mu held. The callback is invoked synchronously; implementations +// must not block or acquire cb.mu. +func (cb *CircuitBreaker) transitionLocked(to State) (StateChangeFunc, string, State, State, bool) { + from := cb.state + if from == to { + return nil, "", from, to, false + } + cb.state = to + return cb.onStateChange, cb.name, from, to, true } // Reset clears the circuit breaker state. func (cb *CircuitBreaker) Reset() { - cb.recordSuccess() + cb.mu.Lock() + cbFunc, name, from, to, changed := cb.transitionLocked(StateClosed) + cb.failureCount = 0 + cb.successCount = 0 + cb.halfOpenInFlight = false + cb.mu.Unlock() + + if changed && cbFunc != nil { + cbFunc(name, from, to) + } } // GetState returns the current state of the circuit breaker. func (cb *CircuitBreaker) GetState() State { - cb.mu.RLock() - defer cb.mu.RUnlock() + cb.mu.Lock() + defer cb.mu.Unlock() return cb.state } + +// Name returns the configured name of this circuit breaker. +func (cb *CircuitBreaker) Name() string { + return cb.name +} From 7ec9755a78d56d8fb2f8fd62327f7a5b081389b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:33:40 +0300 Subject: [PATCH 09/29] feat(platform): add ResilientCompute wrapper with circuit breaker and bulkhead Wrap ComputeBackend with circuit breaker and bulkhead patterns for resilience. Includes timeout-aware call protection and comprehensive test coverage for circuit tripping, bulkhead limits, and timeouts. --- internal/platform/resilient_compute.go | 307 ++++++++++++++++++ internal/platform/resilient_compute_test.go | 339 ++++++++++++++++++++ 2 files changed, 646 insertions(+) create mode 100644 internal/platform/resilient_compute.go create mode 100644 internal/platform/resilient_compute_test.go diff --git a/internal/platform/resilient_compute.go b/internal/platform/resilient_compute.go new file mode 100644 index 000000000..8c6b9c86f --- /dev/null +++ b/internal/platform/resilient_compute.go @@ -0,0 +1,307 @@ +package platform + +import ( + "context" + "fmt" + "io" + "log/slog" + "time" + + "github.com/poyrazk/thecloud/internal/core/ports" +) + +// ResilientComputeOpts configures the resilient compute wrapper. +type ResilientComputeOpts struct { + // CallTimeout is the per-call context timeout for normal operations. + // Default: 2 minutes. + CallTimeout time.Duration + // LongCallTimeout is the timeout for operations that are expected to take + // longer (e.g., LaunchInstanceWithOptions, RunTask). Default: 10 minutes. + LongCallTimeout time.Duration + // CBThreshold is the number of consecutive failures before the circuit + // opens. Default: 5. 
+ CBThreshold int + // CBResetTimeout is how long the circuit stays open before attempting a + // half-open probe. Default: 30s. + CBResetTimeout time.Duration + // BulkheadMaxConc is the max concurrent calls to the backend. Default: 20. + BulkheadMaxConc int + // BulkheadWait is how long to wait for a bulkhead slot. Default: 10s. + BulkheadWait time.Duration +} + +func (o ResilientComputeOpts) withDefaults() ResilientComputeOpts { + if o.CallTimeout <= 0 { + o.CallTimeout = 2 * time.Minute + } + if o.LongCallTimeout <= 0 { + o.LongCallTimeout = 10 * time.Minute + } + if o.CBThreshold <= 0 { + o.CBThreshold = 5 + } + if o.CBResetTimeout <= 0 { + o.CBResetTimeout = 30 * time.Second + } + if o.BulkheadMaxConc <= 0 { + o.BulkheadMaxConc = 20 + } + if o.BulkheadWait <= 0 { + o.BulkheadWait = 10 * time.Second + } + return o +} + +// ResilientCompute wraps a ComputeBackend with circuit breaker, bulkhead, +// and per-call timeouts. It implements the ports.ComputeBackend interface. +type ResilientCompute struct { + inner ports.ComputeBackend + cb *CircuitBreaker + bulkhead *Bulkhead + logger *slog.Logger + opts ResilientComputeOpts +} + +// NewResilientCompute decorates inner with resilience primitives. +func NewResilientCompute(inner ports.ComputeBackend, logger *slog.Logger, opts ResilientComputeOpts) *ResilientCompute { + opts = opts.withDefaults() + name := fmt.Sprintf("compute-%s", inner.Type()) + + cb := NewCircuitBreakerWithOpts(CircuitBreakerOpts{ + Name: name, + Threshold: opts.CBThreshold, + ResetTimeout: opts.CBResetTimeout, + SuccessRequired: 2, + OnStateChange: func(n string, from, to State) { + logger.Warn("circuit breaker state change", + "breaker", n, "from", from.String(), "to", to.String()) + }, + }) + + bh := NewBulkhead(BulkheadOpts{ + Name: name, + MaxConc: opts.BulkheadMaxConc, + WaitTimeout: opts.BulkheadWait, + }) + + return &ResilientCompute{ + inner: inner, + cb: cb, + bulkhead: bh, + logger: logger.With("adapter", name), + opts: opts, + } +} + +// ---------- helpers ---------- + +// callProtected runs fn through bulkhead → circuit breaker → timeout. 
+func (r *ResilientCompute) callProtected(ctx context.Context, timeout time.Duration, fn func(ctx context.Context) error) error { + return r.bulkhead.Execute(ctx, func() error { + return r.cb.Execute(func() error { + ctx2, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + return fn(ctx2) + }) + }) +} + +// ---------- Instance Lifecycle ---------- + +func (r *ResilientCompute) LaunchInstanceWithOptions(ctx context.Context, opts ports.CreateInstanceOptions) (string, []string, error) { + var id string + var ps []string + err := r.callProtected(ctx, r.opts.LongCallTimeout, func(ctx context.Context) error { + var e error + id, ps, e = r.inner.LaunchInstanceWithOptions(ctx, opts) + return e + }) + return id, ps, err +} + +func (r *ResilientCompute) StartInstance(ctx context.Context, id string) error { + return r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + return r.inner.StartInstance(ctx, id) + }) +} + +func (r *ResilientCompute) StopInstance(ctx context.Context, id string) error { + return r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + return r.inner.StopInstance(ctx, id) + }) +} + +func (r *ResilientCompute) PauseInstance(ctx context.Context, id string) error { + return r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + return r.inner.PauseInstance(ctx, id) + }) +} + +func (r *ResilientCompute) ResumeInstance(ctx context.Context, id string) error { + return r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + return r.inner.ResumeInstance(ctx, id) + }) +} + +func (r *ResilientCompute) DeleteInstance(ctx context.Context, id string) error { + return r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + return r.inner.DeleteInstance(ctx, id) + }) +} + +func (r *ResilientCompute) ResizeInstance(ctx context.Context, id string, cpuNano, memoryBytes int64) error { + return r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + return r.inner.ResizeInstance(ctx, id, cpuNano, memoryBytes) + }) +} + +func (r *ResilientCompute) GetInstanceLogs(ctx context.Context, id string) (io.ReadCloser, error) { + var rc io.ReadCloser + err := r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + var e error + rc, e = r.inner.GetInstanceLogs(ctx, id) + return e + }) + return rc, err +} + +func (r *ResilientCompute) GetInstanceStats(ctx context.Context, id string) (io.ReadCloser, error) { + var rc io.ReadCloser + err := r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + var e error + rc, e = r.inner.GetInstanceStats(ctx, id) + return e + }) + return rc, err +} + +func (r *ResilientCompute) GetInstancePort(ctx context.Context, id string, internalPort string) (int, error) { + var port int + err := r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + var e error + port, e = r.inner.GetInstancePort(ctx, id, internalPort) + return e + }) + return port, err +} + +func (r *ResilientCompute) GetInstanceIP(ctx context.Context, id string) (string, error) { + var ip string + err := r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + var e error + ip, e = r.inner.GetInstanceIP(ctx, id) + return e + }) + return ip, err +} + +func (r *ResilientCompute) GetConsoleURL(ctx context.Context, id string) (string, error) { + var url string + err := r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + var e error + url, e = r.inner.GetConsoleURL(ctx, id) 
+ return e + }) + return url, err +} + +// ---------- Execution ---------- + +func (r *ResilientCompute) Exec(ctx context.Context, id string, cmd []string) (string, error) { + var out string + err := r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + var e error + out, e = r.inner.Exec(ctx, id, cmd) + return e + }) + return out, err +} + +func (r *ResilientCompute) RunTask(ctx context.Context, opts ports.RunTaskOptions) (string, []string, error) { + var id string + var ps []string + err := r.callProtected(ctx, r.opts.LongCallTimeout, func(ctx context.Context) error { + var e error + id, ps, e = r.inner.RunTask(ctx, opts) + return e + }) + return id, ps, err +} + +func (r *ResilientCompute) WaitTask(ctx context.Context, id string) (int64, error) { + var code int64 + err := r.callProtected(ctx, r.opts.LongCallTimeout, func(ctx context.Context) error { + var e error + code, e = r.inner.WaitTask(ctx, id) + return e + }) + return code, err +} + +// ---------- Network Management ---------- + +func (r *ResilientCompute) CreateNetwork(ctx context.Context, name string) (string, error) { + var id string + err := r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + var e error + id, e = r.inner.CreateNetwork(ctx, name) + return e + }) + return id, err +} + +func (r *ResilientCompute) DeleteNetwork(ctx context.Context, id string) error { + return r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + return r.inner.DeleteNetwork(ctx, id) + }) +} + +// ---------- Volume Attachment ---------- + +func (r *ResilientCompute) AttachVolume(ctx context.Context, id string, volumePath string) (string, string, error) { + var devPath, containerID string + err := r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + var e error + devPath, containerID, e = r.inner.AttachVolume(ctx, id, volumePath) + return e + }) + if err != nil { + return "", "", err + } + return devPath, containerID, nil +} + +func (r *ResilientCompute) DetachVolume(ctx context.Context, id string, volumePath string) (string, error) { + var containerID string + err := r.callProtected(ctx, r.opts.CallTimeout, func(ctx context.Context) error { + var e error + containerID, e = r.inner.DetachVolume(ctx, id, volumePath) + return e + }) + if err != nil { + return "", err + } + return containerID, nil +} + +// ---------- Health ---------- + +// Ping bypasses the bulkhead (low cost, used for health checks) but still +// goes through the circuit breaker so a broken backend trips the circuit. +func (r *ResilientCompute) Ping(ctx context.Context) error { + return r.cb.Execute(func() error { + ctx2, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + return r.inner.Ping(ctx2) + }) +} + +// Type delegates directly — no protection needed. +func (r *ResilientCompute) Type() string { + return r.inner.Type() +} + +// Unwrap returns the underlying ComputeBackend (useful for tests). 
+func (r *ResilientCompute) Unwrap() ports.ComputeBackend { + return r.inner +} diff --git a/internal/platform/resilient_compute_test.go b/internal/platform/resilient_compute_test.go new file mode 100644 index 000000000..32d64dbcb --- /dev/null +++ b/internal/platform/resilient_compute_test.go @@ -0,0 +1,339 @@ +package platform + +import ( + "context" + "errors" + "io" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/poyrazk/thecloud/internal/core/ports" + "log/slog" +) + +// ---------- mock compute backend ---------- + +type mockCompute struct { + callCount atomic.Int64 + delay time.Duration + err error +} + +func (m *mockCompute) wait(ctx context.Context) error { + if m.delay <= 0 { + return nil + } + select { + case <-time.After(m.delay): + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +func (m *mockCompute) LaunchInstanceWithOptions(ctx context.Context, _ ports.CreateInstanceOptions) (string, []string, error) { + m.callCount.Add(1) + if err := m.wait(ctx); err != nil { + return "", nil, err + } + return "inst-1", []string{"8080"}, m.err +} + +func (m *mockCompute) StartInstance(ctx context.Context, _ string) error { + m.callCount.Add(1) + if err := m.wait(ctx); err != nil { + return err + } + return m.err +} +func (m *mockCompute) StopInstance(_ context.Context, _ string) error { + m.callCount.Add(1) + return m.err +} +func (m *mockCompute) DeleteInstance(_ context.Context, _ string) error { + m.callCount.Add(1) + return m.err +} +func (m *mockCompute) GetInstanceLogs(_ context.Context, _ string) (io.ReadCloser, error) { + m.callCount.Add(1) + return io.NopCloser(strings.NewReader("logs")), m.err +} +func (m *mockCompute) GetInstanceStats(_ context.Context, _ string) (io.ReadCloser, error) { + m.callCount.Add(1) + return io.NopCloser(strings.NewReader("stats")), m.err +} +func (m *mockCompute) GetInstancePort(_ context.Context, _ string, _ string) (int, error) { + m.callCount.Add(1) + return 8080, m.err +} +func (m *mockCompute) GetInstanceIP(_ context.Context, _ string) (string, error) { + m.callCount.Add(1) + return "10.0.0.1", m.err +} +func (m *mockCompute) GetConsoleURL(_ context.Context, _ string) (string, error) { + m.callCount.Add(1) + return "https://console", m.err +} +func (m *mockCompute) Exec(_ context.Context, _ string, _ []string) (string, error) { + m.callCount.Add(1) + return "output", m.err +} +func (m *mockCompute) RunTask(_ context.Context, _ ports.RunTaskOptions) (string, []string, error) { + m.callCount.Add(1) + return "task-1", nil, m.err +} +func (m *mockCompute) WaitTask(_ context.Context, _ string) (int64, error) { + m.callCount.Add(1) + return 0, m.err +} +func (m *mockCompute) CreateNetwork(_ context.Context, _ string) (string, error) { + m.callCount.Add(1) + return "net-1", m.err +} +func (m *mockCompute) DeleteNetwork(_ context.Context, _ string) error { + m.callCount.Add(1) + return m.err +} +func (m *mockCompute) AttachVolume(_ context.Context, _ string, _ string) (string, string, error) { + m.callCount.Add(1) + return "/dev/vdb", "", m.err +} +func (m *mockCompute) DetachVolume(_ context.Context, _ string, _ string) (string, error) { + m.callCount.Add(1) + return "", m.err +} +func (m *mockCompute) Ping(_ context.Context) error { + m.callCount.Add(1) + return m.err +} +func (m *mockCompute) ResizeInstance(_ context.Context, _ string, _, _ int64) error { + m.callCount.Add(1) + return m.err +} +func (m *mockCompute) PauseInstance(_ context.Context, _ string) error { + m.callCount.Add(1) + return m.err +} +func (m 
*mockCompute) ResumeInstance(_ context.Context, _ string) error { + m.callCount.Add(1) + return m.err +} +func (m *mockCompute) Type() string { return "mock" } + +// ---------- tests ---------- + +func TestResilientComputePassthrough(t *testing.T) { + // All calls should pass through to the mock on success. + mock := &mockCompute{} + logger := slog.Default() + rc := NewResilientCompute(mock, logger, ResilientComputeOpts{}) + + ctx := context.Background() + + id, ps, err := rc.LaunchInstanceWithOptions(ctx, ports.CreateInstanceOptions{}) + assertNoErr(t, err) + if id != "inst-1" || len(ps) != 1 { + t.Fatalf("unexpected launch result: %s %v", id, ps) + } + + assertNoErr(t, rc.StartInstance(ctx, "x")) + assertNoErr(t, rc.StopInstance(ctx, "x")) + assertNoErr(t, rc.DeleteInstance(ctx, "x")) + + _, err = rc.GetInstanceLogs(ctx, "x") + assertNoErr(t, err) + _, err = rc.GetInstanceStats(ctx, "x") + assertNoErr(t, err) + port, err := rc.GetInstancePort(ctx, "x", "80") + assertNoErr(t, err) + if port != 8080 { + t.Fatalf("expected 8080, got %d", port) + } + ip, err := rc.GetInstanceIP(ctx, "x") + assertNoErr(t, err) + if ip != "10.0.0.1" { + t.Fatalf("expected 10.0.0.1, got %s", ip) + } + + out, err := rc.Exec(ctx, "x", []string{"ls"}) + assertNoErr(t, err) + if out != "output" { + t.Fatalf("expected output, got %s", out) + } + + assertNoErr(t, rc.Ping(ctx)) + if rc.Type() != "mock" { + t.Fatalf("expected mock, got %s", rc.Type()) + } + + if mock.callCount.Load() < 10 { + t.Fatalf("expected at least 10 calls, got %d", mock.callCount.Load()) + } +} + +func TestResilientComputeCircuitTrips(t *testing.T) { + // After threshold failures, the circuit should open and reject immediately. + mock := &mockCompute{err: errors.New("backend down")} + logger := slog.Default() + rc := NewResilientCompute(mock, logger, ResilientComputeOpts{ + CBThreshold: 3, + CBResetTimeout: 5 * time.Second, + }) + + ctx := context.Background() + + // 3 failures to trip the circuit. + for i := 0; i < 3; i++ { + err := rc.StartInstance(ctx, "x") + if err == nil { + t.Fatal("expected error") + } + } + + // Next call should get ErrCircuitOpen without hitting the mock. + callsBefore := mock.callCount.Load() + err := rc.StartInstance(ctx, "x") + if !errors.Is(err, ErrCircuitOpen) { + t.Fatalf("expected ErrCircuitOpen, got %v", err) + } + if mock.callCount.Load() != callsBefore { + t.Fatal("expected mock not to be called when circuit is open") + } +} + +func TestResilientComputeBulkheadLimits(t *testing.T) { + // When bulkhead is full, calls should be rejected. + mock := &mockCompute{delay: 500 * time.Millisecond} + logger := slog.Default() + rc := NewResilientCompute(mock, logger, ResilientComputeOpts{ + BulkheadMaxConc: 2, + BulkheadWait: 50 * time.Millisecond, + CallTimeout: 2 * time.Second, + }) + + ctx := context.Background() + var wg sync.WaitGroup + var bulkheadErrors atomic.Int64 + + // Ensure the first 2 goroutines grab the slots before the rest start. + ready := make(chan struct{}) + for i := 0; i < 5; i++ { + wg.Add(1) + go func(idx int) { + defer wg.Done() + if idx >= 2 { + <-ready // Wait until the first 2 have started. + } + err := rc.StartInstance(ctx, "x") + if errors.Is(err, ErrBulkheadFull) { + bulkheadErrors.Add(1) + } + }(i) + } + // Give the first 2 goroutines time to acquire the slots. 
+ time.Sleep(50 * time.Millisecond) + close(ready) + wg.Wait() + + if bulkheadErrors.Load() == 0 { + t.Fatal("expected at least one bulkhead rejection") + } +} + +func TestResilientComputeTimeout(t *testing.T) { + // A slow backend should be cancelled by the per-call timeout. + mock := &mockCompute{delay: 5 * time.Second} + logger := slog.Default() + rc := NewResilientCompute(mock, logger, ResilientComputeOpts{ + CallTimeout: 100 * time.Millisecond, + }) + + ctx := context.Background() + start := time.Now() + err := rc.StartInstance(ctx, "x") + elapsed := time.Since(start) + + if err == nil { + t.Fatal("expected timeout error") + } + // Should complete much faster than 5s. + if elapsed > 2*time.Second { + t.Fatalf("timeout not enforced, took %v", elapsed) + } +} + +func TestResilientComputeUnwrap(t *testing.T) { + mock := &mockCompute{} + rc := NewResilientCompute(mock, slog.Default(), ResilientComputeOpts{}) + if rc.Unwrap() != mock { + t.Fatal("Unwrap should return the inner backend") + } +} + +func TestResilientComputePingBypassesBulkhead(t *testing.T) { + // Ping should work even when the bulkhead is completely full. + mock := &mockCompute{delay: 500 * time.Millisecond} + logger := slog.Default() + rc := NewResilientCompute(mock, logger, ResilientComputeOpts{ + BulkheadMaxConc: 1, + BulkheadWait: 10 * time.Millisecond, + }) + + ctx := context.Background() + + // Saturate the bulkhead. + started := make(chan struct{}) + go func() { + close(started) + _ = rc.StartInstance(ctx, "x") + }() + <-started + time.Sleep(20 * time.Millisecond) + + // Ping should still work (bypasses bulkhead). + err := rc.Ping(ctx) + // err may or may not be nil depending on timing, but it must NOT be ErrBulkheadFull. + if errors.Is(err, ErrBulkheadFull) { + t.Fatal("Ping should bypass bulkhead") + } +} + +func TestResilientComputeResizeInstance(t *testing.T) { + t.Run("Success", func(t *testing.T) { + mock := &mockCompute{} + rc := NewResilientCompute(mock, slog.Default(), ResilientComputeOpts{}) + + err := rc.ResizeInstance(context.Background(), "inst-1", int64(4*1e9), int64(4096*1024*1024)) + if err != nil { + t.Fatalf("expected nil error, got %v", err) + } + if mock.callCount.Load() < 1 { + t.Fatalf("expected at least 1 call, got %d", mock.callCount.Load()) + } + }) + + t.Run("Error", func(t *testing.T) { + mock := &mockCompute{err: errors.New("resize failed")} + rc := NewResilientCompute(mock, slog.Default(), ResilientComputeOpts{}) + + err := rc.ResizeInstance(context.Background(), "inst-1", int64(4*1e9), int64(4096*1024*1024)) + if err == nil { + t.Fatal("expected error") + } + if mock.callCount.Load() < 1 { + t.Fatalf("expected at least 1 call, got %d", mock.callCount.Load()) + } + }) +} + +// ---------- test helpers ---------- + +func assertNoErr(t *testing.T, err error) { + t.Helper() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +} From 9c3a66541c739bc2f2eb5f8fd4a6c72034680a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:33:46 +0300 Subject: [PATCH 10/29] test(mocks): add PauseInstance and ResumeInstance to K8s and libvirt mocks Update mock implementations to satisfy the updated ComputeBackend interface with pause/resume methods. 
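As a minimal sketch (type name is illustrative; the concrete edits are in the diff below), each test double only needs a pair of no-op stubs to keep satisfying the interface:

```go
// Illustrative only: no-op stubs that keep a test mock satisfying
// ports.ComputeBackend after PauseInstance/ResumeInstance were added.
func (m *mockCompute) PauseInstance(ctx context.Context, id string) error  { return nil }
func (m *mockCompute) ResumeInstance(ctx context.Context, id string) error { return nil }
```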
--- internal/repositories/k8s/kubeadm_provisioner_test.go | 6 ++++++ internal/repositories/k8s/mocks_test.go | 2 ++ internal/repositories/libvirt/lb_proxy_test.go | 9 +++++++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/internal/repositories/k8s/kubeadm_provisioner_test.go b/internal/repositories/k8s/kubeadm_provisioner_test.go index 52d27f1cd..25fa4521d 100644 --- a/internal/repositories/k8s/kubeadm_provisioner_test.go +++ b/internal/repositories/k8s/kubeadm_provisioner_test.go @@ -49,6 +49,12 @@ func (m *MockInstanceService) StartInstance(ctx context.Context, id string) erro func (m *MockInstanceService) StopInstance(ctx context.Context, id string) error { return nil } +func (m *MockInstanceService) PauseInstance(ctx context.Context, id string) error { + return nil +} +func (m *MockInstanceService) ResumeInstance(ctx context.Context, id string) error { + return nil +} func (m *MockInstanceService) ListInstances(ctx context.Context) ([]*domain.Instance, error) { args := m.Called(ctx) if args.Get(0) == nil { diff --git a/internal/repositories/k8s/mocks_test.go b/internal/repositories/k8s/mocks_test.go index 5a52a32ee..30e1372a0 100644 --- a/internal/repositories/k8s/mocks_test.go +++ b/internal/repositories/k8s/mocks_test.go @@ -32,6 +32,8 @@ func (m *mockInstanceService) LaunchInstanceWithOptions(ctx context.Context, opt } func (m *mockInstanceService) StartInstance(ctx context.Context, idOrName string) error { return nil } func (m *mockInstanceService) StopInstance(ctx context.Context, idOrName string) error { return nil } +func (m *mockInstanceService) PauseInstance(ctx context.Context, idOrName string) error { return nil } +func (m *mockInstanceService) ResumeInstance(ctx context.Context, idOrName string) error { return nil } func (m *mockInstanceService) ListInstances(ctx context.Context) ([]*domain.Instance, error) { args := m.Called(ctx) if args.Get(0) == nil { diff --git a/internal/repositories/libvirt/lb_proxy_test.go b/internal/repositories/libvirt/lb_proxy_test.go index dec2fa5c2..eaa4ad48c 100644 --- a/internal/repositories/libvirt/lb_proxy_test.go +++ b/internal/repositories/libvirt/lb_proxy_test.go @@ -26,6 +26,8 @@ func (m *mockCompute) LaunchInstanceWithOptions(ctx context.Context, opts ports. 
} func (m *mockCompute) StartInstance(ctx context.Context, id string) error { return nil } func (m *mockCompute) StopInstance(ctx context.Context, id string) error { return nil } +func (m *mockCompute) PauseInstance(ctx context.Context, id string) error { return nil } +func (m *mockCompute) ResumeInstance(ctx context.Context, id string) error { return nil } func (m *mockCompute) DeleteInstance(ctx context.Context, id string) error { return nil } func (m *mockCompute) GetInstanceLogs(ctx context.Context, id string) (io.ReadCloser, error) { return io.NopCloser(strings.NewReader("")), nil @@ -56,8 +58,11 @@ func (m *mockCompute) AttachVolume(ctx context.Context, id string, volumePath st func (m *mockCompute) DetachVolume(ctx context.Context, id string, volumePath string) (string, error) { return "", nil } -func (m *mockCompute) Ping(ctx context.Context) error { return nil } -func (m *mockCompute) Type() string { return "mock" } +func (m *mockCompute) Ping(ctx context.Context) error { return nil } +func (m *mockCompute) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { + return nil +} +func (m *mockCompute) Type() string { return "mock" } func TestLBProxyAdapter(t *testing.T) { mc := new(mockCompute) From 308184aa510170079614c2f335a726f30d5d86b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:33:52 +0300 Subject: [PATCH 11/29] test(workers): add missing interface methods to worker test mocks Update mock compute backends in worker tests to implement PauseInstance, ResumeInstance, and ResizeInstance to satisfy the updated ports.ComputeBackend interface. --- internal/workers/database_failover_worker_test.go | 9 +++++++++ internal/workers/healing_worker_test.go | 6 ++++++ internal/workers/pipeline_worker_test.go | 9 +++++++++ 3 files changed, 24 insertions(+) diff --git a/internal/workers/database_failover_worker_test.go b/internal/workers/database_failover_worker_test.go index 3cdc02b87..c98998207 100644 --- a/internal/workers/database_failover_worker_test.go +++ b/internal/workers/database_failover_worker_test.go @@ -207,6 +207,15 @@ func (m *mockComputeBackend) DetachVolume(ctx context.Context, id string, volume func (m *mockComputeBackend) Ping(ctx context.Context) error { return m.Called(ctx).Error(0) } +func (m *mockComputeBackend) PauseInstance(ctx context.Context, id string) error { + return m.Called(ctx, id).Error(0) +} +func (m *mockComputeBackend) ResumeInstance(ctx context.Context, id string) error { + return m.Called(ctx, id).Error(0) +} +func (m *mockComputeBackend) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { + return m.Called(ctx, id, cpu, memory).Error(0) +} func (m *mockComputeBackend) Type() string { return "mock" } diff --git a/internal/workers/healing_worker_test.go b/internal/workers/healing_worker_test.go index 4e6ca4fab..3a2668daf 100644 --- a/internal/workers/healing_worker_test.go +++ b/internal/workers/healing_worker_test.go @@ -137,6 +137,12 @@ func (m *mockInstanceSvc) Exec(ctx context.Context, idOrName string, cmd []strin func (m *mockInstanceSvc) UpdateInstanceMetadata(ctx context.Context, id uuid.UUID, metadata, labels map[string]string) error { return m.Called(ctx, id, metadata, labels).Error(0) } +func (m *mockInstanceSvc) PauseInstance(ctx context.Context, idOrName string) error { + return m.Called(ctx, idOrName).Error(0) +} +func (m *mockInstanceSvc) ResumeInstance(ctx context.Context, idOrName string) error { 
+ return m.Called(ctx, idOrName).Error(0) +} func TestHealingWorker(t *testing.T) { t.Parallel() diff --git a/internal/workers/pipeline_worker_test.go b/internal/workers/pipeline_worker_test.go index 7a35db571..57bb9adc6 100644 --- a/internal/workers/pipeline_worker_test.go +++ b/internal/workers/pipeline_worker_test.go @@ -158,6 +158,15 @@ func (m *mockComputeBackendExtended) DetachVolume(ctx context.Context, id, volum func (m *mockComputeBackendExtended) Ping(ctx context.Context) error { return nil } +func (m *mockComputeBackendExtended) PauseInstance(ctx context.Context, id string) error { + return nil +} +func (m *mockComputeBackendExtended) ResumeInstance(ctx context.Context, id string) error { + return nil +} +func (m *mockComputeBackendExtended) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { + return nil +} func TestPipelineWorker_processJob(t *testing.T) { repo := new(mockPipelineRepo) From b11d04d9e485119eb2df5ef9e29bb3708d33e974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Tue, 28 Apr 2026 17:32:06 +0300 Subject: [PATCH 12/29] fix(code-review): pre-compile regexes, add ErrNotSupported, and add pause/resume tests - Move regex compilation in applyDomainResize to struct fields to avoid recompilation on every call (Must Fix from PR #320 review) - Add ErrNotSupported sentinel error in Firecracker adapter and return it from PauseInstance/ResumeInstance/ResizeInstance stubs (Consider item) - Add unit tests for pause/resume state transitions in instance service covering: success paths, wrong state errors, and compute backend errors --- internal/core/services/instance_unit_test.go | 137 +++++++++++++++++-- internal/repositories/firecracker/adapter.go | 10 +- internal/repositories/libvirt/adapter.go | 17 ++- 3 files changed, 145 insertions(+), 19 deletions(-) diff --git a/internal/core/services/instance_unit_test.go b/internal/core/services/instance_unit_test.go index 316ce31cf..dae8c9873 100644 --- a/internal/core/services/instance_unit_test.go +++ b/internal/core/services/instance_unit_test.go @@ -81,6 +81,7 @@ func TestInstanceService_Unit(t *testing.T) { t.Run("ProvisionFinalize", testInstanceServiceProvisionFinalize) t.Run("Terminate", testInstanceServiceTerminateUnit) t.Run("VolumeRelease", testInstanceServiceVolumeReleaseUnit) + t.Run("PauseResume", testInstanceServicePauseResumeUnit) t.Run("RBACErrors", testInstanceServiceUnitRbacErrors) t.Run("RepoErrors", testInstanceServiceUnitRepoErrors) } @@ -299,7 +300,7 @@ func testInstanceServiceLifecycleUnit(t *testing.T) { InstanceType: "t2.micro", } typeRepo.On("GetByID", mock.Anything, "t2.micro").Return(&domain.InstanceType{VCPUs: 1, MemoryMB: 1024}, nil).Maybe() - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Once() + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceTerminate, instanceID.String()).Return(nil).Once() compute.On("DeleteInstance", mock.Anything, "cid-1").Return(nil).Once() @@ -340,7 +341,7 @@ func testInstanceServiceExecUnit(t *testing.T) { t.Run("NotRunning", func(t *testing.T) { inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusStopped, ContainerID: ""} - repo.On("GetByName", mock.Anything, 
instanceID.String()).Return(nil, fmt.Errorf("not found")).Once() + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() @@ -351,7 +352,7 @@ func testInstanceServiceExecUnit(t *testing.T) { t.Run("BackendError", func(t *testing.T) { inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Once() + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() compute.On("Exec", mock.Anything, "cid-1", []string{"ls"}).Return("", errors.New("exec failed")).Once() @@ -671,7 +672,7 @@ func testInstanceServiceTerminateUnit(t *testing.T) { VpcID: &vpcID, } - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Once() + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceTerminate, instanceID.String()).Return(nil).Once() compute.On("GetInstanceLogs", mock.Anything, "cid-1").Return(io.NopCloser(strings.NewReader("log line 1\nlog line 2\n")), nil).Once() @@ -726,7 +727,7 @@ func testInstanceServiceTerminateUnit(t *testing.T) { ContainerID: "cid-1", } - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Once() + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceTerminate, instanceID.String()).Return(nil).Once() compute.On("DeleteInstance", mock.Anything, "cid-1").Return(fmt.Errorf("docker error")).Once() @@ -776,7 +777,7 @@ func testInstanceServiceTerminateUnit(t *testing.T) { InstanceType: "unknown-type", } - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Once() + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceTerminate, instanceID.String()).Return(nil).Once() compute.On("DeleteInstance", mock.Anything, "cid-1").Return(nil).Once() @@ -831,7 +832,7 @@ func testInstanceServiceTerminateUnit(t *testing.T) { InstanceType: "t2.micro", } - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Once() + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceTerminate, instanceID.String()).Return(nil).Once() compute.On("DeleteInstance", mock.Anything, "cid-1").Return(nil).Once() @@ -891,7 +892,7 
@@ func testInstanceServiceVolumeReleaseUnit(t *testing.T) { InstanceType: "t2.micro", } - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Once() + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceTerminate, instanceID.String()).Return(nil).Once() compute.On("DeleteInstance", mock.Anything, "cid-1").Return(nil).Once() @@ -951,7 +952,7 @@ func testInstanceServiceVolumeReleaseUnit(t *testing.T) { InstanceType: "t2.micro", } - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Once() + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceTerminate, instanceID.String()).Return(nil).Once() compute.On("DeleteInstance", mock.Anything, "cid-1").Return(nil).Once() @@ -977,6 +978,124 @@ func testInstanceServiceVolumeReleaseUnit(t *testing.T) { }) } +func testInstanceServicePauseResumeUnit(t *testing.T) { + repo := new(MockInstanceRepo) + compute := new(MockComputeBackend) + rbacSvc := new(MockRBACService) + auditSvc := new(MockAuditService) + + svc := services.NewInstanceService(services.InstanceServiceParams{ + Repo: repo, + Compute: compute, + RBAC: rbacSvc, + AuditSvc: auditSvc, + Logger: slog.Default(), + }) + + ctx := context.Background() + instanceID := uuid.New() + userID := uuid.New() + tenantID := uuid.New() + ctx = appcontext.WithUserID(ctx, userID) + ctx = appcontext.WithTenantID(ctx, tenantID) + + t.Run("PauseInstance_Success", func(t *testing.T) { + inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("PauseInstance", mock.Anything, "cid-1").Return(nil).Once() + compute.On("Type").Return("docker").Maybe() + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { + return i.Status == domain.StatusPaused + })).Return(nil).Once() + auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Once() + + err := svc.PauseInstance(ctx, instanceID.String()) + require.NoError(t, err) + assert.Equal(t, domain.StatusPaused, inst.Status) + mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc, auditSvc) + }) + + t.Run("PauseInstance_WrongState", func(t *testing.T) { + inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"} + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + 
rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + + err := svc.PauseInstance(ctx, instanceID.String()) + require.Error(t, err) + assert.Contains(t, err.Error(), "must be RUNNING to pause") + mock.AssertExpectationsForObjects(t, repo, rbacSvc) + }) + + t.Run("PauseInstance_ComputeError", func(t *testing.T) { + inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("PauseInstance", mock.Anything, "cid-1").Return(fmt.Errorf("pause failed")).Once() + compute.On("Type").Return("docker").Maybe() + auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() + + err := svc.PauseInstance(ctx, instanceID.String()) + require.Error(t, err) + assert.Contains(t, err.Error(), "pause failed") + mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc) + }) + + t.Run("ResumeInstance_Success", func(t *testing.T) { + inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"} + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("ResumeInstance", mock.Anything, "cid-1").Return(nil).Once() + compute.On("Type").Return("docker").Maybe() + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { + return i.Status == domain.StatusRunning + })).Return(nil).Once() + auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Once() + + err := svc.ResumeInstance(ctx, instanceID.String()) + require.NoError(t, err) + assert.Equal(t, domain.StatusRunning, inst.Status) + mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc, auditSvc) + }) + + t.Run("ResumeInstance_WrongState", func(t *testing.T) { + inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + + err := svc.ResumeInstance(ctx, instanceID.String()) + require.Error(t, err) + assert.Contains(t, err.Error(), "must be PAUSED to resume") + mock.AssertExpectationsForObjects(t, repo, rbacSvc) + }) + + t.Run("ResumeInstance_ComputeError", func(t 
*testing.T) {
+		inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"}
+		repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe()
+		repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once()
+		rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once()
+		rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once()
+		compute.On("ResumeInstance", mock.Anything, "cid-1").Return(fmt.Errorf("resume failed")).Once()
+		compute.On("Type").Return("docker").Maybe()
+		auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe()
+
+		err := svc.ResumeInstance(ctx, instanceID.String())
+		require.Error(t, err)
+		assert.Contains(t, err.Error(), "resume failed")
+		mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc)
+	})
+}
+
 func testInstanceServiceUnitRbacErrors(t *testing.T) {
 	repo := new(MockInstanceRepo)
 	vpcRepo := new(MockVpcRepo)
diff --git a/internal/repositories/firecracker/adapter.go b/internal/repositories/firecracker/adapter.go
index 197e25350..42ca61595 100644
--- a/internal/repositories/firecracker/adapter.go
+++ b/internal/repositories/firecracker/adapter.go
@@ -4,6 +4,7 @@ package firecracker
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"io"
 	"log/slog"
@@ -23,7 +24,8 @@ const (
 )
 
 var (
-	idRegex = regexp.MustCompile(`^[a-zA-Z0-9\-_]+$`)
+	idRegex         = regexp.MustCompile(`^[a-zA-Z0-9\-_]+$`)
+	ErrNotSupported = errors.New("operation not supported on this backend")
 )
 
 // Config holds Firecracker specific configuration.
@@ -153,21 +155,21 @@ func (a *FirecrackerAdapter) PauseInstance(ctx context.Context, id string) error
 	if a.cfg.MockMode {
 		return nil
 	}
-	return nil // Firecracker does not support pause/resume
+	return ErrNotSupported
 }
 
 func (a *FirecrackerAdapter) ResumeInstance(ctx context.Context, id string) error {
 	if a.cfg.MockMode {
 		return nil
 	}
-	return nil // Firecracker does not support pause/resume
+	return ErrNotSupported
 }
 
 func (a *FirecrackerAdapter) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error {
 	if a.cfg.MockMode {
 		return nil
 	}
-	return nil // Firecracker does not support resize
+	return ErrNotSupported
 }
 
 func (a *FirecrackerAdapter) DeleteInstance(ctx context.Context, id string) error {
diff --git a/internal/repositories/libvirt/adapter.go b/internal/repositories/libvirt/adapter.go
index 251733d3f..498cfdb7a 100644
--- a/internal/repositories/libvirt/adapter.go
+++ b/internal/repositories/libvirt/adapter.go
@@ -61,6 +61,11 @@ type LibvirtAdapter struct {
 	ipWaitInterval   time.Duration
 	taskWaitInterval time.Duration
 
+	// Pre-compiled regexes for domain XML modification
+	reMemory        *regexp.Regexp
+	reCurrentMemory *regexp.Regexp
+	reVCPU          *regexp.Regexp
+
 	// OS dependencies for testability
 	execCommand        func(name string, arg ...string) *exec.Cmd
 	execCommandContext func(ctx context.Context, name string, arg ...string) *exec.Cmd
@@ -122,6 +127,9 @@ func NewLibvirtAdapter(logger *slog.Logger, uri string) (*LibvirtAdapter, error)
 		poolEnd:          net.ParseIP("192.168.200.255"),
 		ipWaitInterval:   5 * time.Second,
 		taskWaitInterval: 2 * time.Second,
+		reMemory:         regexp.MustCompile(`<memory unit='KiB'>\d+`),
+		reCurrentMemory:  regexp.MustCompile(`<currentMemory unit='KiB'>\d+`),
+		reVCPU:           regexp.MustCompile(`<vcpu[^>]*>\d+`),
 		execCommand:        exec.Command,
 		execCommandContext: exec.CommandContext,
 		lookPath:           exec.LookPath,
@@ -241,13 +249,10 @@ func (a *LibvirtAdapter) ResizeInstance(ctx context.Context, id string, cpu, mem
 
 // applyDomainResize updates vCPU and memory in domain XML.
 func (a *LibvirtAdapter) applyDomainResize(xml string, memoryKiB, vcpus int) string {
 	// Replace memory allocation
-	xml = regexp.MustCompile(`<memory unit='KiB'>\d+`).
-		ReplaceAllString(xml, fmt.Sprintf(`<memory unit='KiB'>%d`, memoryKiB))
-	xml = regexp.MustCompile(`<currentMemory unit='KiB'>\d+`).
-		ReplaceAllString(xml, fmt.Sprintf(`<currentMemory unit='KiB'>%d`, memoryKiB))
+	xml = a.reMemory.ReplaceAllString(xml, fmt.Sprintf(`<memory unit='KiB'>%d`, memoryKiB))
+	xml = a.reCurrentMemory.ReplaceAllString(xml, fmt.Sprintf(`<currentMemory unit='KiB'>%d`, memoryKiB))
 	// Replace vCPU count
-	xml = regexp.MustCompile(`<vcpu[^>]*>\d+`).
-		ReplaceAllString(xml, fmt.Sprintf(`<vcpu>%d`, vcpus))
+	xml = a.reVCPU.ReplaceAllString(xml, fmt.Sprintf(`<vcpu>%d`, vcpus))
 	return xml
 }
From 8641b4093714ac3d643260721de86786b9aab6f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com>
Date: Tue, 28 Apr 2026 17:46:57 +0300
Subject: [PATCH 13/29] fix(libvirt): verify domain state before pause/resume

Add DomainGetState check to PauseInstance and ResumeInstance in the
libvirt adapter, consistent with existing patterns in other methods
(ResizeInstance, stopDomainIfRunning). This prevents silently succeeding
when the domain is already in the target state.
---
 internal/repositories/libvirt/adapter.go | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/internal/repositories/libvirt/adapter.go b/internal/repositories/libvirt/adapter.go
index 498cfdb7a..e46e64f9d 100644
--- a/internal/repositories/libvirt/adapter.go
+++ b/internal/repositories/libvirt/adapter.go
@@ -37,6 +37,7 @@ const (
 
 	// Domain states
 	domainStateRunning = 1
+	domainStatePaused  = 3
 	domainStateShutoff = 5
 
 	// Memory stat tags
@@ -178,6 +179,15 @@ func (a *LibvirtAdapter) PauseInstance(ctx context.Context, id string) error {
 	if err != nil {
 		return fmt.Errorf(errDomainNotFound, err)
 	}
+
+	state, _, err := a.client.DomainGetState(ctx, dom, 0)
+	if err != nil {
+		return fmt.Errorf("failed to get domain state: %w", err)
+	}
+	if state != domainStateRunning {
+		return fmt.Errorf("domain is not running (state=%d), cannot pause", state)
+	}
+
 	if err := a.client.DomainSuspend(ctx, dom); err != nil {
 		return fmt.Errorf("failed to suspend domain: %w", err)
 	}
@@ -191,6 +201,15 @@ func (a *LibvirtAdapter) ResumeInstance(ctx context.Context, id string) error {
 	if err != nil {
 		return fmt.Errorf(errDomainNotFound, err)
 	}
+
+	state, _, err := a.client.DomainGetState(ctx, dom, 0)
+	if err != nil {
+		return fmt.Errorf("failed to get domain state: %w", err)
+	}
+	if state != domainStatePaused {
+		return fmt.Errorf("domain is not paused (state=%d), cannot resume", state)
+	}
+
 	if err := a.client.DomainResume(ctx, dom); err != nil {
 		return fmt.Errorf("failed to resume domain: %w", err)
 	}
From 67f17eace4370527582d411b85e1138815f76808 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Poyraz=20K=C3=BC=C3%A7=C3%BCkarslan?= <83272398+PoyrazK@users.noreply.github.com>
Date: Tue, 28 Apr 2026 18:05:15 +0300
Subject: [PATCH 14/29] fix: improve bulkhead context cancellation and resume error logging

- Bulkhead.acquire now returns ctx.Err() when context is cancelled,
  instead of always returning ErrBulkheadFull, making the error type
  distinguishable from capacity exhaustion
- ResumeInstance error log now explicitly notes the instance is left in
  PAUSED state so operators can detect and recover from partial failures
---
 internal/core/services/instance.go | 3 ++-
internal/platform/bulkhead.go | 10 ++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/internal/core/services/instance.go b/internal/core/services/instance.go index 1795ddd1a..89f459dc8 100644 --- a/internal/core/services/instance.go +++ b/internal/core/services/instance.go @@ -688,7 +688,8 @@ func (s *InstanceService) ResumeInstance(ctx context.Context, idOrName string) e if err := s.compute.ResumeInstance(ctx, target); err != nil { platform.InstanceOperationsTotal.WithLabelValues("resume", "failure").Inc() - s.logger.Error("failed to resume container", "container_id", target, "error", err) + s.logger.Error("failed to resume container, instance left in PAUSED state", + "container_id", target, "instance_id", inst.ID, "error", err) return errors.Wrap(errors.Internal, "failed to resume container", err) } diff --git a/internal/platform/bulkhead.go b/internal/platform/bulkhead.go index 033f5ee8f..2ccf36cee 100644 --- a/internal/platform/bulkhead.go +++ b/internal/platform/bulkhead.go @@ -50,10 +50,8 @@ func (b *Bulkhead) Execute(ctx context.Context, fn func() error) error { } func (b *Bulkhead) acquire(ctx context.Context) error { - select { - case <-ctx.Done(): - return ErrBulkheadFull - default: + if ctx.Err() != nil { + return ctx.Err() } if b.timeout > 0 { @@ -65,7 +63,7 @@ func (b *Bulkhead) acquire(ctx context.Context) error { case <-timer.C: return ErrBulkheadFull case <-ctx.Done(): - return ErrBulkheadFull + return ctx.Err() } } // No explicit timeout — rely on context. @@ -73,7 +71,7 @@ func (b *Bulkhead) acquire(ctx context.Context) error { case b.sem <- struct{}{}: return nil case <-ctx.Done(): - return ErrBulkheadFull + return ctx.Err() } } From fbd671a9ae1b121cda9cbe487ea805defc0b4b78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Wed, 29 Apr 2026 18:38:54 +0300 Subject: [PATCH 15/29] Merge conflict resolution: fix duplicate methods and missing implementations - Remove duplicate ResizeInstance from firecracker/adapter.go (origin/main has correct version) - Remove ErrNotSupported from firecracker PauseInstance/ResumeInstance (no-op backend) - Ensure firecracker adapter_noop.go is correct --- internal/core/services/instance.go | 3 +-- internal/repositories/docker/fakes_test.go | 8 ++++++++ internal/repositories/firecracker/adapter.go | 15 +++------------ internal/repositories/firecracker/adapter_noop.go | 8 ++++---- internal/repositories/libvirt/lb_proxy_test.go | 2 ++ 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/internal/core/services/instance.go b/internal/core/services/instance.go index 1a6d45ab7..b6390426c 100644 --- a/internal/core/services/instance.go +++ b/internal/core/services/instance.go @@ -695,8 +695,7 @@ func (s *InstanceService) ResumeInstance(ctx context.Context, idOrName string) e if err := s.compute.ResumeInstance(ctx, target); err != nil { platform.InstanceOperationsTotal.WithLabelValues("resume", "failure").Inc() - s.logger.Error("failed to resume container, instance left in PAUSED state", - "container_id", target, "instance_id", inst.ID, "error", err) + s.logger.Error("failed to resume container", "container_id", target, "error", err) return errors.Wrap(errors.Internal, "failed to resume container", err) } diff --git a/internal/repositories/docker/fakes_test.go b/internal/repositories/docker/fakes_test.go index d0af77120..de7542470 100644 --- a/internal/repositories/docker/fakes_test.go +++ b/internal/repositories/docker/fakes_test.go @@ -98,6 
+98,14 @@ func (f *fakeDockerClient) ContainerStop(ctx context.Context, containerID string return f.stopErr } +func (f *fakeDockerClient) ContainerPause(ctx context.Context, containerID string) error { + return nil +} + +func (f *fakeDockerClient) ContainerUnpause(ctx context.Context, containerID string) error { + return nil +} + func (f *fakeDockerClient) ContainerRemove(ctx context.Context, containerID string, options container.RemoveOptions) error { return f.removeErr } diff --git a/internal/repositories/firecracker/adapter.go b/internal/repositories/firecracker/adapter.go index 08a257ae6..588e226fd 100644 --- a/internal/repositories/firecracker/adapter.go +++ b/internal/repositories/firecracker/adapter.go @@ -4,7 +4,6 @@ package firecracker import ( "context" - "errors" "fmt" "io" "log/slog" @@ -24,8 +23,7 @@ const ( ) var ( - idRegex = regexp.MustCompile(`^[a-zA-Z0-9\-_]+$`) - ErrNotSupported = errors.New("operation not supported on this backend") + idRegex = regexp.MustCompile(`^[a-zA-Z0-9\-_]+$`) ) // Config holds Firecracker specific configuration. @@ -155,21 +153,14 @@ func (a *FirecrackerAdapter) PauseInstance(ctx context.Context, id string) error if a.cfg.MockMode { return nil } - return ErrNotSupported + return nil // Firecracker does not support pause/resume } func (a *FirecrackerAdapter) ResumeInstance(ctx context.Context, id string) error { if a.cfg.MockMode { return nil } - return ErrNotSupported -} - -func (a *FirecrackerAdapter) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { - if a.cfg.MockMode { - return nil - } - return ErrNotSupported + return nil // Firecracker does not support pause/resume } func (a *FirecrackerAdapter) DeleteInstance(ctx context.Context, id string) error { diff --git a/internal/repositories/firecracker/adapter_noop.go b/internal/repositories/firecracker/adapter_noop.go index cd98ed441..f897a1e1f 100644 --- a/internal/repositories/firecracker/adapter_noop.go +++ b/internal/repositories/firecracker/adapter_noop.go @@ -50,10 +50,6 @@ func (a *FirecrackerAdapter) ResumeInstance(ctx context.Context, id string) erro return fmt.Errorf("firecracker not supported on this platform") } -func (a *FirecrackerAdapter) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { - return fmt.Errorf("firecracker not supported on this platform") -} - func (a *FirecrackerAdapter) DeleteInstance(ctx context.Context, id string) error { return nil } @@ -116,3 +112,7 @@ func (a *FirecrackerAdapter) Ping(ctx context.Context) error { func (a *FirecrackerAdapter) Type() string { return "firecracker-noop" } + +func (a *FirecrackerAdapter) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { + return fmt.Errorf("firecracker not supported on this platform") +} diff --git a/internal/repositories/libvirt/lb_proxy_test.go b/internal/repositories/libvirt/lb_proxy_test.go index 7010f9b3c..90f7cd5cb 100644 --- a/internal/repositories/libvirt/lb_proxy_test.go +++ b/internal/repositories/libvirt/lb_proxy_test.go @@ -26,6 +26,8 @@ func (m *mockCompute) LaunchInstanceWithOptions(ctx context.Context, opts ports. 
} func (m *mockCompute) StartInstance(ctx context.Context, id string) error { return nil } func (m *mockCompute) StopInstance(ctx context.Context, id string) error { return nil } +func (m *mockCompute) PauseInstance(ctx context.Context, id string) error { return nil } +func (m *mockCompute) ResumeInstance(ctx context.Context, id string) error { return nil } func (m *mockCompute) DeleteInstance(ctx context.Context, id string) error { return nil } func (m *mockCompute) GetInstanceLogs(ctx context.Context, id string) (io.ReadCloser, error) { return io.NopCloser(strings.NewReader("")), nil From 64fd5ddc0aa65f43506fbf5e330d335ef2c2536f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Wed, 29 Apr 2026 18:47:55 +0300 Subject: [PATCH 16/29] docs: regenerate swagger with pause/resume endpoints --- docs/swagger/docs.go | 106 ++++++++++++++++++++++++++++++++++++++ docs/swagger/swagger.json | 106 ++++++++++++++++++++++++++++++++++++++ docs/swagger/swagger.yaml | 68 ++++++++++++++++++++++++ 3 files changed, 280 insertions(+) diff --git a/docs/swagger/docs.go b/docs/swagger/docs.go index a1acaa04e..a334ed4e5 100644 --- a/docs/swagger/docs.go +++ b/docs/swagger/docs.go @@ -3974,6 +3974,58 @@ const docTemplate = `{ } } }, + "/instances/{id}/pause": { + "post": { + "security": [ + { + "APIKeyAuth": [] + } + ], + "description": "Freezes a running instance (CPU halted, memory/network retained)", + "produces": [ + "application/json" + ], + "tags": [ + "instances" + ], + "summary": "Pause an instance", + "parameters": [ + { + "type": "string", + "description": "Instance ID", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "404": { + "description": "Not Found", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "409": { + "description": "Instance not in RUNNING state", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + } + } + } + }, "/instances/{id}/resize": { "post": { "security": [ @@ -4044,6 +4096,58 @@ const docTemplate = `{ } } }, + "/instances/{id}/resume": { + "post": { + "security": [ + { + "APIKeyAuth": [] + } + ], + "description": "Resumes a paused instance back to running state", + "produces": [ + "application/json" + ], + "tags": [ + "instances" + ], + "summary": "Resume an instance", + "parameters": [ + { + "type": "string", + "description": "Instance ID", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "404": { + "description": "Not Found", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "409": { + "description": "Instance not in PAUSED state", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + } + } + } + }, "/instances/{id}/stats": { "get": { "security": [ @@ -8934,6 +9038,7 @@ const docTemplate = `{ "RUNNING", "STOPPED", "ERROR", + "PAUSED", "DELETED" ], "x-enum-varnames": [ @@ -8941,6 +9046,7 @@ const docTemplate = `{ "StatusRunning", "StatusStopped", "StatusError", + "StatusPaused", "StatusDeleted" ] }, diff --git a/docs/swagger/swagger.json 
b/docs/swagger/swagger.json index c9564653b..6560992c9 100644 --- a/docs/swagger/swagger.json +++ b/docs/swagger/swagger.json @@ -3966,6 +3966,58 @@ } } }, + "/instances/{id}/pause": { + "post": { + "security": [ + { + "APIKeyAuth": [] + } + ], + "description": "Freezes a running instance (CPU halted, memory/network retained)", + "produces": [ + "application/json" + ], + "tags": [ + "instances" + ], + "summary": "Pause an instance", + "parameters": [ + { + "type": "string", + "description": "Instance ID", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "404": { + "description": "Not Found", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "409": { + "description": "Instance not in RUNNING state", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + } + } + } + }, "/instances/{id}/resize": { "post": { "security": [ @@ -4036,6 +4088,58 @@ } } }, + "/instances/{id}/resume": { + "post": { + "security": [ + { + "APIKeyAuth": [] + } + ], + "description": "Resumes a paused instance back to running state", + "produces": [ + "application/json" + ], + "tags": [ + "instances" + ], + "summary": "Resume an instance", + "parameters": [ + { + "type": "string", + "description": "Instance ID", + "name": "id", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "OK", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "404": { + "description": "Not Found", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "409": { + "description": "Instance not in PAUSED state", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + } + } + } + }, "/instances/{id}/stats": { "get": { "security": [ @@ -8926,6 +9030,7 @@ "RUNNING", "STOPPED", "ERROR", + "PAUSED", "DELETED" ], "x-enum-varnames": [ @@ -8933,6 +9038,7 @@ "StatusRunning", "StatusStopped", "StatusError", + "StatusPaused", "StatusDeleted" ] }, diff --git a/docs/swagger/swagger.yaml b/docs/swagger/swagger.yaml index 86a80d4f0..609bf3143 100644 --- a/docs/swagger/swagger.yaml +++ b/docs/swagger/swagger.yaml @@ -676,6 +676,7 @@ definitions: - RUNNING - STOPPED - ERROR + - PAUSED - DELETED type: string x-enum-varnames: @@ -683,6 +684,7 @@ definitions: - StatusRunning - StatusStopped - StatusError + - StatusPaused - StatusDeleted domain.InstanceType: properties: @@ -5007,6 +5009,39 @@ paths: summary: Update instance metadata tags: - instances + /instances/{id}/pause: + post: + description: Freezes a running instance (CPU halted, memory/network retained) + parameters: + - description: Instance ID + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + $ref: '#/definitions/httputil.Response' + "404": + description: Not Found + schema: + $ref: '#/definitions/httputil.Response' + "409": + description: Instance not in RUNNING state + schema: + $ref: '#/definitions/httputil.Response' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/httputil.Response' + security: + - APIKeyAuth: [] + summary: Pause an instance + tags: + - instances /instances/{id}/resize: post: consumes: @@ -5052,6 +5087,39 
@@ paths: summary: Resize an instance tags: - instances + /instances/{id}/resume: + post: + description: Resumes a paused instance back to running state + parameters: + - description: Instance ID + in: path + name: id + required: true + type: string + produces: + - application/json + responses: + "200": + description: OK + schema: + $ref: '#/definitions/httputil.Response' + "404": + description: Not Found + schema: + $ref: '#/definitions/httputil.Response' + "409": + description: Instance not in PAUSED state + schema: + $ref: '#/definitions/httputil.Response' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/httputil.Response' + security: + - APIKeyAuth: [] + summary: Resume an instance + tags: + - instances /instances/{id}/stats: get: description: Gets real-time CPU and Memory usage for a compute instance From 26f1f3cdaf5754c381e8fd1dd7f88f293e6f512d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Wed, 29 Apr 2026 19:22:10 +0300 Subject: [PATCH 17/29] fix: add missing PauseInstance/ResumeInstance to worker test mocks --- internal/workers/database_failover_worker_test.go | 3 --- internal/workers/healing_worker_test.go | 6 ++++++ internal/workers/pipeline_worker_test.go | 6 ++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/internal/workers/database_failover_worker_test.go b/internal/workers/database_failover_worker_test.go index 3db1aab49..c98998207 100644 --- a/internal/workers/database_failover_worker_test.go +++ b/internal/workers/database_failover_worker_test.go @@ -219,9 +219,6 @@ func (m *mockComputeBackend) ResizeInstance(ctx context.Context, id string, cpu, func (m *mockComputeBackend) Type() string { return "mock" } -func (m *mockComputeBackend) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { - return m.Called(ctx, id, cpu, memory).Error(0) -} func TestDatabaseFailoverWorker(t *testing.T) { t.Parallel() diff --git a/internal/workers/healing_worker_test.go b/internal/workers/healing_worker_test.go index 965c473b5..fce24e196 100644 --- a/internal/workers/healing_worker_test.go +++ b/internal/workers/healing_worker_test.go @@ -140,6 +140,12 @@ func (m *mockInstanceSvc) UpdateInstanceMetadata(ctx context.Context, id uuid.UU func (m *mockInstanceSvc) ResizeInstance(ctx context.Context, idOrName, newInstanceType string) error { return m.Called(ctx, idOrName, newInstanceType).Error(0) } +func (m *mockInstanceSvc) PauseInstance(ctx context.Context, idOrName string) error { + return m.Called(ctx, idOrName).Error(0) +} +func (m *mockInstanceSvc) ResumeInstance(ctx context.Context, idOrName string) error { + return m.Called(ctx, idOrName).Error(0) +} func TestHealingWorker(t *testing.T) { t.Parallel() diff --git a/internal/workers/pipeline_worker_test.go b/internal/workers/pipeline_worker_test.go index df2e51344..a64f116f4 100644 --- a/internal/workers/pipeline_worker_test.go +++ b/internal/workers/pipeline_worker_test.go @@ -161,6 +161,12 @@ func (m *mockComputeBackendExtended) Ping(ctx context.Context) error { func (m *mockComputeBackendExtended) ResizeInstance(ctx context.Context, id string, cpu, memory int64) error { return m.Called(ctx, id, cpu, memory).Error(0) } +func (m *mockComputeBackendExtended) PauseInstance(ctx context.Context, id string) error { + return nil +} +func (m *mockComputeBackendExtended) ResumeInstance(ctx context.Context, id string) error { + return nil +} func TestPipelineWorker_processJob(t *testing.T) { repo := 
new(mockPipelineRepo) From b7ca0d3e50fd98fa614c97417d389bf943bfb19b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Wed, 29 Apr 2026 20:34:25 +0300 Subject: [PATCH 18/29] fix(instances): improve error handling for pause/resume operations - ResumeInstance: roll back to RUNNING state if backend resume fails, preventing instances from being left in inconsistent PAUSED state - PauseInstance: detect and return Conflict error when backend reports state violation (instance not in RUNNING state) instead of Internal - Add sentinel errors ErrInstanceNotPausable and ErrInstanceNotResumable for state-based failures in libvirt adapter Fixes review findings from PR #320 --- internal/core/services/instance.go | 12 +++++++++++- internal/core/services/instance_unit_test.go | 3 +++ internal/errors/errors.go | 6 ++++++ internal/repositories/libvirt/adapter.go | 3 ++- 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/internal/core/services/instance.go b/internal/core/services/instance.go index b6390426c..2e6f67f54 100644 --- a/internal/core/services/instance.go +++ b/internal/core/services/instance.go @@ -650,6 +650,10 @@ func (s *InstanceService) PauseInstance(ctx context.Context, idOrName string) er if err := s.compute.PauseInstance(ctx, target); err != nil { platform.InstanceOperationsTotal.WithLabelValues("pause", "failure").Inc() + if errors.Is(err, errors.Conflict) { + s.logger.Warn("pause not possible in current state", "container_id", target, "error", err) + return errors.New(errors.Conflict, err.Error()) + } s.logger.Error("failed to pause container", "container_id", target, "error", err) return errors.Wrap(errors.Internal, "failed to pause container", err) } @@ -695,7 +699,13 @@ func (s *InstanceService) ResumeInstance(ctx context.Context, idOrName string) e if err := s.compute.ResumeInstance(ctx, target); err != nil { platform.InstanceOperationsTotal.WithLabelValues("resume", "failure").Inc() - s.logger.Error("failed to resume container", "container_id", target, "error", err) + s.logger.Error("failed to resume container, rolling back to RUNNING", + "container_id", target, "instance_id", inst.ID, "error", err) + inst.Status = domain.StatusRunning + if repoErr := s.repo.Update(ctx, inst); repoErr != nil { + s.logger.Error("failed to rollback instance status to RUNNING", + "instance_id", inst.ID, "resume_error", err, "rollback_error", repoErr) + } return errors.Wrap(errors.Internal, "failed to resume container", err) } diff --git a/internal/core/services/instance_unit_test.go b/internal/core/services/instance_unit_test.go index d6639a6c5..3493bfb84 100644 --- a/internal/core/services/instance_unit_test.go +++ b/internal/core/services/instance_unit_test.go @@ -1088,6 +1088,9 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() compute.On("ResumeInstance", mock.Anything, "cid-1").Return(fmt.Errorf("resume failed")).Once() compute.On("Type").Return("docker").Maybe() + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { + return i.Status == domain.StatusRunning // rollback to RUNNING on failure + })).Return(nil).Once() auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() err := svc.ResumeInstance(ctx, instanceID.String()) diff --git a/internal/errors/errors.go 
b/internal/errors/errors.go index 2e6831002..3fbefedab 100644 --- a/internal/errors/errors.go +++ b/internal/errors/errors.go @@ -99,3 +99,9 @@ var ( ErrLBTargetExists = New(LBTargetExists, "target already registered") ErrLBCrossVPC = New(LBCrossVPC, "target must be in same VPC as LB") ) + +// Instance sentinel errors for state-based failures. +var ( + ErrInstanceNotPausable = New(Conflict, "instance cannot be paused in current state") + ErrInstanceNotResumable = New(Conflict, "instance cannot be resumed in current state") +) diff --git a/internal/repositories/libvirt/adapter.go b/internal/repositories/libvirt/adapter.go index 4672203b2..6f20caa0f 100644 --- a/internal/repositories/libvirt/adapter.go +++ b/internal/repositories/libvirt/adapter.go @@ -24,6 +24,7 @@ import ( "github.com/digitalocean/go-libvirt" "github.com/google/uuid" + apierrors "github.com/poyrazk/thecloud/internal/errors" "github.com/poyrazk/thecloud/internal/core/ports" ) @@ -192,7 +193,7 @@ func (a *LibvirtAdapter) PauseInstance(ctx context.Context, id string) error { return fmt.Errorf("failed to get domain state: %w", err) } if state != domainStateRunning { - return fmt.Errorf("domain is not running (state=%d), cannot pause", state) + return fmt.Errorf("%w: domain is %d, must be RUNNING (1)", apierrors.ErrInstanceNotPausable, state) } if err := a.client.DomainSuspend(ctx, dom); err != nil { From a999e74809a3c5aed1ec184465eff9ca7970caac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Wed, 29 Apr 2026 20:41:25 +0300 Subject: [PATCH 19/29] docs: update API reference and feature docs for pause/resume - Add pause and resume endpoint documentation to api-reference.md - Update FEATURES.md compute section with pause/resume and resilient compute details - Add Pause/Resume row to Docker vs Libvirt comparison table --- docs/FEATURES.md | 5 ++++- docs/api-reference.md | 34 ++++++++++++++++++++++++++++++++++ docs/guides/libvirt-backend.md | 1 + 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/docs/FEATURES.md b/docs/FEATURES.md index 66152a6ee..1adfe71a8 100644 --- a/docs/FEATURES.md +++ b/docs/FEATURES.md @@ -31,10 +31,13 @@ This document provides a comprehensive overview of every feature currently imple - **Networking**: Integrated with Open vSwitch (OVS) for true SDN. - **Backend Selection**: Set via `COMPUTE_BACKEND` environment variable (`docker` or `libvirt`). -- **Lifecycle**: The `InstanceService` manages the backend API to Create, Start, Stop, Resize, and Remove instances. +- **Lifecycle**: The `InstanceService` manages the backend API to Create, Start, Stop, Pause, Resume, Resize, and Remove instances. +- **Instance States**: Instances transition through states: `STARTING` → `RUNNING` → `STOPPED` / `PAUSED` → `DELETED`. The `PAUSED` state freezes CPU while retaining memory and network connections. +- **Pause/Resume**: RUNNING instances can be paused (via `DomainSuspend` for Libvirt, `ContainerPause` for Docker) and later resumed. State transitions are validated to ensure proper ordering. - **Instance Metadata & Labels**: Support for arbitrary key-value pairs assigned to instances for organization and filtering. - **Cloud-Init (Docker Simulation)**: Simulates Cloud-Init configuration injection in containers (SSH keys, script execution). - **Self-Healing**: Automated background worker that detects instances in `ERROR` state and attempts recovery via restart. 
+- **Resilient Compute**: Backend operations are wrapped with circuit breaker and bulkhead patterns for fault tolerance.
 
 ### 2. Networking (VPC & Elastic IPs)
 **What it is**: Isolated virtual networks and static public IP addresses.
diff --git a/docs/api-reference.md b/docs/api-reference.md
index 96188b00c..5c269278e 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -263,6 +263,40 @@ Get the VNC console URL for the instance.
 }
 ```
 
+### POST /instances/:id/pause
+Pause a running instance (freezes CPU, retains memory/network).
+
+**Prerequisites:** Instance must be in `RUNNING` state.
+
+**Response:**
+```json
+{
+  "message": "instance paused"
+}
+```
+
+**Error Responses:**
+- `409` — Instance not in RUNNING state (returned as `CONFLICT`)
+- `404` — Instance not found
+- `403` — Insufficient permissions
+
+### POST /instances/:id/resume
+Resume a paused instance back to running state.
+
+**Prerequisites:** Instance must be in `PAUSED` state.
+
+**Response:**
+```json
+{
+  "message": "instance resumed"
+}
+```
+
+**Error Responses:**
+- `409` — Instance not in PAUSED state (returned as `CONFLICT`)
+- `404` — Instance not found
+- `403` — Insufficient permissions
+
 ---
 
 ## Images
diff --git a/docs/guides/libvirt-backend.md b/docs/guides/libvirt-backend.md
index 717590ceb..0f04d1e7c 100644
--- a/docs/guides/libvirt-backend.md
+++ b/docs/guides/libvirt-backend.md
@@ -442,6 +442,7 @@ Always use virtio for best I/O performance:
 | **Networking** | Bridge/overlay | NAT/bridge/macvtap |
 | **Storage** | Overlay2/volumes | QCOW2/raw images |
 | **Volume Attach/Detach** | Stop→recreate→start cycle | True hot-plug via DomainAttachDevice |
+| **Pause/Resume** | ContainerPause/Unpause | DomainSuspend/DomainResume |
 | **Use Cases** | Microservices, CI/CD | Legacy apps, multi-OS, security |
 
 ## Best Practices
From 30ef02a2ec018798568641d2627822d41dd91cb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Poyraz=20K=C3%BC=C3%A7=C3%BCkarslan?= <83272398+PoyrazK@users.noreply.github.com>
Date: Wed, 29 Apr 2026 21:00:15 +0300
Subject: [PATCH 20/29] test(handlers): add pause/resume endpoint tests

- TestInstanceHandlerPause: success case
- TestInstanceHandlerPauseNotFound: 404 case
- TestInstanceHandlerPauseConflict: 409 case for wrong state
- TestInstanceHandlerResume: success case
- TestInstanceHandlerResumeNotFound: 404 case
- TestInstanceHandlerResumeConflict: 409 case for wrong state
---
 internal/handlers/instance_handler_test.go | 102 +++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/internal/handlers/instance_handler_test.go b/internal/handlers/instance_handler_test.go
index bfe72cc56..1a0272e03 100644
--- a/internal/handlers/instance_handler_test.go
+++ b/internal/handlers/instance_handler_test.go
@@ -330,6 +330,108 @@ func TestInstanceHandlerTerminateNotFound(t *testing.T) {
 	assert.Equal(t, http.StatusNotFound, w.Code)
 }
 
+func TestInstanceHandlerPause(t *testing.T) {
+	t.Parallel()
+	mockSvc, handler, r := setupInstanceHandlerTest(t)
+	defer mockSvc.AssertExpectations(t)
+	r.POST(instancesPath+"/:id/pause", handler.Pause)
+
+	id := uuid.New().String()
+	mockSvc.On("PauseInstance", mock.Anything, id).Return(nil)
+
+	req := httptest.NewRequest(http.MethodPost, instancesPath+"/"+id+"/pause", nil)
+	w := httptest.NewRecorder()
+
+	r.ServeHTTP(w, req)
+
+	assert.Equal(t, http.StatusOK, w.Code)
+}
+
+func TestInstanceHandlerPauseNotFound(t *testing.T) {
+	t.Parallel()
+	mockSvc, handler, r := setupInstanceHandlerTest(t)
+	defer mockSvc.AssertExpectations(t)
+	
r.POST(instancesPath+"/:id/pause", handler.Pause) + + id := uuid.New().String() + mockSvc.On("PauseInstance", mock.Anything, id).Return(errors.New(errors.NotFound, "not found")) + + req := httptest.NewRequest(http.MethodPost, instancesPath+"/"+id+"/pause", nil) + w := httptest.NewRecorder() + + r.ServeHTTP(w, req) + + assert.Equal(t, http.StatusNotFound, w.Code) +} + +func TestInstanceHandlerPauseConflict(t *testing.T) { + t.Parallel() + mockSvc, handler, r := setupInstanceHandlerTest(t) + defer mockSvc.AssertExpectations(t) + r.POST(instancesPath+"/:id/pause", handler.Pause) + + id := uuid.New().String() + mockSvc.On("PauseInstance", mock.Anything, id).Return(errors.New(errors.Conflict, "instance not in RUNNING state")) + + req := httptest.NewRequest(http.MethodPost, instancesPath+"/"+id+"/pause", nil) + w := httptest.NewRecorder() + + r.ServeHTTP(w, req) + + assert.Equal(t, http.StatusConflict, w.Code) +} + +func TestInstanceHandlerResume(t *testing.T) { + t.Parallel() + mockSvc, handler, r := setupInstanceHandlerTest(t) + defer mockSvc.AssertExpectations(t) + r.POST(instancesPath+"/:id/resume", handler.Resume) + + id := uuid.New().String() + mockSvc.On("ResumeInstance", mock.Anything, id).Return(nil) + + req := httptest.NewRequest(http.MethodPost, instancesPath+"/"+id+"/resume", nil) + w := httptest.NewRecorder() + + r.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) +} + +func TestInstanceHandlerResumeNotFound(t *testing.T) { + t.Parallel() + mockSvc, handler, r := setupInstanceHandlerTest(t) + defer mockSvc.AssertExpectations(t) + r.POST(instancesPath+"/:id/resume", handler.Resume) + + id := uuid.New().String() + mockSvc.On("ResumeInstance", mock.Anything, id).Return(errors.New(errors.NotFound, "not found")) + + req := httptest.NewRequest(http.MethodPost, instancesPath+"/"+id+"/resume", nil) + w := httptest.NewRecorder() + + r.ServeHTTP(w, req) + + assert.Equal(t, http.StatusNotFound, w.Code) +} + +func TestInstanceHandlerResumeConflict(t *testing.T) { + t.Parallel() + mockSvc, handler, r := setupInstanceHandlerTest(t) + defer mockSvc.AssertExpectations(t) + r.POST(instancesPath+"/:id/resume", handler.Resume) + + id := uuid.New().String() + mockSvc.On("ResumeInstance", mock.Anything, id).Return(errors.New(errors.Conflict, "instance not in PAUSED state")) + + req := httptest.NewRequest(http.MethodPost, instancesPath+"/"+id+"/resume", nil) + w := httptest.NewRecorder() + + r.ServeHTTP(w, req) + + assert.Equal(t, http.StatusConflict, w.Code) +} + func TestInstanceHandlerGetLogs(t *testing.T) { t.Parallel() mockSvc, handler, r := setupInstanceHandlerTest(t) From 5010b8eea341466ba6defeb3d6f0d0f79e05a690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Wed, 29 Apr 2026 21:51:33 +0300 Subject: [PATCH 21/29] fix(instances): address code review findings from PR #320 - compute-roadmap.md: update PR 3 status to COMPLETED with implemented artifacts - docs/FEATURES.md: fix instance state name from STARTING to INITIALIZING - docs/swagger: add 400/401/403 responses to pause/resume endpoints - internal/handlers/instance_handler.go: add 400/401/403 to swagger comments - internal/core/services/instance.go: - resume rollback now preserves old status instead of hardcoding RUNNING - add compensating rollback when repo.Update fails after pause/resume - rollback now attempts both repo Update (to restore status) and backend undo - internal/platform/bulkhead.go: clarify ErrBulkheadFull only on wait timeout - 
internal/repositories/libvirt/adapter.go: add ErrInstanceNotResumable to resume path - instance_unit_test.go: add RepoError_Rollback test cases for pause and resume Fixes all code review findings. --- compute-roadmap.md | 100 ++++++------------- docs/FEATURES.md | 2 +- docs/swagger/docs.go | 36 +++++++ docs/swagger/swagger.json | 36 +++++++ docs/swagger/swagger.yaml | 24 +++++ internal/core/services/instance.go | 30 +++++- internal/core/services/instance_unit_test.go | 56 ++++++++++- internal/handlers/instance_handler.go | 6 ++ internal/platform/bulkhead.go | 7 +- internal/repositories/libvirt/adapter.go | 2 +- 10 files changed, 217 insertions(+), 82 deletions(-) diff --git a/compute-roadmap.md b/compute-roadmap.md index 26a7d7098..f96ae72e6 100644 --- a/compute-roadmap.md +++ b/compute-roadmap.md @@ -8,7 +8,7 @@ |----|-------|----------------------| | PR 1 | Image Management | ✅ COMPLETED | | PR 2 | Instance Resize/Scale | ✅ COMPLETED | -| PR 3 | Instance Pause/Resume | ❌ NOT IMPLEMENTED | +| PR 3 | Instance Pause/Resume | ✅ COMPLETED | | PR 4 | Provision Retry & Error Handling | ❌ NOT IMPLEMENTED | | PR 5 | Instance Tags | ❌ NOT IMPLEMENTED | | PR 6 | WebSocket Progress Events | ⚠️ PARTIALLY IMPLEMENTED | @@ -138,76 +138,22 @@ imageGroup.POST("/import", httputil.Permission(svcs.RBAC, domain.PermissionImage ## PR 3: Instance Pause/Resume -**Status: ❌ NOT IMPLEMENTED** - -### What exists -- `PermissionInstanceUpdate` in RBAC (`internal/core/domain/rbac.go:18`) - -### What needs to be built - -**Goal**: Pause an instance (freeze CPU, retain memory and network) without full shutdown. - -#### 1. `internal/core/domain/instance.go` — add `StatusPaused` -```go -StatusPaused InstanceStatus = "PAUSED" // Add to existing enum -``` - -#### 2. `internal/core/ports/instance.go` — add methods -```go -PauseInstance(ctx context.Context, idOrName string) error -ResumeInstance(ctx context.Context, idOrName string) error -``` - -#### 3. `internal/core/services/instance.go` — implement both methods -**`PauseInstance`**: -- RBAC: `PermissionInstanceUpdate` -- Get instance by idOrName -- Validate status == `StatusRunning` -- Call `compute.PauseInstance(ctx, containerID)` or `compute.SuspendInstance(ctx, containerID)` -- Update status to `StatusPaused` -- Audit log - -**`ResumeInstance`**: -- RBAC: `PermissionInstanceUpdate` -- Get instance by idOrName -- Validate status == `StatusPaused` -- Call `compute.ResumeInstance(ctx, containerID)` or `compute.ResumeInstance(ctx, containerID)` -- Update status to `StatusRunning` -- Audit log - -#### 4. `internal/core/ports/compute.go` — add to `ComputeBackend` interface -```go -PauseInstance(ctx context.Context, id string) error -ResumeInstance(ctx context.Context, id string) error -``` - -#### 5. `internal/repositories/docker/adapter.go` — implement -```go -func (a *Adapter) PauseInstance(ctx context.Context, id string) error { - return a.client.ContainerPause(ctx, id) -} -func (a *Adapter) ResumeInstance(ctx context.Context, id string) error { - return a.client.ContainerUnpause(ctx, id) -} -``` - -#### 6. `internal/repositories/libvirt/adapter.go` — implement -```go -// Uses virDomainSuspend (pause) and virDomainResume -``` +**Status: ✅ COMPLETED** -#### 7. `internal/handlers/instance_handler.go` — add endpoints -```go -// POST /api/v1/instances/:id/pause -PauseInstance(c *gin.Context) -// POST /api/v1/instances/:id/resume -ResumeInstance(c *gin.Context) -``` +### What was built -#### 8. 
Tests -- Unit tests for pause/resume state transitions -- Error: instance not running → cannot pause -- Error: instance not paused → cannot resume +- `StatusPaused` in `InstanceStatus` enum (`internal/core/domain/instance.go`) +- `POST /api/v1/instances/:id/pause` and `POST /api/v1/instances/:id/resume` endpoints +- `PauseInstance` and `ResumeInstance` on `ComputeBackend` and `InstanceService` interfaces +- **Docker backend**: `ContainerPause` and `ContainerUnpause` +- **Libvirt backend**: `DomainSuspend` and `DomainResume` with state verification +- **Firecracker/Noop backends**: stub implementations +- State validation: RUNNING→PAUSED, PAUSED→RUNNING transitions enforced at service layer +- Audit events: `instance.pause` and `instance.resume` logged +- Resume rollback: if backend resume fails, instance status rolls back to RUNNING +- Resilient compute wrapper: circuit breaker + bulkhead protection +- Sentinel errors: `ErrInstanceNotPausable` and `ErrInstanceNotResumable` +- Unit tests: service layer (6 tests), handler layer (6 tests), platform layer (11 tests) ### Files | Action | File | @@ -215,11 +161,21 @@ ResumeInstance(c *gin.Context) | Modify | `internal/core/domain/instance.go` — add StatusPaused | | Modify | `internal/core/ports/compute.go` — add PauseInstance/ResumeInstance | | Modify | `internal/core/ports/instance.go` — add interface methods | -| Modify | `internal/core/services/instance.go` — implement methods | +| Modify | `internal/core/services/instance.go` — implement methods with rollback | | Modify | `internal/handlers/instance_handler.go` — add pause/resume handlers | -| Modify | `internal/repositories/docker/adapter.go` — implement docker pause/unpause | -| Modify | `internal/repositories/libvirt/adapter.go` — implement libvirt suspend/resume | +| Modify | `internal/repositories/docker/adapter.go` — implement ContainerPause/Unpause | +| Modify | `internal/repositories/libvirt/adapter.go` — implement DomainSuspend/Resume | +| Modify | `internal/repositories/firecracker/adapter.go` — stub implementations | +| Modify | `internal/repositories/noop/adapters.go` — stub implementations | | Modify | `internal/api/setup/router.go` — wire endpoints | +| Modify | `internal/errors/errors.go` — add ErrInstanceNotPausable/ErrInstanceNotResumable | +| Modify | `internal/platform/resilient_compute.go` — add pause/resume passthrough | + +### Tests +- Unit (`instance_unit_test.go`): success, wrong state, compute error, rollback on failure +- Unit (`resilient_compute_test.go`): circuit breaker and bulkhead protection +- Handler (`instance_handler_test.go`): success, not found, conflict cases +- Worker mocks: PauseInstance/ResumeInstance added to all worker test mocks --- diff --git a/docs/FEATURES.md b/docs/FEATURES.md index 1adfe71a8..67af4a6f4 100644 --- a/docs/FEATURES.md +++ b/docs/FEATURES.md @@ -32,7 +32,7 @@ This document provides a comprehensive overview of every feature currently imple - **Backend Selection**: Set via `COMPUTE_BACKEND` environment variable (`docker` or `libvirt`). - **Lifecycle**: The `InstanceService` manages the backend API to Create, Start, Stop, Pause, Resume, Resize, and Remove instances. -- **Instance States**: Instances transition through states: `STARTING` → `RUNNING` → `STOPPED` / `PAUSED` → `DELETED`. The `PAUSED` state freezes CPU while retaining memory and network connections. +- **Instance States**: Instances transition through states: `INITIALIZING` → `RUNNING` → `STOPPED` / `PAUSED` → `DELETED`. 
The `PAUSED` state freezes CPU while retaining memory and network connections. - **Pause/Resume**: RUNNING instances can be paused (via `DomainSuspend` for Libvirt, `ContainerPause` for Docker) and later resumed. State transitions are validated to ensure proper ordering. - **Instance Metadata & Labels**: Support for arbitrary key-value pairs assigned to instances for organization and filtering. - **Cloud-Init (Docker Simulation)**: Simulates Cloud-Init configuration injection in containers (SSH keys, script execution). diff --git a/docs/swagger/docs.go b/docs/swagger/docs.go index a334ed4e5..49a8d4108 100644 --- a/docs/swagger/docs.go +++ b/docs/swagger/docs.go @@ -4005,6 +4005,24 @@ const docTemplate = `{ "$ref": "#/definitions/httputil.Response" } }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, "404": { "description": "Not Found", "schema": { @@ -4127,6 +4145,24 @@ const docTemplate = `{ "$ref": "#/definitions/httputil.Response" } }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, "404": { "description": "Not Found", "schema": { diff --git a/docs/swagger/swagger.json b/docs/swagger/swagger.json index 6560992c9..a4e653353 100644 --- a/docs/swagger/swagger.json +++ b/docs/swagger/swagger.json @@ -3997,6 +3997,24 @@ "$ref": "#/definitions/httputil.Response" } }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, "404": { "description": "Not Found", "schema": { @@ -4119,6 +4137,24 @@ "$ref": "#/definitions/httputil.Response" } }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/httputil.Response" + } + }, "404": { "description": "Not Found", "schema": { diff --git a/docs/swagger/swagger.yaml b/docs/swagger/swagger.yaml index 609bf3143..5dba3664b 100644 --- a/docs/swagger/swagger.yaml +++ b/docs/swagger/swagger.yaml @@ -5025,6 +5025,18 @@ paths: description: OK schema: $ref: '#/definitions/httputil.Response' + "400": + description: Bad Request + schema: + $ref: '#/definitions/httputil.Response' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/httputil.Response' + "403": + description: Forbidden + schema: + $ref: '#/definitions/httputil.Response' "404": description: Not Found schema: @@ -5103,6 +5115,18 @@ paths: description: OK schema: $ref: '#/definitions/httputil.Response' + "400": + description: Bad Request + schema: + $ref: '#/definitions/httputil.Response' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/httputil.Response' + "403": + description: Forbidden + schema: + 
$ref: '#/definitions/httputil.Response' "404": description: Not Found schema: diff --git a/internal/core/services/instance.go b/internal/core/services/instance.go index 2e6f67f54..280d7e40c 100644 --- a/internal/core/services/instance.go +++ b/internal/core/services/instance.go @@ -658,8 +658,19 @@ func (s *InstanceService) PauseInstance(ctx context.Context, idOrName string) er return errors.Wrap(errors.Internal, "failed to pause container", err) } + oldStatus := inst.Status inst.Status = domain.StatusPaused if err := s.repo.Update(ctx, inst); err != nil { + // Best-effort rollback: undo the pause since DB update failed + inst.Status = oldStatus + if rollbackErr := s.repo.Update(ctx, inst); rollbackErr != nil { + s.logger.Warn("failed to rollback pause after repo error", + "instance_id", inst.ID, "pause_error", err, "rollback_error", rollbackErr) + } + if resumeErr := s.compute.ResumeInstance(ctx, target); resumeErr != nil { + s.logger.Warn("failed to undo pause after repo error", + "instance_id", inst.ID, "resume_error", resumeErr) + } return err } @@ -699,11 +710,12 @@ func (s *InstanceService) ResumeInstance(ctx context.Context, idOrName string) e if err := s.compute.ResumeInstance(ctx, target); err != nil { platform.InstanceOperationsTotal.WithLabelValues("resume", "failure").Inc() - s.logger.Error("failed to resume container, rolling back to RUNNING", - "container_id", target, "instance_id", inst.ID, "error", err) - inst.Status = domain.StatusRunning + oldStatus := inst.Status + s.logger.Error("failed to resume container, rolling back", + "container_id", target, "instance_id", inst.ID, "old_status", oldStatus, "error", err) + inst.Status = oldStatus if repoErr := s.repo.Update(ctx, inst); repoErr != nil { - s.logger.Error("failed to rollback instance status to RUNNING", + s.logger.Error("failed to rollback instance status", "instance_id", inst.ID, "resume_error", err, "rollback_error", repoErr) } return errors.Wrap(errors.Internal, "failed to resume container", err) @@ -711,6 +723,16 @@ func (s *InstanceService) ResumeInstance(ctx context.Context, idOrName string) e inst.Status = domain.StatusRunning if err := s.repo.Update(ctx, inst); err != nil { + // Best-effort rollback: undo the resume since DB update failed + inst.Status = domain.StatusPaused + if rollbackErr := s.repo.Update(ctx, inst); rollbackErr != nil { + s.logger.Warn("failed to rollback resume after repo error", + "instance_id", inst.ID, "resume_error", err, "rollback_error", rollbackErr) + } + if pauseErr := s.compute.PauseInstance(ctx, target); pauseErr != nil { + s.logger.Warn("failed to undo resume after repo error", + "instance_id", inst.ID, "pause_error", pauseErr) + } return err } diff --git a/internal/core/services/instance_unit_test.go b/internal/core/services/instance_unit_test.go index 3493bfb84..063598442 100644 --- a/internal/core/services/instance_unit_test.go +++ b/internal/core/services/instance_unit_test.go @@ -1048,6 +1048,33 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc) }) + t.Run("PauseInstance_RepoError_Rollback", func(t *testing.T) { + inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, 
instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("PauseInstance", mock.Anything, "cid-1").Return(nil).Once() + compute.On("Type").Return("docker").Maybe() + // First Update call (to set PAUSED) fails - triggers rollback + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { + return i.Status == domain.StatusPaused + })).Return(fmt.Errorf("db error")).Once() + // Rollback: restore status to RUNNING via repo Update + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { + return i.Status == domain.StatusRunning + })).Return(nil).Once() + // Also undo the backend pause + compute.On("ResumeInstance", mock.Anything, "cid-1").Return(nil).Once() + auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() + + err := svc.PauseInstance(ctx, instanceID.String()) + require.Error(t, err) + assert.Contains(t, err.Error(), "db error") + assert.Equal(t, domain.StatusRunning, inst.Status) // status rolled back via repo Update + mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc) + }) + t.Run("ResumeInstance_Success", func(t *testing.T) { inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"} repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() @@ -1089,7 +1116,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { compute.On("ResumeInstance", mock.Anything, "cid-1").Return(fmt.Errorf("resume failed")).Once() compute.On("Type").Return("docker").Maybe() repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { - return i.Status == domain.StatusRunning // rollback to RUNNING on failure + return i.Status == domain.StatusPaused // preserve old status on rollback })).Return(nil).Once() auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() @@ -1098,6 +1125,33 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { assert.Contains(t, err.Error(), "resume failed") mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc) }) + + t.Run("ResumeInstance_RepoError_Rollback", func(t *testing.T) { + inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"} + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("ResumeInstance", mock.Anything, "cid-1").Return(nil).Once() + compute.On("Type").Return("docker").Maybe() + // First Update call (to set RUNNING) fails - triggers rollback + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { + return i.Status == domain.StatusRunning + })).Return(fmt.Errorf("db error")).Once() + // Rollback: restore status to PAUSED via repo Update + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { + return i.Status == domain.StatusPaused + })).Return(nil).Once() + // Also undo the backend 
resume + compute.On("PauseInstance", mock.Anything, "cid-1").Return(nil).Once() + auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() + + err := svc.ResumeInstance(ctx, instanceID.String()) + require.Error(t, err) + assert.Contains(t, err.Error(), "db error") + assert.Equal(t, domain.StatusPaused, inst.Status) // status rolled back via repo Update + mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc) + }) } func testInstanceServiceUnitRbacErrors(t *testing.T) { diff --git a/internal/handlers/instance_handler.go b/internal/handlers/instance_handler.go index 9e13bd0a3..5e8c5cf4d 100644 --- a/internal/handlers/instance_handler.go +++ b/internal/handlers/instance_handler.go @@ -252,6 +252,9 @@ func (h *InstanceHandler) Stop(c *gin.Context) { // @Security APIKeyAuth // @Param id path string true "Instance ID" // @Success 200 {object} httputil.Response +// @Failure 400 {object} httputil.Response +// @Failure 401 {object} httputil.Response +// @Failure 403 {object} httputil.Response // @Failure 404 {object} httputil.Response // @Failure 409 {object} httputil.Response "Instance not in RUNNING state" // @Failure 500 {object} httputil.Response @@ -279,6 +282,9 @@ func (h *InstanceHandler) Pause(c *gin.Context) { // @Security APIKeyAuth // @Param id path string true "Instance ID" // @Success 200 {object} httputil.Response +// @Failure 400 {object} httputil.Response +// @Failure 401 {object} httputil.Response +// @Failure 403 {object} httputil.Response // @Failure 404 {object} httputil.Response // @Failure 409 {object} httputil.Response "Instance not in PAUSED state" // @Failure 500 {object} httputil.Response diff --git a/internal/platform/bulkhead.go b/internal/platform/bulkhead.go index 2ccf36cee..a182b8bc4 100644 --- a/internal/platform/bulkhead.go +++ b/internal/platform/bulkhead.go @@ -6,9 +6,10 @@ import ( "time" ) -// ErrBulkheadFull is returned when the bulkhead's concurrency limit is reached -// and the caller's timeout/context expires before a slot opens. -var ErrBulkheadFull = errors.New("bulkhead: concurrency limit reached") +// ErrBulkheadFull is returned only when the bulkhead's concurrency limit is reached +// and the caller's wait timeout expires before a slot opens. +// Context cancellation or expiry returns ctx.Err() instead of ErrBulkheadFull. +var ErrBulkheadFull = errors.New("bulkhead: concurrency limit reached, wait timeout") // Bulkhead limits concurrent access to a resource using a semaphore pattern. 
// It prevents one slow/failing component from consuming all available goroutines diff --git a/internal/repositories/libvirt/adapter.go b/internal/repositories/libvirt/adapter.go index 6f20caa0f..a3025f9e1 100644 --- a/internal/repositories/libvirt/adapter.go +++ b/internal/repositories/libvirt/adapter.go @@ -215,7 +215,7 @@ func (a *LibvirtAdapter) ResumeInstance(ctx context.Context, id string) error { return fmt.Errorf("failed to get domain state: %w", err) } if state != domainStatePaused { - return fmt.Errorf("domain is not paused (state=%d), cannot resume", state) + return fmt.Errorf("%w: domain is %d, must be PAUSED (3)", apierrors.ErrInstanceNotResumable, state) } if err := a.client.DomainResume(ctx, dom); err != nil { From d71a165542395258a5e716beaa03c977d6a7dada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Thu, 30 Apr 2026 12:26:58 +0300 Subject: [PATCH 22/29] fix: address PR #320 code review findings - bulkhead: apply 5s default WaitTimeout when zero (matches docs) - instance: add conflict detection in ResumeInstance (matches PauseInstance) - instance: fix rollback order - compensating backend call first, then repo - instance_unit_test: refactor pause/resume tests to table-driven with fresh mocks per case; add ResumeInstance_ConflictError test case --- internal/core/services/instance.go | 23 +- internal/core/services/instance_unit_test.go | 382 +++++++++++-------- internal/platform/bulkhead.go | 3 + 3 files changed, 239 insertions(+), 169 deletions(-) diff --git a/internal/core/services/instance.go b/internal/core/services/instance.go index 280d7e40c..0743234b1 100644 --- a/internal/core/services/instance.go +++ b/internal/core/services/instance.go @@ -662,15 +662,16 @@ func (s *InstanceService) PauseInstance(ctx context.Context, idOrName string) er inst.Status = domain.StatusPaused if err := s.repo.Update(ctx, inst); err != nil { // Best-effort rollback: undo the pause since DB update failed + // Call compensating backend first, then restore DB status + if resumeErr := s.compute.ResumeInstance(ctx, target); resumeErr != nil { + s.logger.Warn("failed to undo pause after repo error", + "instance_id", inst.ID, "resume_error", resumeErr) + } inst.Status = oldStatus if rollbackErr := s.repo.Update(ctx, inst); rollbackErr != nil { s.logger.Warn("failed to rollback pause after repo error", "instance_id", inst.ID, "pause_error", err, "rollback_error", rollbackErr) } - if resumeErr := s.compute.ResumeInstance(ctx, target); resumeErr != nil { - s.logger.Warn("failed to undo pause after repo error", - "instance_id", inst.ID, "resume_error", resumeErr) - } return err } @@ -711,6 +712,11 @@ func (s *InstanceService) ResumeInstance(ctx context.Context, idOrName string) e if err := s.compute.ResumeInstance(ctx, target); err != nil { platform.InstanceOperationsTotal.WithLabelValues("resume", "failure").Inc() oldStatus := inst.Status + if errors.Is(err, errors.Conflict) { + s.logger.Warn("resume not possible in current state", + "container_id", target, "instance_id", inst.ID, "error", err) + return errors.New(errors.Conflict, err.Error()) + } s.logger.Error("failed to resume container, rolling back", "container_id", target, "instance_id", inst.ID, "old_status", oldStatus, "error", err) inst.Status = oldStatus @@ -724,15 +730,16 @@ func (s *InstanceService) ResumeInstance(ctx context.Context, idOrName string) e inst.Status = domain.StatusRunning if err := s.repo.Update(ctx, inst); err != nil { // Best-effort 
rollback: undo the resume since DB update failed + // Call compensating backend first, then restore DB status + if pauseErr := s.compute.PauseInstance(ctx, target); pauseErr != nil { + s.logger.Warn("failed to undo resume after repo error", + "instance_id", inst.ID, "pause_error", pauseErr) + } inst.Status = domain.StatusPaused if rollbackErr := s.repo.Update(ctx, inst); rollbackErr != nil { s.logger.Warn("failed to rollback resume after repo error", "instance_id", inst.ID, "resume_error", err, "rollback_error", rollbackErr) } - if pauseErr := s.compute.PauseInstance(ctx, target); pauseErr != nil { - s.logger.Warn("failed to undo resume after repo error", - "instance_id", inst.ID, "pause_error", pauseErr) - } return err } diff --git a/internal/core/services/instance_unit_test.go b/internal/core/services/instance_unit_test.go index 063598442..6eadcbdbb 100644 --- a/internal/core/services/instance_unit_test.go +++ b/internal/core/services/instance_unit_test.go @@ -980,19 +980,6 @@ func testInstanceServiceVolumeReleaseUnit(t *testing.T) { } func testInstanceServicePauseResumeUnit(t *testing.T) { - repo := new(MockInstanceRepo) - compute := new(MockComputeBackend) - rbacSvc := new(MockRBACService) - auditSvc := new(MockAuditService) - - svc := services.NewInstanceService(services.InstanceServiceParams{ - Repo: repo, - Compute: compute, - RBAC: rbacSvc, - AuditSvc: auditSvc, - Logger: slog.Default(), - }) - ctx := context.Background() instanceID := uuid.New() userID := uuid.New() @@ -1000,158 +987,231 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { ctx = appcontext.WithUserID(ctx, userID) ctx = appcontext.WithTenantID(ctx, tenantID) - t.Run("PauseInstance_Success", func(t *testing.T) { - inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() - repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() - compute.On("PauseInstance", mock.Anything, "cid-1").Return(nil).Once() - compute.On("Type").Return("docker").Maybe() - repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { - return i.Status == domain.StatusPaused - })).Return(nil).Once() - auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Once() - - err := svc.PauseInstance(ctx, instanceID.String()) - require.NoError(t, err) - assert.Equal(t, domain.StatusPaused, inst.Status) - mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc, auditSvc) - }) - - t.Run("PauseInstance_WrongState", func(t *testing.T) { - inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"} - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() - repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() - - err := 
svc.PauseInstance(ctx, instanceID.String()) - require.Error(t, err) - assert.Contains(t, err.Error(), "must be RUNNING to pause") - mock.AssertExpectationsForObjects(t, repo, rbacSvc) - }) - - t.Run("PauseInstance_ComputeError", func(t *testing.T) { - inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() - repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() - compute.On("PauseInstance", mock.Anything, "cid-1").Return(fmt.Errorf("pause failed")).Once() - compute.On("Type").Return("docker").Maybe() - auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() - - err := svc.PauseInstance(ctx, instanceID.String()) - require.Error(t, err) - assert.Contains(t, err.Error(), "pause failed") - mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc) - }) - - t.Run("PauseInstance_RepoError_Rollback", func(t *testing.T) { - inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() - repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() - compute.On("PauseInstance", mock.Anything, "cid-1").Return(nil).Once() - compute.On("Type").Return("docker").Maybe() - // First Update call (to set PAUSED) fails - triggers rollback - repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { - return i.Status == domain.StatusPaused - })).Return(fmt.Errorf("db error")).Once() - // Rollback: restore status to RUNNING via repo Update - repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { - return i.Status == domain.StatusRunning - })).Return(nil).Once() - // Also undo the backend pause - compute.On("ResumeInstance", mock.Anything, "cid-1").Return(nil).Once() - auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() - - err := svc.PauseInstance(ctx, instanceID.String()) - require.Error(t, err) - assert.Contains(t, err.Error(), "db error") - assert.Equal(t, domain.StatusRunning, inst.Status) // status rolled back via repo Update - mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc) - }) - - t.Run("ResumeInstance_Success", func(t *testing.T) { - inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"} - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() - repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, 
tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() - compute.On("ResumeInstance", mock.Anything, "cid-1").Return(nil).Once() - compute.On("Type").Return("docker").Maybe() - repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { - return i.Status == domain.StatusRunning - })).Return(nil).Once() - auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Once() - - err := svc.ResumeInstance(ctx, instanceID.String()) - require.NoError(t, err) - assert.Equal(t, domain.StatusRunning, inst.Status) - mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc, auditSvc) - }) - - t.Run("ResumeInstance_WrongState", func(t *testing.T) { - inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() - repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() - - err := svc.ResumeInstance(ctx, instanceID.String()) - require.Error(t, err) - assert.Contains(t, err.Error(), "must be PAUSED to resume") - mock.AssertExpectationsForObjects(t, repo, rbacSvc) - }) + type pauseResumeCase struct { + name string + method func(*services.InstanceService) func(context.Context, string) error + setup func(*domain.Instance, *MockInstanceRepo, *MockComputeBackend, *MockRBACService, *MockAuditService) + assert func(*testing.T, error, *domain.Instance) + } - t.Run("ResumeInstance_ComputeError", func(t *testing.T) { - inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"} - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() - repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() - compute.On("ResumeInstance", mock.Anything, "cid-1").Return(fmt.Errorf("resume failed")).Once() - compute.On("Type").Return("docker").Maybe() - repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { - return i.Status == domain.StatusPaused // preserve old status on rollback - })).Return(nil).Once() - auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() + pauseCases := []pauseResumeCase{ + { + name: "PauseInstance_Success", + method: func(svc *services.InstanceService) func(context.Context, string) error { return svc.PauseInstance }, + setup: func(inst *domain.Instance, repo *MockInstanceRepo, compute *MockComputeBackend, rbacSvc *MockRBACService, auditSvc *MockAuditService) { + inst.Status = domain.StatusRunning + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, 
instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("PauseInstance", mock.Anything, "cid-1").Return(nil).Once() + compute.On("Type").Return("docker").Maybe() + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { return i.Status == domain.StatusPaused })).Return(nil).Once() + auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Once() + }, + assert: func(t *testing.T, err error, inst *domain.Instance) { + require.NoError(t, err) + assert.Equal(t, domain.StatusPaused, inst.Status) + }, + }, + { + name: "PauseInstance_WrongState", + method: func(svc *services.InstanceService) func(context.Context, string) error { return svc.PauseInstance }, + setup: func(inst *domain.Instance, repo *MockInstanceRepo, compute *MockComputeBackend, rbacSvc *MockRBACService, auditSvc *MockAuditService) { + inst.Status = domain.StatusPaused + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + }, + assert: func(t *testing.T, err error, inst *domain.Instance) { + require.Error(t, err) + assert.Contains(t, err.Error(), "must be RUNNING to pause") + }, + }, + { + name: "PauseInstance_ComputeError", + method: func(svc *services.InstanceService) func(context.Context, string) error { return svc.PauseInstance }, + setup: func(inst *domain.Instance, repo *MockInstanceRepo, compute *MockComputeBackend, rbacSvc *MockRBACService, auditSvc *MockAuditService) { + inst.Status = domain.StatusRunning + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("PauseInstance", mock.Anything, "cid-1").Return(fmt.Errorf("pause failed")).Once() + compute.On("Type").Return("docker").Maybe() + auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() + }, + assert: func(t *testing.T, err error, inst *domain.Instance) { + require.Error(t, err) + assert.Contains(t, err.Error(), "pause failed") + }, + }, + { + name: "PauseInstance_RepoError_Rollback", + method: func(svc *services.InstanceService) func(context.Context, string) error { return svc.PauseInstance }, + setup: func(inst *domain.Instance, repo *MockInstanceRepo, compute *MockComputeBackend, rbacSvc *MockRBACService, auditSvc *MockAuditService) { + inst.Status = domain.StatusRunning + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, 
userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("PauseInstance", mock.Anything, "cid-1").Return(nil).Once() + compute.On("Type").Return("docker").Maybe() + // First Update (to PAUSED) fails + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { return i.Status == domain.StatusPaused })).Return(fmt.Errorf("db error")).Once() + // Rollback: backend undo first, then repo status restore + compute.On("ResumeInstance", mock.Anything, "cid-1").Return(nil).Once() + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { return i.Status == domain.StatusRunning })).Return(nil).Once() + auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() + }, + assert: func(t *testing.T, err error, inst *domain.Instance) { + require.Error(t, err) + assert.Contains(t, err.Error(), "db error") + assert.Equal(t, domain.StatusRunning, inst.Status) + }, + }, + } - err := svc.ResumeInstance(ctx, instanceID.String()) - require.Error(t, err) - assert.Contains(t, err.Error(), "resume failed") - mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc) - }) + resumeCases := []pauseResumeCase{ + { + name: "ResumeInstance_Success", + method: func(svc *services.InstanceService) func(context.Context, string) error { return svc.ResumeInstance }, + setup: func(inst *domain.Instance, repo *MockInstanceRepo, compute *MockComputeBackend, rbacSvc *MockRBACService, auditSvc *MockAuditService) { + inst.Status = domain.StatusPaused + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("ResumeInstance", mock.Anything, "cid-1").Return(nil).Once() + compute.On("Type").Return("docker").Maybe() + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { return i.Status == domain.StatusRunning })).Return(nil).Once() + auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Once() + }, + assert: func(t *testing.T, err error, inst *domain.Instance) { + require.NoError(t, err) + assert.Equal(t, domain.StatusRunning, inst.Status) + }, + }, + { + name: "ResumeInstance_WrongState", + method: func(svc *services.InstanceService) func(context.Context, string) error { return svc.ResumeInstance }, + setup: func(inst *domain.Instance, repo *MockInstanceRepo, compute *MockComputeBackend, rbacSvc *MockRBACService, auditSvc *MockAuditService) { + inst.Status = domain.StatusRunning + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + }, + assert: func(t *testing.T, err error, inst *domain.Instance) { + require.Error(t, err) + assert.Contains(t, err.Error(), "must be PAUSED to resume") + }, + }, + { + name: 
"ResumeInstance_ComputeError", + method: func(svc *services.InstanceService) func(context.Context, string) error { return svc.ResumeInstance }, + setup: func(inst *domain.Instance, repo *MockInstanceRepo, compute *MockComputeBackend, rbacSvc *MockRBACService, auditSvc *MockAuditService) { + inst.Status = domain.StatusPaused + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("ResumeInstance", mock.Anything, "cid-1").Return(fmt.Errorf("resume failed")).Once() + compute.On("Type").Return("docker").Maybe() + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { return i.Status == domain.StatusPaused })).Return(nil).Once() + auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() + }, + assert: func(t *testing.T, err error, inst *domain.Instance) { + require.Error(t, err) + assert.Contains(t, err.Error(), "resume failed") + }, + }, + { + name: "ResumeInstance_ConflictError", + method: func(svc *services.InstanceService) func(context.Context, string) error { return svc.ResumeInstance }, + setup: func(inst *domain.Instance, repo *MockInstanceRepo, compute *MockComputeBackend, rbacSvc *MockRBACService, auditSvc *MockAuditService) { + inst.Status = domain.StatusPaused + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("ResumeInstance", mock.Anything, "cid-1").Return(svcerrors.ErrInstanceNotResumable).Once() + compute.On("Type").Return("docker").Maybe() + }, + assert: func(t *testing.T, err error, inst *domain.Instance) { + require.Error(t, err) + assert.Contains(t, err.Error(), "instance cannot be resumed") + assert.True(t, svcerrors.Is(err, svcerrors.Conflict)) + }, + }, + { + name: "ResumeInstance_RepoError_Rollback", + method: func(svc *services.InstanceService) func(context.Context, string) error { return svc.ResumeInstance }, + setup: func(inst *domain.Instance, repo *MockInstanceRepo, compute *MockComputeBackend, rbacSvc *MockRBACService, auditSvc *MockAuditService) { + inst.Status = domain.StatusPaused + repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() + repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() + rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() + compute.On("ResumeInstance", mock.Anything, "cid-1").Return(nil).Once() + compute.On("Type").Return("docker").Maybe() + // First Update (to RUNNING) fails + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { return i.Status == domain.StatusRunning 
})).Return(fmt.Errorf("db error")).Once() + // Rollback: backend undo first, then repo status restore + compute.On("PauseInstance", mock.Anything, "cid-1").Return(nil).Once() + repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { return i.Status == domain.StatusPaused })).Return(nil).Once() + auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() + }, + assert: func(t *testing.T, err error, inst *domain.Instance) { + require.Error(t, err) + assert.Contains(t, err.Error(), "db error") + assert.Equal(t, domain.StatusPaused, inst.Status) + }, + }, + } - t.Run("ResumeInstance_RepoError_Rollback", func(t *testing.T) { - inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"} - repo.On("GetByName", mock.Anything, instanceID.String()).Return(nil, fmt.Errorf("not found")).Maybe() - repo.On("GetByID", mock.Anything, instanceID).Return(inst, nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceUpdate, instanceID.String()).Return(nil).Once() - rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() - compute.On("ResumeInstance", mock.Anything, "cid-1").Return(nil).Once() - compute.On("Type").Return("docker").Maybe() - // First Update call (to set RUNNING) fails - triggers rollback - repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { - return i.Status == domain.StatusRunning - })).Return(fmt.Errorf("db error")).Once() - // Rollback: restore status to PAUSED via repo Update - repo.On("Update", mock.Anything, mock.MatchedBy(func(i *domain.Instance) bool { - return i.Status == domain.StatusPaused - })).Return(nil).Once() - // Also undo the backend resume - compute.On("PauseInstance", mock.Anything, "cid-1").Return(nil).Once() - auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() + for _, tc := range pauseCases { + t.Run(tc.name, func(t *testing.T) { + repo := new(MockInstanceRepo) + compute := new(MockComputeBackend) + rbacSvc := new(MockRBACService) + auditSvc := new(MockAuditService) + svc := services.NewInstanceService(services.InstanceServiceParams{ + Repo: repo, + Compute: compute, + RBAC: rbacSvc, + AuditSvc: auditSvc, + Logger: slog.Default(), + }) + inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: tenantID, Status: domain.StatusRunning, ContainerID: "cid-1"} + tc.setup(inst, repo, compute, rbacSvc, auditSvc) + err := tc.method(svc)(ctx, instanceID.String()) + tc.assert(t, err, inst) + mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc, auditSvc) + }) + } - err := svc.ResumeInstance(ctx, instanceID.String()) - require.Error(t, err) - assert.Contains(t, err.Error(), "db error") - assert.Equal(t, domain.StatusPaused, inst.Status) // status rolled back via repo Update - mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc) - }) + for _, tc := range resumeCases { + t.Run(tc.name, func(t *testing.T) { + repo := new(MockInstanceRepo) + compute := new(MockComputeBackend) + rbacSvc := new(MockRBACService) + auditSvc := new(MockAuditService) + svc := services.NewInstanceService(services.InstanceServiceParams{ + Repo: repo, + Compute: compute, + RBAC: rbacSvc, + AuditSvc: auditSvc, + Logger: slog.Default(), + }) + inst := &domain.Instance{ID: instanceID, UserID: userID, TenantID: 
tenantID, Status: domain.StatusPaused, ContainerID: "cid-1"} + tc.setup(inst, repo, compute, rbacSvc, auditSvc) + err := tc.method(svc)(ctx, instanceID.String()) + tc.assert(t, err, inst) + mock.AssertExpectationsForObjects(t, repo, compute, rbacSvc, auditSvc) + }) + } } func testInstanceServiceUnitRbacErrors(t *testing.T) { diff --git a/internal/platform/bulkhead.go b/internal/platform/bulkhead.go index a182b8bc4..aff07d962 100644 --- a/internal/platform/bulkhead.go +++ b/internal/platform/bulkhead.go @@ -32,6 +32,9 @@ func NewBulkhead(opts BulkheadOpts) *Bulkhead { if opts.MaxConc <= 0 { opts.MaxConc = 10 } + if opts.WaitTimeout == 0 { + opts.WaitTimeout = 5 * time.Second + } return &Bulkhead{ name: opts.Name, sem: make(chan struct{}, opts.MaxConc), From a20a805c2bc1324f176ed3da37d56b6d92309969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Thu, 30 Apr 2026 12:30:07 +0300 Subject: [PATCH 23/29] revert: remove compute-roadmap.md from branch --- compute-roadmap.md | 522 --------------------------------------------- 1 file changed, 522 deletions(-) delete mode 100644 compute-roadmap.md diff --git a/compute-roadmap.md b/compute-roadmap.md deleted file mode 100644 index f96ae72e6..000000000 --- a/compute-roadmap.md +++ /dev/null @@ -1,522 +0,0 @@ -# Compute Service Roadmap — Updated - -> Audited: 2026-04-21 | Status reflects actual codebase inspection - -## Status Summary - -| PR | Title | Implementation Status | -|----|-------|----------------------| -| PR 1 | Image Management | ✅ COMPLETED | -| PR 2 | Instance Resize/Scale | ✅ COMPLETED | -| PR 3 | Instance Pause/Resume | ✅ COMPLETED | -| PR 4 | Provision Retry & Error Handling | ❌ NOT IMPLEMENTED | -| PR 5 | Instance Tags | ❌ NOT IMPLEMENTED | -| PR 6 | WebSocket Progress Events | ⚠️ PARTIALLY IMPLEMENTED | -| PR 7 | Instance Snapshots | ❌ NOT IMPLEMENTED | -| PR 8 | Instance Live Migration | ❌ NOT IMPLEMENTED | - ---- - -## PR 1: Image Management — Import from URL - -**Status: ✅ COMPLETED** - -All features implemented: -- `domain.Image` with `SourceURL` field -- `ports.ImageService` interface with all 6 methods including `ImportImage` -- `ImageService` implementation with `RegisterImage`, `UploadImage`, `GetImage`, `ListImages`, `DeleteImage`, `ImportImage` -- `ImageHandler` with all 6 endpoints including `POST /api/v1/images/import` -- `ImageRepository` postgres implementation -- RBAC permissions: `PermissionImageCreate`, `PermissionImageRead`, `PermissionImageDelete`, `PermissionImageReadAll` -- Router wired at `internal/api/setup/router.go:300-304` -- Unit tests passing (`TestImageService_Unit`) -- URL validation (http/https only) to resolve CodeQL go/request-forgery -- Status transitions: PENDING → ACTIVE (or ERROR on failure) -- Format auto-detection from URL extension (.qcow2, .img, .raw, .iso) -- 30-minute HTTP timeout for large downloads -- Returns 202 Accepted on import start - -### What exists -- `domain.Image` + `ImageStatus` enum (`internal/core/domain/image.go:10-40`) -- `ports.ImageService` interface with 5 methods (`internal/core/ports/image.go:12-24`) -- `ImageService` implementation (`internal/core/services/image.go:36-192`) -- `ImageHandler` with 5 endpoints (`internal/handlers/image_handler.go:1-162`) -- `ImageRepository` postgres implementation (`internal/repositories/postgres/image_repo.go`) -- RBAC permissions: `PermissionImageCreate`, `PermissionImageRead`, `PermissionImageDelete`, `PermissionImageReadAll` 
(`internal/core/domain/rbac.go:143-147`) -- Router wired at `internal/api/setup/router.go:300-304` -- Unit tests passing (`TestImageService_Unit`) - -### What needs to be built -**`ImportImage`** — Pull an image from an external URL (e.g., cloud-images.ubuntu.com) - -#### Implementation plan - -**1. `ports.ImageService`** — add method signature -```go -ImportImage(ctx context.Context, name, url, description, os, version string, isPublic bool) (*domain.Image, error) -``` - -**2. `domain.Image`** — add `SourceURL` field to track import origin -```go -SourceURL string `json:"source_url,omitempty"` -``` - -**3. `internal/core/services/image.go`** — implement `ImportImage` -- Download from URL using `httputil` or stdlib `http.Get` -- Stream to `FileStore.Write` (same as `UploadImage`) -- Support formats: qcow2, img, raw, iso -- Timeout: 30 minutes for large downloads -- Status transitions: `ImageStatusPending` → `ImageStatusActive` on success; `ImageStatusError` on failure - -**4. `internal/handlers/image_handler.go`** — add handler -```go -// POST /api/v1/images/import -ImportImage(c *gin.Context) -``` -- Body: `{ "name": "...", "url": "...", "description": "...", "os": "...", "version": "...", "is_public": false }` -- Streams download and upload in one step (no intermediate file storage) -- Returns 202 Accepted (long-running operation) - -**5. `internal/api/setup/router.go`** — wire endpoint -```go -imageGroup.POST("/import", httputil.Permission(svcs.RBAC, domain.PermissionImageCreate), handlers.Image.ImportImage) -``` - -**6. Tests** -- Unit: `image_unit_test.go` — add `ImportImage` subtests -- E2E: `tests/compute_e2e_test.go` if applicable - -### Files -| Action | File | -|--------|------| -| Modify | `internal/core/domain/image.go` (add SourceURL field) | -| Modify | `internal/core/ports/image.go` (add ImportImage method) | -| Modify | `internal/core/services/image.go` (implement ImportImage) | -| Modify | `internal/handlers/image_handler.go` (add ImportImage handler) | -| Modify | `internal/api/setup/router.go` (wire /import endpoint) | -| Modify | `internal/core/services/image_unit_test.go` (add test cases) | - ---- - -## PR 2: Instance Resize/Scale - -**Status: ✅ COMPLETED** (PR #175 merged 2026-04-26) - -### What was built - -- `POST /api/v1/instances/:id/resize` — changes instance type (CPU/memory) -- `PermissionInstanceResize` RBAC permission (`internal/core/domain/rbac.go`) -- `ResizeInstance` on `ComputeBackend` and `InstanceService` interfaces -- **Docker backend**: live resize via `docker ContainerUpdate` (NanoCPUs + memory bytes) -- **Libvirt backend**: cold resize (stop → update XML → start) with full rollback on failure -- Quota enforcement: `prepareResize()` checks/increments quota *before* backend resize (fail-fast for upsize); `completeResize()` releases quota on downsize -- Memory unit: quota APIs use GB internally (`deltaMemMB/1024`), not MB -- DomainCreate rollback: if `DomainCreate(newDom)` fails, undefines new DOM, redefines original XML, restarts original domain -- ADR-025: `docs/adr/ADR-025-instance-resize.md` - -### Files -| File | Change | -|------|--------| -| `internal/core/ports/instance.go` | `ResizeInstance` method added | -| `internal/core/ports/compute.go` | `ResizeInstance` added to `ComputeBackend` | -| `internal/core/domain/rbac.go` | `PermissionInstanceResize` added | -| `internal/core/services/instance.go` | Full implementation with quota enforcement | -| `internal/handlers/instance_handler.go` | `ResizeInstance` HTTP handler + 429 swagger | -| 
`internal/repositories/docker/adapter.go` | `ResizeInstance` via `ContainerUpdate` | -| `internal/repositories/libvirt/adapter.go` | Cold resize with XML regex patching + rollback | -| `internal/platform/resilient_compute.go` | `ResizeInstance` passthrough with circuit breaker | -| `docs/adr/ADR-025-instance-resize.md` | Architecture decision record | - -### Tests -- Unit (`instance_unit_test.go`): success, downsize, quota exceeded, invalid type, DB failure, rollback -- Unit (`libvirt/adapter_test.go`): cold resize path, DomainCreate rollback -- Unit (`resilient_compute_test.go`): `TestResilientComputeResizeInstance` -- Handler (`instance_handler_test.go`): validation, error mapping -- E2E (`compute_e2e_test.go`): upsize, downsize, invalid type - ---- - -## PR 3: Instance Pause/Resume - -**Status: ✅ COMPLETED** - -### What was built - -- `StatusPaused` in `InstanceStatus` enum (`internal/core/domain/instance.go`) -- `POST /api/v1/instances/:id/pause` and `POST /api/v1/instances/:id/resume` endpoints -- `PauseInstance` and `ResumeInstance` on `ComputeBackend` and `InstanceService` interfaces -- **Docker backend**: `ContainerPause` and `ContainerUnpause` -- **Libvirt backend**: `DomainSuspend` and `DomainResume` with state verification -- **Firecracker/Noop backends**: stub implementations -- State validation: RUNNING→PAUSED, PAUSED→RUNNING transitions enforced at service layer -- Audit events: `instance.pause` and `instance.resume` logged -- Resume rollback: if backend resume fails, instance status rolls back to RUNNING -- Resilient compute wrapper: circuit breaker + bulkhead protection -- Sentinel errors: `ErrInstanceNotPausable` and `ErrInstanceNotResumable` -- Unit tests: service layer (6 tests), handler layer (6 tests), platform layer (11 tests) - -### Files -| Action | File | -|--------|------| -| Modify | `internal/core/domain/instance.go` — add StatusPaused | -| Modify | `internal/core/ports/compute.go` — add PauseInstance/ResumeInstance | -| Modify | `internal/core/ports/instance.go` — add interface methods | -| Modify | `internal/core/services/instance.go` — implement methods with rollback | -| Modify | `internal/handlers/instance_handler.go` — add pause/resume handlers | -| Modify | `internal/repositories/docker/adapter.go` — implement ContainerPause/Unpause | -| Modify | `internal/repositories/libvirt/adapter.go` — implement DomainSuspend/Resume | -| Modify | `internal/repositories/firecracker/adapter.go` — stub implementations | -| Modify | `internal/repositories/noop/adapters.go` — stub implementations | -| Modify | `internal/api/setup/router.go` — wire endpoints | -| Modify | `internal/errors/errors.go` — add ErrInstanceNotPausable/ErrInstanceNotResumable | -| Modify | `internal/platform/resilient_compute.go` — add pause/resume passthrough | - -### Tests -- Unit (`instance_unit_test.go`): success, wrong state, compute error, rollback on failure -- Unit (`resilient_compute_test.go`): circuit breaker and bulkhead protection -- Handler (`instance_handler_test.go`): success, not found, conflict cases -- Worker mocks: PauseInstance/ResumeInstance added to all worker test mocks - ---- - -## PR 4: Instance Provision Retry & Improved Error Handling - -**Status: ❌ NOT IMPLEMENTED** - -### What exists -- `Provision()` method in `InstanceService` (`internal/core/services/instance.go:310`) -- `ProvisionJob` domain type (`internal/core/domain/instance.go`) - -### What needs to be built - -**Goal**: Make provisioning more reliable with structured retries and better error reporting. 
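
As a rough illustration of the shape this retry could take (a sketch only, not the final design — `provisionOnce` and `isTransient` are hypothetical helpers, and the fields, backoff schedule, and metrics are the ones proposed in the numbered steps below):

```go
// Sketch only: shows the structured-retry shape described in this PR.
// provisionOnce and isTransient are hypothetical helpers; ProvisionAttempts
// and StatusMessage are the fields proposed in step 1 below.
func (s *InstanceService) provisionWithRetry(ctx context.Context, inst *domain.Instance) error {
	const maxAttempts = 3
	var lastErr error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		inst.ProvisionAttempts = attempt
		if lastErr = s.provisionOnce(ctx, inst); lastErr == nil {
			return nil
		}
		s.logger.Info("provision attempt failed",
			"instance_id", inst.ID, "attempt", attempt, "error", lastErr)
		if !isTransient(lastErr) || attempt == maxAttempts {
			break // fatal error, or retries exhausted
		}
		// exponential backoff between attempts (2s, 4s, ...)
		select {
		case <-time.After(time.Duration(1<<attempt) * time.Second):
		case <-ctx.Done():
			return ctx.Err()
		}
	}
	// final failure: surface a human-readable message on the instance
	inst.Status = domain.StatusError
	inst.StatusMessage = lastErr.Error()
	return lastErr
}
```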
- -#### 1. `internal/core/domain/instance.go` — add fields -```go -ProvisionAttempts int `json:"provision_attempts"` -StatusMessage string `json:"status_message,omitempty"` // Human-readable error/info -``` - -#### 2. `internal/core/services/instance.go` — modify `Provision()` -- Track attempt count with exponential backoff -- Retry on transient errors: Docker API timeout, network errors, storage I/O errors -- Max 3 attempts -- Sleep between retries: 2s, 4s, 8s (exponential backoff) -- On final failure: set `StatusError` with `StatusMessage` describing failure -- Log each attempt: `s.logger.Info("provision attempt N", "instance_id", ...)` -- Increment metric: `instance_provision_retries_total` - -#### 3. `internal/platform/metrics.go` — add metrics -```go -InstanceProvisionRetriesTotal prometheus.Counter -``` - -#### 4. `internal/workers/pipeline_worker.go` — ensure retry loop -Check if provision job handling already retries or just calls `Provision()` once. May need to modify the worker loop to respect attempt count. - -#### 5. Tests -- Unit: verify retry on transient error, no retry on fatal error -- Verify attempt count increments - -### Files -| Action | File | -|--------|------| -| Modify | `internal/core/domain/instance.go` — add ProvisionAttempts, StatusMessage | -| Modify | `internal/core/services/instance.go` — add retry logic to Provision() | -| Modify | `internal/platform/metrics.go` — add provision retry counter | -| Modify | `internal/workers/pipeline_worker.go` — ensure retry behavior | - ---- - -## PR 5: Instance Tags & Resource Grouping - -**Status: ❌ NOT IMPLEMENTED** - -Note: `Instance.Labels` (map[string]string) already exists at `domain/instance.go:81` but provides key-value labels, not a simple tag list. - -### What needs to be built - -**Goal**: Add simple `Tags []string` for organizing/filtering instances. - -#### 1. `internal/core/domain/instance.go` — add `Tags` field -```go -Tags []string `json:"tags,omitempty"` -``` - -#### 2. `internal/repositories/postgres/instance_repo.go` — update `List` -Add optional tag-based filtering to the List query. Implementation options: -- Store tags as JSON array in a `tags` column (PostgreSQL JSONB) -- Or store in separate `instance_tags` join table - -#### 3. `internal/core/ports/instance.go` — update interface -```go -ListInstances(ctx context.Context, tags []string) ([]*domain.Instance, error) -// Or add filtering to existing method via query params -``` - -#### 4. `internal/core/services/instance.go` — update `ListInstances` -Pass tags through to repo. Update `LaunchInstance` to accept tags in `LaunchParams`. - -#### 5. `internal/handlers/instance_handler.go` — update `List` -```go -// GET /api/v1/instances?tag=team:backend&tag=env:prod -func (h *InstanceHandler) List(c *gin.Context) { - tags := c.QueryArray("tag") - // pass to service -} -``` - -#### 6. CLI support (`cmd/cloud/main.go`) -Add `--tag` flag to `instance list` command. - -#### 7. 
Tests -- Tag filtering in List -- Tags on Launch - -### Files -| Action | File | -|--------|------| -| Modify | `internal/core/domain/instance.go` — add Tags field | -| Modify | `internal/repositories/postgres/instance_repo.go` — add tag filtering | -| Modify | `internal/core/ports/instance.go` — update List interface | -| Modify | `internal/core/services/instance.go` — implement tag list in LaunchParams + ListInstances | -| Modify | `internal/handlers/instance_handler.go` — parse ?tag= query params | -| Modify | `cmd/cloud/main.go` — add --tag flag to instance list/create | - ---- - -## PR 6: WebSocket Progress Events - -**Status: ⚠️ PARTIALLY IMPLEMENTED** - -Infrastructure partially exists: `ws_event.go` has event types, `ws/` has Hub+broadcast. - -### What exists -- `WSEventInstanceCreated/Started/Stopped/Terminated` event types (`internal/core/domain/ws_event.go:15-22`) -- `Hub` struct for broadcasting (`internal/handlers/ws/hub.go`) -- `INSTANCE_LAUNCH` event recorded in `finalizeProvision` (`instance.go:432`) - -### What needs to be built - -**Goal**: Real-time progress (0-100%) during instance provisioning lifecycle. - -#### 1. `internal/core/domain/ws_event.go` — add new event types -```go -WSEventInstanceProvisioning = "instance.provisioning" -WSEventInstanceError = "instance.error" -// Add Progress field to WSEvent struct (0-100) -``` - -#### 2. `internal/core/services/instance.go` — emit progress events - -In `Provision()` method (called by worker): -```go -// At start of provision -s.eventSvc.RecordEvent(ctx, "INSTANCE_PROVISIONING", inst.ID.String(), "INSTANCE", - map[string]interface{}{"progress": 0, "message": "Starting provision..."}) - -// After network setup (25%) -// After volume resolution (50%) -// After container launch (75%) -// After finalize (90%) -// After finalize complete (100%) -``` - -Or emit via WebSocket hub directly (check if hub is accessible from service). - -#### 3. `internal/handlers/ws/handler.go` — add instance-specific subscription -```go -// GET /ws/instances/:id/events -// Subscribes to events for a specific instance -``` - -#### 4. WebSocket event flow -``` -LaunchInstance → enqueue job → return 202 -Worker picks up job → calls Provision() - → emits progress events (0%, 25%, 50%, 75%, 100%) - → on error: WSEventInstanceError -Client connects to /ws/instances/{id}/events and receives real-time updates -``` - -### Files -| Action | File | -|--------|------| -| Modify | `internal/core/domain/ws_event.go` — add provisioning event type + progress field | -| Modify | `internal/core/services/instance.go` — emit events in Provision() and finalizeProvision() | -| Modify | `internal/handlers/ws/handler.go` — add instance-filtered subscription | -| Modify | `internal/api/setup/router.go` — wire /ws/instances/:id/events | - ---- - -## PR 7: Instance Snapshots - -**Status: ❌ NOT IMPLEMENTED** - -Important: **Volume snapshots are fully implemented** (`SnapshotService`, `SnapshotRepository`). This PR is for **instance snapshots** (full VM/container state including memory). - -### What needs to be built - -**Goal**: Create, list, delete, and restore full instance snapshots (memory + disk). - -#### 1. 
`internal/core/domain/instance_snapshot.go` — new domain model -```go -type InstanceSnapshot struct { - ID uuid.UUID - Name string - InstanceID uuid.UUID - Status SnapshotStatus // PENDING, ACTIVE, ERROR, DELETING - Description string - SizeBytes int64 - Format string // "docker-image", "qcow2", "raw" - FilePath string // Path to stored snapshot - CreatedAt time.Time -} -``` - -#### 2. `internal/core/ports/instance_snapshot.go` — new port interface -```go -type InstanceSnapshotService interface { - CreateSnapshot(ctx context.Context, instanceID uuid.UUID, name, description string) (*InstanceSnapshot, error) - ListSnapshots(ctx context.Context, instanceID uuid.UUID) ([]*InstanceSnapshot, error) - DeleteSnapshot(ctx context.Context, snapshotID uuid.UUID) error - RestoreSnapshot(ctx context.Context, snapshotID uuid.UUID) error // stop → restore → start -} -``` - -#### 3. `internal/core/services/instance_snapshot.go` — new service -**`CreateSnapshot`**: -- For **Docker**: `docker commit `, then `docker save` to tar/qcow2 -- For **Libvirt**: `virDomainSnapshotCreateXML` with flag `VIR_DOMAIN_SNAPSHOT_CREATE_ALL` -- Store snapshot file in `thecloud-data/snapshots/` -- Create `InstanceSnapshot` record in new repository - -**`RestoreSnapshot`**: -- Get snapshot and validate instance is stopped -- Stop instance if running -- For Docker: `docker load` then recreate container -- For Libvirt: revert to snapshot via `virDomainRevertToSnapshot` -- Start instance -- Emit events: `instance.restored` - -#### 4. `internal/repositories/postgres/instance_snapshot_repo.go` — new repository - -#### 5. `internal/handlers/instance_snapshot_handler.go` — new handler -```go -// POST /api/v1/instances/:id/snapshots -CreateSnapshot(c *gin.Context) -// GET /api/v1/instances/:id/snapshots -ListSnapshots(c *gin.Context) -// DELETE /api/v1/snapshots/:id -DeleteSnapshot(c *gin.Context) -// POST /api/v1/snapshots/:id/restore -RestoreSnapshot(c *gin.Context) -``` - -#### 6. RBAC permissions -Add `PermissionInstanceSnapshotCreate`, `PermissionInstanceSnapshotDelete`, `PermissionInstanceSnapshotRestore` in `domain/rbac.go`. - -### Files -| Action | File | -|--------|------| -| New | `internal/core/domain/instance_snapshot.go` | -| New | `internal/core/ports/instance_snapshot.go` | -| New | `internal/core/services/instance_snapshot.go` | -| New | `internal/handlers/instance_snapshot_handler.go` | -| New | `internal/repositories/postgres/instance_snapshot_repo.go` | -| Modify | `internal/core/domain/rbac.go` — add snapshot permissions | -| Modify | `internal/api/setup/router.go` — wire snapshot endpoints | -| Modify | `internal/repositories/docker/adapter.go` — implement docker commit/save | -| Modify | `internal/repositories/libvirt/adapter.go` — implement snapshot APIs | - ---- - -## PR 8: Instance Live Migration (Libvirt) - -**Status: ❌ NOT IMPLEMENTED** - -### What needs to be built - -**Goal**: Live migrate a Libvirt VM from one host to another. - -#### 1. `internal/core/domain/rbac.go` — add permission -```go -PermissionInstanceMigrate Permission = "instance:migrate" -``` - -#### 2. `internal/core/ports/instance.go` — add method -```go -MigrateInstance(ctx context.Context, idOrName, targetHost string) error -``` - -#### 3. 
`internal/core/services/instance.go` — implement `MigrateInstance` -- RBAC check: `PermissionInstanceMigrate` -- Validate instance is `StatusRunning` -- Pre-migration validation: - - Check target host is reachable (ping or libvirt connection test) - - Check target has sufficient vCPU/memory capacity - - Check target host is compatible (same storage pool) -- Call `compute.MigrateInstance` on Libvirt backend - -#### 4. `internal/core/ports/compute.go` — add to interface -```go -MigrateInstance(ctx context.Context, id, targetHost string) error -``` - -#### 5. `internal/repositories/libvirt/adapter.go` — implement -```go -func (a *Adapter) MigrateInstance(ctx context.Context, id, targetHost string) error { - // virDomainMigrate3 with VIR_MIGRATE_LIVE flag - // Handle VIR_MIGRATE_ABORT_ON_ERROR for reliability - // Fallback to cold migration if live fails -} -``` - -#### 6. `internal/handlers/instance_handler.go` — add endpoint -```go -// POST /api/v1/instances/:id/migrate -// Body: { "target_host": "libvirt-host-2" } -MigrateInstance(c *gin.Context) -``` - -### Files -| Action | File | -|--------|------| -| Modify | `internal/core/domain/rbac.go` — add PermissionInstanceMigrate | -| Modify | `internal/core/ports/instance.go` — add MigrateInstance method | -| Modify | `internal/core/ports/compute.go` — add MigrateInstance to interface | -| Modify | `internal/core/services/instance.go` — implement MigrateInstance | -| Modify | `internal/handlers/instance_handler.go` — add migrate handler | -| Modify | `internal/repositories/libvirt/adapter.go` — implement virDomainMigrate3 | -| Modify | `internal/api/setup/router.go` — wire /migrate endpoint | - ---- - -## Completed Features Not In Original Roadmap - -These exist in the codebase but weren't in the original roadmap: - -| Feature | File | Notes | -|---------|------|-------| -| `Instance.Labels` (map) | `internal/core/domain/instance.go:81` | Key-value labels, not simple tags | -| `Instance.Metadata` (map) | `internal/core/domain/instance.go:80` | Arbitrary key-value metadata | -| `UpdateInstanceMetadata` | `services/instance.go:1182` | Patch metadata/labels on instance | -| `Exec` | `services/instance.go:1148` | Run command inside instance | -| Volume Snapshots | `services/snapshot.go`, `repos/postgres/snapshot_repo.go` | Fully implemented volume snapshots | -| Image RBAC perms | `domain/rbac.go:143` | All 4 image permissions exist | -| WebSocket infrastructure | `handlers/ws/` | Hub, broadcast, event types exist | - ---- - -## Implementation Order Recommendation - -| Order | PR | Reason | -|-------|----|--------| -| 1 | PR 1 (Import) | Small addition, completes existing work | -| 2 | PR 5 (Tags) | Small, low risk — adds filtering | -| 3 | PR 3 (Pause/Resume) | Low effort, adds useful state | -| 4 | PR 4 (Retry) | Low effort, improves reliability | -| 5 | PR 2 (Resize) | Medium effort, high utility | -| 6 | PR 6 (WebSocket) | Medium, depends on PR 4 | -| 7 | PR 7 (Snapshots) | High effort, complex | -| 8 | PR 8 (Migration) | Very high effort, Libvirt-only | From e3817de817f93b647ef2ba9bd212f4c59e5b2741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Thu, 30 Apr 2026 13:02:16 +0300 Subject: [PATCH 24/29] fix: address PR #320 review findings - docker adapter: fix indentation regression in ContainerUpdate interface method - libvirt adapter: improve error messages with human-readable state names instead of raw numeric values (e.g., "domain is PAUSED" not "domain is 3") 
--- internal/repositories/docker/adapter.go | 4 ++-- internal/repositories/libvirt/adapter.go | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/internal/repositories/docker/adapter.go b/internal/repositories/docker/adapter.go index f7c8d29e8..c6ffa9ab9 100644 --- a/internal/repositories/docker/adapter.go +++ b/internal/repositories/docker/adapter.go @@ -86,8 +86,8 @@ type dockerClient interface { ContainerExecStart(ctx context.Context, execID string, config container.ExecStartOptions) error ContainerExecAttach(ctx context.Context, execID string, config container.ExecStartOptions) (types.HijackedResponse, error) ContainerExecInspect(ctx context.Context, execID string) (container.ExecInspect, error) - ContainerRename(ctx context.Context, containerID string, newName string) error -ContainerUpdate(ctx context.Context, containerID string, updateConfig container.UpdateConfig) (container.UpdateResponse, error) + ContainerRename(ctx context.Context, containerID string, newName string) error + ContainerUpdate(ctx context.Context, containerID string, updateConfig container.UpdateConfig) (container.UpdateResponse, error) } // NewDockerAdapter constructs a DockerAdapter with a Docker client. diff --git a/internal/repositories/libvirt/adapter.go b/internal/repositories/libvirt/adapter.go index a3025f9e1..a320efc35 100644 --- a/internal/repositories/libvirt/adapter.go +++ b/internal/repositories/libvirt/adapter.go @@ -48,6 +48,20 @@ const ( memStatTagRSS = 6 ) +// domainStateName returns a human-readable name for a libvirt domain state. +func domainStateName(state int32) string { + switch state { + case domainStateRunning: + return "RUNNING" + case domainStatePaused: + return "PAUSED" + case domainStateShutoff: + return "SHUTOFF" + default: + return fmt.Sprintf("UNKNOWN(%d)", state) + } +} + // LibvirtAdapter implements compute backend operations using libvirt/KVM. 
type LibvirtAdapter struct { client LibvirtClient @@ -193,7 +207,7 @@ func (a *LibvirtAdapter) PauseInstance(ctx context.Context, id string) error { return fmt.Errorf("failed to get domain state: %w", err) } if state != domainStateRunning { - return fmt.Errorf("%w: domain is %d, must be RUNNING (1)", apierrors.ErrInstanceNotPausable, state) + return fmt.Errorf("%w: domain is %s, must be RUNNING", apierrors.ErrInstanceNotPausable, domainStateName(state)) } if err := a.client.DomainSuspend(ctx, dom); err != nil { @@ -215,7 +229,7 @@ func (a *LibvirtAdapter) ResumeInstance(ctx context.Context, id string) error { return fmt.Errorf("failed to get domain state: %w", err) } if state != domainStatePaused { - return fmt.Errorf("%w: domain is %d, must be PAUSED (3)", apierrors.ErrInstanceNotResumable, state) + return fmt.Errorf("%w: domain is %s, must be PAUSED", apierrors.ErrInstanceNotResumable, domainStateName(state)) } if err := a.client.DomainResume(ctx, dom); err != nil { From bd1bb2337e817acc820ba4a7942882253902896e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Thu, 30 Apr 2026 13:13:53 +0300 Subject: [PATCH 25/29] fix(tests): add t.Helper() to assert closures in pause/resume test cases Fixes lint failure: internal/core/services/instance_unit_test.go:1012:12: test helper function should start from t.Helper() (thelper) --- internal/core/services/instance_unit_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/core/services/instance_unit_test.go b/internal/core/services/instance_unit_test.go index 6eadcbdbb..60e410eb3 100644 --- a/internal/core/services/instance_unit_test.go +++ b/internal/core/services/instance_unit_test.go @@ -1010,6 +1010,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Once() }, assert: func(t *testing.T, err error, inst *domain.Instance) { + t.Helper() require.NoError(t, err) assert.Equal(t, domain.StatusPaused, inst.Status) }, @@ -1025,6 +1026,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() }, assert: func(t *testing.T, err error, inst *domain.Instance) { + t.Helper() require.Error(t, err) assert.Contains(t, err.Error(), "must be RUNNING to pause") }, @@ -1043,6 +1045,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() }, assert: func(t *testing.T, err error, inst *domain.Instance) { + t.Helper() require.Error(t, err) assert.Contains(t, err.Error(), "pause failed") }, From ff89197e4298810a9d87560669fbf231ccd9291c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Thu, 30 Apr 2026 13:35:07 +0300 Subject: [PATCH 26/29] fix(tests): add t.Helper() to all pause/resume assert closures Fixes lint failure across all 9 assert closures in the pause/resume table-driven test cases (lines 1012, 1028, 1047, 1071, 1095, 1111, 1131, 1148, 1172). 
--- internal/core/services/instance_unit_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/core/services/instance_unit_test.go b/internal/core/services/instance_unit_test.go index 60e410eb3..45c92f187 100644 --- a/internal/core/services/instance_unit_test.go +++ b/internal/core/services/instance_unit_test.go @@ -1069,6 +1069,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { auditSvc.On("Log", mock.Anything, userID, "instance.pause", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() }, assert: func(t *testing.T, err error, inst *domain.Instance) { + t.Helper() require.Error(t, err) assert.Contains(t, err.Error(), "db error") assert.Equal(t, domain.StatusRunning, inst.Status) @@ -1092,6 +1093,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Once() }, assert: func(t *testing.T, err error, inst *domain.Instance) { + t.Helper() require.NoError(t, err) assert.Equal(t, domain.StatusRunning, inst.Status) }, @@ -1107,6 +1109,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { rbacSvc.On("Authorize", mock.Anything, userID, tenantID, domain.PermissionInstanceRead, instanceID.String()).Return(nil).Once() }, assert: func(t *testing.T, err error, inst *domain.Instance) { + t.Helper() require.Error(t, err) assert.Contains(t, err.Error(), "must be PAUSED to resume") }, @@ -1126,6 +1129,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() }, assert: func(t *testing.T, err error, inst *domain.Instance) { + t.Helper() require.Error(t, err) assert.Contains(t, err.Error(), "resume failed") }, @@ -1143,6 +1147,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { compute.On("Type").Return("docker").Maybe() }, assert: func(t *testing.T, err error, inst *domain.Instance) { + t.Helper() require.Error(t, err) assert.Contains(t, err.Error(), "instance cannot be resumed") assert.True(t, svcerrors.Is(err, svcerrors.Conflict)) @@ -1167,6 +1172,7 @@ func testInstanceServicePauseResumeUnit(t *testing.T) { auditSvc.On("Log", mock.Anything, userID, "instance.resume", "instance", instanceID.String(), mock.Anything).Return(nil).Maybe() }, assert: func(t *testing.T, err error, inst *domain.Instance) { + t.Helper() require.Error(t, err) assert.Contains(t, err.Error(), "db error") assert.Equal(t, domain.StatusPaused, inst.Status) From 7a94f6ccffcd3cba7f8be8470daf7ee97f08e3fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Thu, 30 Apr 2026 13:46:56 +0300 Subject: [PATCH 27/29] ci: trigger fresh build From 9922509cb66e7d86a000988270a8edfb88ca02f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Thu, 30 Apr 2026 14:08:10 +0300 Subject: [PATCH 28/29] fix(tests): add PauseInstance and ResumeInstance to testComputeBackend After merging main, testComputeBackend was missing the new interface methods. 
--- internal/core/services/function_internal_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/core/services/function_internal_test.go b/internal/core/services/function_internal_test.go index d42b3f440..2692c8b6e 100644 --- a/internal/core/services/function_internal_test.go +++ b/internal/core/services/function_internal_test.go @@ -62,6 +62,8 @@ func (t *testComputeBackend) ResizeInstance(ctx context.Context, id string, cpu, func (t *testComputeBackend) CreateSnapshot(ctx context.Context, id, name string) error { return nil } func (t *testComputeBackend) RestoreSnapshot(ctx context.Context, id, name string) error { return nil } func (t *testComputeBackend) DeleteSnapshot(ctx context.Context, id, name string) error { return nil } +func (t *testComputeBackend) PauseInstance(ctx context.Context, id string) error { return nil } +func (t *testComputeBackend) ResumeInstance(ctx context.Context, id string) error { return nil } // compile-time check that testComputeBackend satisfies ports.ComputeBackend var _ ports.ComputeBackend = (*testComputeBackend)(nil) From a07312488bb0ec2801bc05f12350ff3721ac0f38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:17:52 +0300 Subject: [PATCH 29/29] fix(services): clarify resume failure log message The previous log said "failed to resume container, rolling back" which was misleading since no backend rollback occurs when ResumeInstance fails - the container stays PAUSED. Updated to "failed to resume container, instance left in PAUSED state". --- internal/core/services/instance.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/core/services/instance.go b/internal/core/services/instance.go index 0743234b1..222aafe23 100644 --- a/internal/core/services/instance.go +++ b/internal/core/services/instance.go @@ -717,12 +717,12 @@ func (s *InstanceService) ResumeInstance(ctx context.Context, idOrName string) e "container_id", target, "instance_id", inst.ID, "error", err) return errors.New(errors.Conflict, err.Error()) } - s.logger.Error("failed to resume container, rolling back", - "container_id", target, "instance_id", inst.ID, "old_status", oldStatus, "error", err) + s.logger.Error("failed to resume container, instance left in PAUSED state", + "container_id", target, "instance_id", inst.ID, "error", err) inst.Status = oldStatus if repoErr := s.repo.Update(ctx, inst); repoErr != nil { - s.logger.Error("failed to rollback instance status", - "instance_id", inst.ID, "resume_error", err, "rollback_error", repoErr) + s.logger.Error("failed to persist instance status after resume failure", + "instance_id", inst.ID, "resume_error", err, "persist_error", repoErr) } return errors.Wrap(errors.Internal, "failed to resume container", err) }