Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cocoonset/agents.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ func (r *Reconciler) ensureSubAgents(ctx context.Context, cs *cocoonv1.CocoonSet
changed := false
var requeueAfter time.Duration

restorable, err := r.podsRestorableByCR(ctx, cs.Namespace)
if err != nil {
return changed, requeueAfter, err
}

g, gctx := errgroup.WithContext(ctx)
g.SetLimit(subAgentCreateConcurrency)
var created atomic.Bool
Expand All @@ -59,6 +64,10 @@ func (r *Reconciler) ensureSubAgents(ctx context.Context, cs *cocoonv1.CocoonSet
if err != nil {
return fmt.Errorf("build sub-agent slot %d: %w", slot, err)
}
_, intent := restorable[subPod.Name]
if err := r.markRestoreIfHibernated(gctx, subPod, intent); err != nil {
return fmt.Errorf("mark restore sub-agent slot %d: %w", slot, err)
}
if err := r.Create(gctx, subPod); err != nil {
if apierrors.IsAlreadyExists(err) {
return nil
Expand Down
43 changes: 30 additions & 13 deletions cocoonset/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,19 +116,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
return ctrl.Result{Requeue: true}, nil
}
if classified.main == nil {
mainPod, err := buildAgentPod(&cs, 0, "", "", r.Scheme)
if err != nil {
return ctrl.Result{}, fmt.Errorf("build main agent: %w", err)
}
if err := r.Create(ctx, mainPod); err != nil {
if apierrors.IsAlreadyExists(err) {
// Old pod still Terminating; requeue and wait.
return ctrl.Result{RequeueAfter: requeueWaitForMain}, nil
}
return ctrl.Result{}, fmt.Errorf("create main agent: %w", err)
}
logger.Infof(ctx, "created main agent %s/%s", mainPod.Namespace, mainPod.Name)
return ctrl.Result{Requeue: true}, nil
return r.createMainAgent(ctx, &cs)
}

// Sub-agents fork from main and need it live before creation.
Expand Down Expand Up @@ -158,6 +146,35 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
return ctrl.Result{RequeueAfter: subRequeue}, nil
}

// createMainAgent builds and creates the missing main agent pod, stamping
// restore-from-hibernate when the agent is hibernated so a cross-node recreate
// restores from the :hibernate snapshot instead of booting fresh. It always
// requeues so sub-agents fork off the now-created main.
func (r *Reconciler) createMainAgent(ctx context.Context, cs *cocoonv1.CocoonSet) (ctrl.Result, error) {
logger := log.WithFunc("cocoonset.Reconciler.createMainAgent")
mainPod, err := buildAgentPod(cs, 0, "", "", r.Scheme)
if err != nil {
return ctrl.Result{}, fmt.Errorf("build main agent: %w", err)
}
restorable, err := r.podsRestorableByCR(ctx, cs.Namespace)
if err != nil {
return ctrl.Result{}, err
}
_, intent := restorable[mainPod.Name]
if err := r.markRestoreIfHibernated(ctx, mainPod, intent); err != nil {
return ctrl.Result{}, err
}
if err := r.Create(ctx, mainPod); err != nil {
if apierrors.IsAlreadyExists(err) {
// Old pod still Terminating; requeue and wait.
return ctrl.Result{RequeueAfter: requeueWaitForMain}, nil
}
return ctrl.Result{}, fmt.Errorf("create main agent: %w", err)
}
logger.Infof(ctx, "created main agent %s/%s", mainPod.Namespace, mainPod.Name)
return ctrl.Result{Requeue: true}, nil
}

// observeMainPodFailed records the failure on the event channel and, when the
// signal came from a vk-cocoon lifecycle annotation, bumps the dedicated
// counter so the Pod-Phase-only path doesn't dilute the metric's meaning.
Expand Down
79 changes: 79 additions & 0 deletions cocoonset/restore.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package cocoonset

import (
"context"
"fmt"

"github.com/projecteru2/core/log"
corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

cocoonv1 "github.com/cocoonstack/cocoon-common/apis/v1"
"github.com/cocoonstack/cocoon-common/meta"
)

// hibernationPodNames lists the namespace's CocoonHibernations and returns the
// set of PodRef names whose CR satisfies accept.
func (r *Reconciler) hibernationPodNames(ctx context.Context, namespace string, accept func(*cocoonv1.CocoonHibernation) bool) (map[string]struct{}, error) {
var hibList cocoonv1.CocoonHibernationList
if err := r.List(ctx, &hibList, client.InNamespace(namespace)); err != nil {
return nil, fmt.Errorf("list cocoonhibernations in %s: %w", namespace, err)
}
out := make(map[string]struct{}, len(hibList.Items))
for i := range hibList.Items {
hib := &hibList.Items[i]
if hib.Spec.PodRef.Name != "" && accept(hib) {
out[hib.Spec.PodRef.Name] = struct{}{}
}
}
return out, nil
}

// podsRestorableByCR returns pod names whose CocoonHibernation is in a phase where
// a (re)created pod must restore its VM from the :hibernate snapshot rather than
// boot fresh: Hibernated (fully hibernated) or Waking (mid-wake; the tag is still
// present). Phase, not Desire, gates this: Phase reaches Hibernated only after the
// snapshot is confirmed pushed and clears on wake, so a leaked snapshot left on an
// Active agent is correctly excluded.
func (r *Reconciler) podsRestorableByCR(ctx context.Context, namespace string) (map[string]struct{}, error) {
return r.hibernationPodNames(ctx, namespace, func(h *cocoonv1.CocoonHibernation) bool {
return h.Status.Phase == cocoonv1.CocoonHibernationPhaseHibernated ||
h.Status.Phase == cocoonv1.CocoonHibernationPhaseWaking
})
}

// hasHibernateSnapshot reports whether vmName has a :hibernate snapshot in the
// registry — the same lookup vk-cocoon performs at wake.
func (r *Reconciler) hasHibernateSnapshot(ctx context.Context, vmName string) (bool, error) {
present, err := r.Registry.HasManifest(ctx, vmName, meta.HibernateSnapshotTag)
if err != nil {
return false, fmt.Errorf("probe hibernate snapshot %s: %w", vmName, err)
}
return present, nil
}

// markRestoreIfHibernated flags a freshly-built pod to restore its VM from the
// :hibernate snapshot when the agent is hibernated (intent) and the snapshot
// actually exists in the registry. The probe is the same lookup vk runs at wake,
// so intent and a present snapshot together guarantee the restore can proceed; it
// also fails closed so a create never silently falls back to a fresh boot (which a
// subsequent re-hibernate would then persist over the real snapshot).
func (r *Reconciler) markRestoreIfHibernated(ctx context.Context, pod *corev1.Pod, intent bool) error {
logger := log.WithFunc("cocoonset.Reconciler.markRestoreIfHibernated")
if !intent || r.Registry == nil {
return nil
}
vmName := meta.ParseVMSpec(pod).VMName
if vmName == "" {
return nil
}
present, err := r.hasHibernateSnapshot(ctx, vmName)
if err != nil {
return err
}
if present {
meta.MarkRestoreFromHibernate(pod)
logger.Infof(ctx, "pod %s/%s will restore VM %s from :hibernate", pod.Namespace, pod.Name, vmName)
}
return nil
}
139 changes: 139 additions & 0 deletions cocoonset/restore_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package cocoonset

import (
"errors"
"testing"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake"

cocoonv1 "github.com/cocoonstack/cocoon-common/apis/v1"
"github.com/cocoonstack/cocoon-common/meta"
)

func TestMarkRestoreIfHibernated(t *testing.T) {
cases := []struct {
name string
intent bool
present map[string]bool
probeErr error
wantSet bool
wantErr bool
}{
{"no intent skips the probe", false, map[string]bool{"vm:hibernate": true}, nil, false, false},
{"intent and snapshot present", true, map[string]bool{"vm:hibernate": true}, nil, true, false},
{"intent but no snapshot", true, map[string]bool{}, nil, false, false},
{"probe error fails closed", true, nil, errors.New("boom"), false, true},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
r := &Reconciler{Registry: &fakeRegistry{present: c.present, probeErr: c.probeErr}}
pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{
Name: "p", Namespace: "ns",
Annotations: map[string]string{meta.AnnotationVMName: "vm"},
}}
err := r.markRestoreIfHibernated(t.Context(), pod, c.intent)
if (err != nil) != c.wantErr {
t.Fatalf("markRestoreIfHibernated err = %v, wantErr %v", err, c.wantErr)
}
if got := meta.ReadRestoreFromHibernate(pod); got != c.wantSet {
t.Errorf("restore-from-hibernate set = %v, want %v", got, c.wantSet)
}
})
}
}

func TestMarkRestoreIfHibernatedNoRegistry(t *testing.T) {
pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{meta.AnnotationVMName: "vm"},
}}
r := &Reconciler{} // Registry nil (OCI_REGISTRY unset deployment)
if err := r.markRestoreIfHibernated(t.Context(), pod, true); err != nil {
t.Fatalf("nil registry should be a no-op, got %v", err)
}
if meta.ReadRestoreFromHibernate(pod) {
t.Error("no registry must not flag restore")
}
}

func TestPodsRestorableByCR(t *testing.T) {
hib := func(pod string, phase cocoonv1.CocoonHibernationPhase) *cocoonv1.CocoonHibernation {
return &cocoonv1.CocoonHibernation{
ObjectMeta: metav1.ObjectMeta{Name: "h-" + pod, Namespace: "ns"},
Spec: cocoonv1.CocoonHibernationSpec{
PodRef: cocoonv1.HibernationPodRef{Name: pod},
Desire: cocoonv1.HibernationDesireHibernate,
},
Status: cocoonv1.CocoonHibernationStatus{Phase: phase},
}
}
scheme := testScheme(t)
cli := ctrlfake.NewClientBuilder().WithScheme(scheme).WithObjects(
hib("hibernated", cocoonv1.CocoonHibernationPhaseHibernated),
hib("waking", cocoonv1.CocoonHibernationPhaseWaking),
hib("active", cocoonv1.CocoonHibernationPhaseActive),
hib("hibernating", cocoonv1.CocoonHibernationPhaseHibernating),
).Build()
r := &Reconciler{Client: cli, Scheme: scheme}
got, err := r.podsRestorableByCR(t.Context(), "ns")
if err != nil {
t.Fatalf("podsRestorableByCR: %v", err)
}
for _, want := range []string{"hibernated", "waking"} {
if _, ok := got[want]; !ok {
t.Errorf("pod %q should be restorable", want)
}
}
for _, no := range []string{"active", "hibernating"} {
if _, ok := got[no]; ok {
t.Errorf("pod %q must not be restorable (phase excludes it)", no)
}
}
}

// TestEnsureToolboxesRestoresHibernated guards the toolbox recreate path: a
// managed toolbox hibernated via CR must be stamped restore-from-hibernate when
// ensureToolboxes rebuilds it, so it restores rather than cold-boots (a fresh
// boot would let a later hibernate overwrite the real snapshot).
func TestEnsureToolboxesRestoresHibernated(t *testing.T) {
scheme := testScheme(t)
cs := &cocoonv1.CocoonSet{
ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "ns"},
Spec: cocoonv1.CocoonSetSpec{
Toolboxes: []cocoonv1.ToolboxSpec{{Name: "tb", Image: "img", Mode: cocoonv1.ToolboxModeRun}},
},
}
tbPodName := toolboxPodName(cs.Name, "tb")
tbVMName := meta.VMNameForPod(cs.Namespace, tbPodName)
hib := &cocoonv1.CocoonHibernation{
ObjectMeta: metav1.ObjectMeta{Name: "h-tb", Namespace: "ns"},
Spec: cocoonv1.CocoonHibernationSpec{
PodRef: cocoonv1.HibernationPodRef{Name: tbPodName},
Desire: cocoonv1.HibernationDesireHibernate,
},
Status: cocoonv1.CocoonHibernationStatus{Phase: cocoonv1.CocoonHibernationPhaseHibernated},
}
cli := ctrlfake.NewClientBuilder().WithScheme(scheme).WithObjects(cs, hib).Build()
r := &Reconciler{
Client: cli,
Scheme: scheme,
Registry: &fakeRegistry{present: map[string]bool{tbVMName + ":hibernate": true}},
}

changed, err := r.ensureToolboxes(t.Context(), cs, classifyPods(nil))
if err != nil {
t.Fatalf("ensureToolboxes: %v", err)
}
if !changed {
t.Fatal("expected the missing toolbox to be created")
}
var created corev1.Pod
if err := cli.Get(t.Context(), client.ObjectKey{Namespace: "ns", Name: tbPodName}, &created); err != nil {
t.Fatalf("get created toolbox: %v", err)
}
if !meta.ReadRestoreFromHibernate(&created) {
t.Error("recreated hibernated toolbox must be flagged restore-from-hibernate")
}
}
28 changes: 9 additions & 19 deletions cocoonset/suspend.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"github.com/projecteru2/core/log"
corev1 "k8s.io/api/core/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

cocoonv1 "github.com/cocoonstack/cocoon-common/apis/v1"
commonk8s "github.com/cocoonstack/cocoon-common/k8s"
Expand All @@ -28,6 +27,10 @@ func (r *Reconciler) reconcileSuspend(ctx context.Context, cs *cocoonv1.CocoonSe
if err != nil {
return ctrl.Result{}, fmt.Errorf("build main agent before suspend: %w", err)
}
// reconcileSuspend only runs under Spec.Suspend, so restore intent is unconditional.
if err := r.markRestoreIfHibernated(ctx, mainPod, true); err != nil {
return ctrl.Result{}, err
}
if err := r.Create(ctx, mainPod); err != nil {
return ctrl.Result{}, fmt.Errorf("create main agent before suspend: %w", err)
}
Expand Down Expand Up @@ -73,9 +76,9 @@ func (r *Reconciler) allOwnedPodsHibernated(ctx context.Context, classified clas
if spec.VMName == "" {
return false, nil
}
present, err := r.Registry.HasManifest(ctx, spec.VMName, meta.HibernateSnapshotTag)
present, err := r.hasHibernateSnapshot(ctx, spec.VMName)
if err != nil {
return false, fmt.Errorf("probe hibernate snapshot %s: %w", spec.VMName, err)
return false, err
}
if !present {
return false, nil
Expand Down Expand Up @@ -130,20 +133,7 @@ func (r *Reconciler) applyUnsuspend(ctx context.Context, namespace string, class

// podsHibernatedByCR returns pod names targeted by a desire=Hibernate CR.
func (r *Reconciler) podsHibernatedByCR(ctx context.Context, namespace string) (map[string]struct{}, error) {
var hibList cocoonv1.CocoonHibernationList
if err := r.List(ctx, &hibList, client.InNamespace(namespace)); err != nil {
return nil, fmt.Errorf("list cocoonhibernations in %s: %w", namespace, err)
}
out := make(map[string]struct{}, len(hibList.Items))
for i := range hibList.Items {
hib := &hibList.Items[i]
if hib.Spec.Desire != cocoonv1.HibernationDesireHibernate {
continue
}
if hib.Spec.PodRef.Name == "" {
continue
}
out[hib.Spec.PodRef.Name] = struct{}{}
}
return out, nil
return r.hibernationPodNames(ctx, namespace, func(h *cocoonv1.CocoonHibernation) bool {
return h.Spec.Desire == cocoonv1.HibernationDesireHibernate
})
}
8 changes: 8 additions & 0 deletions cocoonset/toolboxes.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ func (r *Reconciler) ensureToolboxes(ctx context.Context, cs *cocoonv1.CocoonSet
}
desired[tb.Name] = true
}
restorable, err := r.podsRestorableByCR(ctx, cs.Namespace)
if err != nil {
return false, err
}
changed := false
for _, tb := range cs.Spec.Toolboxes {
podName := toolboxPodName(cs.Name, tb.Name)
Expand All @@ -47,6 +51,10 @@ func (r *Reconciler) ensureToolboxes(ctx context.Context, cs *cocoonv1.CocoonSet
if err != nil {
return changed, fmt.Errorf("build toolbox %s: %w", tb.Name, err)
}
_, intent := restorable[tbPod.Name]
if err := r.markRestoreIfHibernated(ctx, tbPod, intent); err != nil {
return changed, fmt.Errorf("mark restore toolbox %s: %w", tb.Name, err)
}
if err := r.Create(ctx, tbPod); err != nil {
if !apierrors.IsAlreadyExists(err) {
return changed, fmt.Errorf("create toolbox %s: %w", tb.Name, err)
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/cocoonstack/cocoon-operator
go 1.25.6

require (
github.com/cocoonstack/cocoon-common v0.2.3-0.20260701090838-225f6832c6b9
github.com/cocoonstack/cocoon-common v0.2.3-0.20260701180250-e248edd1d822
github.com/go-logr/logr v1.4.3
github.com/google/go-containerregistry v0.21.7
github.com/projecteru2/core v0.0.0-20241016125006-ff909eefe04c
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZe
github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs=
github.com/cockroachdb/redact v1.1.3 h1:AKZds10rFSIj7qADf0g46UixK8NNLwWTNdCIGS5wfSQ=
github.com/cockroachdb/redact v1.1.3/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg=
github.com/cocoonstack/cocoon-common v0.2.3-0.20260701090838-225f6832c6b9 h1:szvuOHCmhav9nOclSk+393k4QZZxqvZQvPQvBbJ0qBY=
github.com/cocoonstack/cocoon-common v0.2.3-0.20260701090838-225f6832c6b9/go.mod h1:/Cf3aBBN0blBxJWexuGuMbTkas+scvQiF2I75aQXkH4=
github.com/cocoonstack/cocoon-common v0.2.3-0.20260701180250-e248edd1d822 h1:m2gUPkKlOxJwNrJ6WL1u5sRorgufhq73UoSgUwfb3Lw=
github.com/cocoonstack/cocoon-common v0.2.3-0.20260701180250-e248edd1d822/go.mod h1:/Cf3aBBN0blBxJWexuGuMbTkas+scvQiF2I75aQXkH4=
github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM=
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
Expand Down