diff --git a/cocoonset/agents.go b/cocoonset/agents.go index 31e6ec4..75c0b2d 100644 --- a/cocoonset/agents.go +++ b/cocoonset/agents.go @@ -37,6 +37,11 @@ func (r *Reconciler) ensureSubAgents(ctx context.Context, cs *cocoonv1.CocoonSet changed := false var requeueAfter time.Duration + restorable, err := r.podsRestorableByCR(ctx, cs.Namespace) + if err != nil { + return changed, requeueAfter, err + } + g, gctx := errgroup.WithContext(ctx) g.SetLimit(subAgentCreateConcurrency) var created atomic.Bool @@ -59,6 +64,10 @@ func (r *Reconciler) ensureSubAgents(ctx context.Context, cs *cocoonv1.CocoonSet if err != nil { return fmt.Errorf("build sub-agent slot %d: %w", slot, err) } + _, intent := restorable[subPod.Name] + if err := r.markRestoreIfHibernated(gctx, subPod, intent); err != nil { + return fmt.Errorf("mark restore sub-agent slot %d: %w", slot, err) + } if err := r.Create(gctx, subPod); err != nil { if apierrors.IsAlreadyExists(err) { return nil diff --git a/cocoonset/reconciler.go b/cocoonset/reconciler.go index 3c2baef..ca403e3 100644 --- a/cocoonset/reconciler.go +++ b/cocoonset/reconciler.go @@ -116,19 +116,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return ctrl.Result{Requeue: true}, nil } if classified.main == nil { - mainPod, err := buildAgentPod(&cs, 0, "", "", r.Scheme) - if err != nil { - return ctrl.Result{}, fmt.Errorf("build main agent: %w", err) - } - if err := r.Create(ctx, mainPod); err != nil { - if apierrors.IsAlreadyExists(err) { - // Old pod still Terminating; requeue and wait. - return ctrl.Result{RequeueAfter: requeueWaitForMain}, nil - } - return ctrl.Result{}, fmt.Errorf("create main agent: %w", err) - } - logger.Infof(ctx, "created main agent %s/%s", mainPod.Namespace, mainPod.Name) - return ctrl.Result{Requeue: true}, nil + return r.createMainAgent(ctx, &cs) } // Sub-agents fork from main and need it live before creation. @@ -158,6 +146,35 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return ctrl.Result{RequeueAfter: subRequeue}, nil } +// createMainAgent builds and creates the missing main agent pod, stamping +// restore-from-hibernate when the agent is hibernated so a cross-node recreate +// restores from the :hibernate snapshot instead of booting fresh. It always +// requeues so sub-agents fork off the now-created main. +func (r *Reconciler) createMainAgent(ctx context.Context, cs *cocoonv1.CocoonSet) (ctrl.Result, error) { + logger := log.WithFunc("cocoonset.Reconciler.createMainAgent") + mainPod, err := buildAgentPod(cs, 0, "", "", r.Scheme) + if err != nil { + return ctrl.Result{}, fmt.Errorf("build main agent: %w", err) + } + restorable, err := r.podsRestorableByCR(ctx, cs.Namespace) + if err != nil { + return ctrl.Result{}, err + } + _, intent := restorable[mainPod.Name] + if err := r.markRestoreIfHibernated(ctx, mainPod, intent); err != nil { + return ctrl.Result{}, err + } + if err := r.Create(ctx, mainPod); err != nil { + if apierrors.IsAlreadyExists(err) { + // Old pod still Terminating; requeue and wait. + return ctrl.Result{RequeueAfter: requeueWaitForMain}, nil + } + return ctrl.Result{}, fmt.Errorf("create main agent: %w", err) + } + logger.Infof(ctx, "created main agent %s/%s", mainPod.Namespace, mainPod.Name) + return ctrl.Result{Requeue: true}, nil +} + // observeMainPodFailed records the failure on the event channel and, when the // signal came from a vk-cocoon lifecycle annotation, bumps the dedicated // counter so the Pod-Phase-only path doesn't dilute the metric's meaning. diff --git a/cocoonset/restore.go b/cocoonset/restore.go new file mode 100644 index 0000000..7e870f6 --- /dev/null +++ b/cocoonset/restore.go @@ -0,0 +1,79 @@ +package cocoonset + +import ( + "context" + "fmt" + + "github.com/projecteru2/core/log" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + cocoonv1 "github.com/cocoonstack/cocoon-common/apis/v1" + "github.com/cocoonstack/cocoon-common/meta" +) + +// hibernationPodNames lists the namespace's CocoonHibernations and returns the +// set of PodRef names whose CR satisfies accept. +func (r *Reconciler) hibernationPodNames(ctx context.Context, namespace string, accept func(*cocoonv1.CocoonHibernation) bool) (map[string]struct{}, error) { + var hibList cocoonv1.CocoonHibernationList + if err := r.List(ctx, &hibList, client.InNamespace(namespace)); err != nil { + return nil, fmt.Errorf("list cocoonhibernations in %s: %w", namespace, err) + } + out := make(map[string]struct{}, len(hibList.Items)) + for i := range hibList.Items { + hib := &hibList.Items[i] + if hib.Spec.PodRef.Name != "" && accept(hib) { + out[hib.Spec.PodRef.Name] = struct{}{} + } + } + return out, nil +} + +// podsRestorableByCR returns pod names whose CocoonHibernation is in a phase where +// a (re)created pod must restore its VM from the :hibernate snapshot rather than +// boot fresh: Hibernated (fully hibernated) or Waking (mid-wake; the tag is still +// present). Phase, not Desire, gates this: Phase reaches Hibernated only after the +// snapshot is confirmed pushed and clears on wake, so a leaked snapshot left on an +// Active agent is correctly excluded. +func (r *Reconciler) podsRestorableByCR(ctx context.Context, namespace string) (map[string]struct{}, error) { + return r.hibernationPodNames(ctx, namespace, func(h *cocoonv1.CocoonHibernation) bool { + return h.Status.Phase == cocoonv1.CocoonHibernationPhaseHibernated || + h.Status.Phase == cocoonv1.CocoonHibernationPhaseWaking + }) +} + +// hasHibernateSnapshot reports whether vmName has a :hibernate snapshot in the +// registry — the same lookup vk-cocoon performs at wake. +func (r *Reconciler) hasHibernateSnapshot(ctx context.Context, vmName string) (bool, error) { + present, err := r.Registry.HasManifest(ctx, vmName, meta.HibernateSnapshotTag) + if err != nil { + return false, fmt.Errorf("probe hibernate snapshot %s: %w", vmName, err) + } + return present, nil +} + +// markRestoreIfHibernated flags a freshly-built pod to restore its VM from the +// :hibernate snapshot when the agent is hibernated (intent) and the snapshot +// actually exists in the registry. The probe is the same lookup vk runs at wake, +// so intent and a present snapshot together guarantee the restore can proceed; it +// also fails closed so a create never silently falls back to a fresh boot (which a +// subsequent re-hibernate would then persist over the real snapshot). +func (r *Reconciler) markRestoreIfHibernated(ctx context.Context, pod *corev1.Pod, intent bool) error { + logger := log.WithFunc("cocoonset.Reconciler.markRestoreIfHibernated") + if !intent || r.Registry == nil { + return nil + } + vmName := meta.ParseVMSpec(pod).VMName + if vmName == "" { + return nil + } + present, err := r.hasHibernateSnapshot(ctx, vmName) + if err != nil { + return err + } + if present { + meta.MarkRestoreFromHibernate(pod) + logger.Infof(ctx, "pod %s/%s will restore VM %s from :hibernate", pod.Namespace, pod.Name, vmName) + } + return nil +} diff --git a/cocoonset/restore_test.go b/cocoonset/restore_test.go new file mode 100644 index 0000000..17816a7 --- /dev/null +++ b/cocoonset/restore_test.go @@ -0,0 +1,139 @@ +package cocoonset + +import ( + "errors" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake" + + cocoonv1 "github.com/cocoonstack/cocoon-common/apis/v1" + "github.com/cocoonstack/cocoon-common/meta" +) + +func TestMarkRestoreIfHibernated(t *testing.T) { + cases := []struct { + name string + intent bool + present map[string]bool + probeErr error + wantSet bool + wantErr bool + }{ + {"no intent skips the probe", false, map[string]bool{"vm:hibernate": true}, nil, false, false}, + {"intent and snapshot present", true, map[string]bool{"vm:hibernate": true}, nil, true, false}, + {"intent but no snapshot", true, map[string]bool{}, nil, false, false}, + {"probe error fails closed", true, nil, errors.New("boom"), false, true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + r := &Reconciler{Registry: &fakeRegistry{present: c.present, probeErr: c.probeErr}} + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{ + Name: "p", Namespace: "ns", + Annotations: map[string]string{meta.AnnotationVMName: "vm"}, + }} + err := r.markRestoreIfHibernated(t.Context(), pod, c.intent) + if (err != nil) != c.wantErr { + t.Fatalf("markRestoreIfHibernated err = %v, wantErr %v", err, c.wantErr) + } + if got := meta.ReadRestoreFromHibernate(pod); got != c.wantSet { + t.Errorf("restore-from-hibernate set = %v, want %v", got, c.wantSet) + } + }) + } +} + +func TestMarkRestoreIfHibernatedNoRegistry(t *testing.T) { + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{meta.AnnotationVMName: "vm"}, + }} + r := &Reconciler{} // Registry nil (OCI_REGISTRY unset deployment) + if err := r.markRestoreIfHibernated(t.Context(), pod, true); err != nil { + t.Fatalf("nil registry should be a no-op, got %v", err) + } + if meta.ReadRestoreFromHibernate(pod) { + t.Error("no registry must not flag restore") + } +} + +func TestPodsRestorableByCR(t *testing.T) { + hib := func(pod string, phase cocoonv1.CocoonHibernationPhase) *cocoonv1.CocoonHibernation { + return &cocoonv1.CocoonHibernation{ + ObjectMeta: metav1.ObjectMeta{Name: "h-" + pod, Namespace: "ns"}, + Spec: cocoonv1.CocoonHibernationSpec{ + PodRef: cocoonv1.HibernationPodRef{Name: pod}, + Desire: cocoonv1.HibernationDesireHibernate, + }, + Status: cocoonv1.CocoonHibernationStatus{Phase: phase}, + } + } + scheme := testScheme(t) + cli := ctrlfake.NewClientBuilder().WithScheme(scheme).WithObjects( + hib("hibernated", cocoonv1.CocoonHibernationPhaseHibernated), + hib("waking", cocoonv1.CocoonHibernationPhaseWaking), + hib("active", cocoonv1.CocoonHibernationPhaseActive), + hib("hibernating", cocoonv1.CocoonHibernationPhaseHibernating), + ).Build() + r := &Reconciler{Client: cli, Scheme: scheme} + got, err := r.podsRestorableByCR(t.Context(), "ns") + if err != nil { + t.Fatalf("podsRestorableByCR: %v", err) + } + for _, want := range []string{"hibernated", "waking"} { + if _, ok := got[want]; !ok { + t.Errorf("pod %q should be restorable", want) + } + } + for _, no := range []string{"active", "hibernating"} { + if _, ok := got[no]; ok { + t.Errorf("pod %q must not be restorable (phase excludes it)", no) + } + } +} + +// TestEnsureToolboxesRestoresHibernated guards the toolbox recreate path: a +// managed toolbox hibernated via CR must be stamped restore-from-hibernate when +// ensureToolboxes rebuilds it, so it restores rather than cold-boots (a fresh +// boot would let a later hibernate overwrite the real snapshot). +func TestEnsureToolboxesRestoresHibernated(t *testing.T) { + scheme := testScheme(t) + cs := &cocoonv1.CocoonSet{ + ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "ns"}, + Spec: cocoonv1.CocoonSetSpec{ + Toolboxes: []cocoonv1.ToolboxSpec{{Name: "tb", Image: "img", Mode: cocoonv1.ToolboxModeRun}}, + }, + } + tbPodName := toolboxPodName(cs.Name, "tb") + tbVMName := meta.VMNameForPod(cs.Namespace, tbPodName) + hib := &cocoonv1.CocoonHibernation{ + ObjectMeta: metav1.ObjectMeta{Name: "h-tb", Namespace: "ns"}, + Spec: cocoonv1.CocoonHibernationSpec{ + PodRef: cocoonv1.HibernationPodRef{Name: tbPodName}, + Desire: cocoonv1.HibernationDesireHibernate, + }, + Status: cocoonv1.CocoonHibernationStatus{Phase: cocoonv1.CocoonHibernationPhaseHibernated}, + } + cli := ctrlfake.NewClientBuilder().WithScheme(scheme).WithObjects(cs, hib).Build() + r := &Reconciler{ + Client: cli, + Scheme: scheme, + Registry: &fakeRegistry{present: map[string]bool{tbVMName + ":hibernate": true}}, + } + + changed, err := r.ensureToolboxes(t.Context(), cs, classifyPods(nil)) + if err != nil { + t.Fatalf("ensureToolboxes: %v", err) + } + if !changed { + t.Fatal("expected the missing toolbox to be created") + } + var created corev1.Pod + if err := cli.Get(t.Context(), client.ObjectKey{Namespace: "ns", Name: tbPodName}, &created); err != nil { + t.Fatalf("get created toolbox: %v", err) + } + if !meta.ReadRestoreFromHibernate(&created) { + t.Error("recreated hibernated toolbox must be flagged restore-from-hibernate") + } +} diff --git a/cocoonset/suspend.go b/cocoonset/suspend.go index a0712ec..04993ab 100644 --- a/cocoonset/suspend.go +++ b/cocoonset/suspend.go @@ -10,7 +10,6 @@ import ( "github.com/projecteru2/core/log" corev1 "k8s.io/api/core/v1" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" cocoonv1 "github.com/cocoonstack/cocoon-common/apis/v1" commonk8s "github.com/cocoonstack/cocoon-common/k8s" @@ -28,6 +27,10 @@ func (r *Reconciler) reconcileSuspend(ctx context.Context, cs *cocoonv1.CocoonSe if err != nil { return ctrl.Result{}, fmt.Errorf("build main agent before suspend: %w", err) } + // reconcileSuspend only runs under Spec.Suspend, so restore intent is unconditional. + if err := r.markRestoreIfHibernated(ctx, mainPod, true); err != nil { + return ctrl.Result{}, err + } if err := r.Create(ctx, mainPod); err != nil { return ctrl.Result{}, fmt.Errorf("create main agent before suspend: %w", err) } @@ -73,9 +76,9 @@ func (r *Reconciler) allOwnedPodsHibernated(ctx context.Context, classified clas if spec.VMName == "" { return false, nil } - present, err := r.Registry.HasManifest(ctx, spec.VMName, meta.HibernateSnapshotTag) + present, err := r.hasHibernateSnapshot(ctx, spec.VMName) if err != nil { - return false, fmt.Errorf("probe hibernate snapshot %s: %w", spec.VMName, err) + return false, err } if !present { return false, nil @@ -130,20 +133,7 @@ func (r *Reconciler) applyUnsuspend(ctx context.Context, namespace string, class // podsHibernatedByCR returns pod names targeted by a desire=Hibernate CR. func (r *Reconciler) podsHibernatedByCR(ctx context.Context, namespace string) (map[string]struct{}, error) { - var hibList cocoonv1.CocoonHibernationList - if err := r.List(ctx, &hibList, client.InNamespace(namespace)); err != nil { - return nil, fmt.Errorf("list cocoonhibernations in %s: %w", namespace, err) - } - out := make(map[string]struct{}, len(hibList.Items)) - for i := range hibList.Items { - hib := &hibList.Items[i] - if hib.Spec.Desire != cocoonv1.HibernationDesireHibernate { - continue - } - if hib.Spec.PodRef.Name == "" { - continue - } - out[hib.Spec.PodRef.Name] = struct{}{} - } - return out, nil + return r.hibernationPodNames(ctx, namespace, func(h *cocoonv1.CocoonHibernation) bool { + return h.Spec.Desire == cocoonv1.HibernationDesireHibernate + }) } diff --git a/cocoonset/toolboxes.go b/cocoonset/toolboxes.go index ace2c73..52152f1 100644 --- a/cocoonset/toolboxes.go +++ b/cocoonset/toolboxes.go @@ -27,6 +27,10 @@ func (r *Reconciler) ensureToolboxes(ctx context.Context, cs *cocoonv1.CocoonSet } desired[tb.Name] = true } + restorable, err := r.podsRestorableByCR(ctx, cs.Namespace) + if err != nil { + return false, err + } changed := false for _, tb := range cs.Spec.Toolboxes { podName := toolboxPodName(cs.Name, tb.Name) @@ -47,6 +51,10 @@ func (r *Reconciler) ensureToolboxes(ctx context.Context, cs *cocoonv1.CocoonSet if err != nil { return changed, fmt.Errorf("build toolbox %s: %w", tb.Name, err) } + _, intent := restorable[tbPod.Name] + if err := r.markRestoreIfHibernated(ctx, tbPod, intent); err != nil { + return changed, fmt.Errorf("mark restore toolbox %s: %w", tb.Name, err) + } if err := r.Create(ctx, tbPod); err != nil { if !apierrors.IsAlreadyExists(err) { return changed, fmt.Errorf("create toolbox %s: %w", tb.Name, err) diff --git a/go.mod b/go.mod index 403ecd8..ea120ee 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/cocoonstack/cocoon-operator go 1.25.6 require ( - github.com/cocoonstack/cocoon-common v0.2.3-0.20260701090838-225f6832c6b9 + github.com/cocoonstack/cocoon-common v0.2.3-0.20260701180250-e248edd1d822 github.com/go-logr/logr v1.4.3 github.com/google/go-containerregistry v0.21.7 github.com/projecteru2/core v0.0.0-20241016125006-ff909eefe04c diff --git a/go.sum b/go.sum index 7a4f29c..55317b2 100644 --- a/go.sum +++ b/go.sum @@ -29,8 +29,8 @@ github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZe github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs= github.com/cockroachdb/redact v1.1.3 h1:AKZds10rFSIj7qADf0g46UixK8NNLwWTNdCIGS5wfSQ= github.com/cockroachdb/redact v1.1.3/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= -github.com/cocoonstack/cocoon-common v0.2.3-0.20260701090838-225f6832c6b9 h1:szvuOHCmhav9nOclSk+393k4QZZxqvZQvPQvBbJ0qBY= -github.com/cocoonstack/cocoon-common v0.2.3-0.20260701090838-225f6832c6b9/go.mod h1:/Cf3aBBN0blBxJWexuGuMbTkas+scvQiF2I75aQXkH4= +github.com/cocoonstack/cocoon-common v0.2.3-0.20260701180250-e248edd1d822 h1:m2gUPkKlOxJwNrJ6WL1u5sRorgufhq73UoSgUwfb3Lw= +github.com/cocoonstack/cocoon-common v0.2.3-0.20260701180250-e248edd1d822/go.mod h1:/Cf3aBBN0blBxJWexuGuMbTkas+scvQiF2I75aQXkH4= github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=