Skip to content

feat(authentik/genmachine): HA reliability improvements #1794

Open
ixxeL2097 wants to merge 1 commit into
main from
fix/authentik-genmachine-ha-clean
Open

feat(authentik/genmachine): HA reliability improvements #1794
ixxeL2097 wants to merge 1 commit into
main from
fix/authentik-genmachine-ha-clean

Conversation

@ixxeL2097

Copy link
Copy Markdown
Member

Summary

  • Scale server and worker to 2 replicas for redundancy during node eviction/rollouts
  • Add PodDisruptionBudget (minAvailable: 1) on both server and worker — prevents eviction from taking the last pod
  • Set RollingUpdate strategy with maxUnavailable: 0 / maxSurge: 1 — zero-downtime deploys
  • Add soft pod anti-affinity — prefer scheduling server/worker pods on different nodes
  • Add topologySpreadConstraints (ScheduleAnyway) — best-effort spread across nodes
  • Set resource requests/limits for server (200m/512Mi → 1000m/1Gi) and worker (100m/256Mi → 500m/512Mi)
  • Remove dead redis: config — chart 2026.x has no redis subchart, so this block had no effect

Notes

  • Embedded outpost runs within server pods: scaling to 2 replicas automatically gives 2 outpost instances
  • PostgreSQL stays on embedded chart (no CNPG migration)
  • Soft affinity: pods are spread when possible but scheduling is not blocked if only one node is available

Test plan

  • ArgoCD syncs cleanly on genmachine
  • kubectl get pods -n authentik shows 2 server + 2 worker pods on different nodes
  • kubectl get pdb -n authentik shows 2 PDBs with ALLOWED DISRUPTIONS = 1
  • Authentik UI accessible at https://authentik.talos-genmachine.fredcorp.com
  • Drain a node: verify service stays up on remaining pod

🤖 Generated with Claude Code

- Scale server and worker to 2 replicas for redundancy
- Add PodDisruptionBudget (minAvailable: 1) for both server and worker
- Set RollingUpdate strategy with maxUnavailable: 0 to prevent downtime during rollouts
- Add soft pod anti-affinity to spread across nodes
- Add topologySpreadConstraints (ScheduleAnyway) for best-effort node distribution
- Set resource requests/limits for server (200m/512Mi → 1/1Gi) and worker (100m/256Mi → 500m/512Mi)
- Remove dead redis subchart config (chart 2026.x has no redis subchart)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
@ixxel-bot

ixxel-bot Bot commented May 14, 2026

Copy link
Copy Markdown
Contributor
--- main/authentik_gitops_manifests_authentik_genmachine_app_manifest_main.yaml	2026-05-14 21:05:05.007442338 +0000
+++ pr/authentik_gitops_manifests_authentik_genmachine_app_manifest_pr.yaml	2026-05-14 21:05:03.926419593 +0000
@@ -41,20 +41,66 @@
     helm.sh/chart: postgresql-16.7.27
     app.kubernetes.io/component: primary
 spec:
   maxUnavailable: 1
   selector:
     matchLabels:
       app.kubernetes.io/instance: authentik
       app.kubernetes.io/name: postgresql
       app.kubernetes.io/component: primary
 ---
+# Source: authentik/charts/authentik/templates/server/pdb.yaml
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: authentik-server
+  namespace: "default"
+  labels:
+    helm.sh/chart: "authentik-2026.2.3"
+    app.kubernetes.io/name: "authentik"
+    app.kubernetes.io/instance: "authentik"
+    app.kubernetes.io/component: "server"
+    app.kubernetes.io/managed-by: "Helm"
+    app.kubernetes.io/part-of: "authentik"
+    app.kubernetes.io/version: "2026.2.3"
+  
+spec:
+  minAvailable: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: "authentik"
+      app.kubernetes.io/instance: "authentik"
+      app.kubernetes.io/component: "server"
+---
+# Source: authentik/charts/authentik/templates/worker/pdb.yaml
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: authentik-worker
+  namespace: "default"
+  labels:
+    helm.sh/chart: "authentik-2026.2.3"
+    app.kubernetes.io/name: "authentik"
+    app.kubernetes.io/instance: "authentik"
+    app.kubernetes.io/component: "worker"
+    app.kubernetes.io/managed-by: "Helm"
+    app.kubernetes.io/part-of: "authentik"
+    app.kubernetes.io/version: "2026.2.3"
+  
+spec:
+  minAvailable: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: "authentik"
+      app.kubernetes.io/instance: "authentik"
+      app.kubernetes.io/component: "worker"
+---
 # Source: authentik/charts/authentik/charts/postgresql/templates/serviceaccount.yaml
 apiVersion: v1
 kind: ServiceAccount
 metadata:
   name: authentik-postgresql
   namespace: "default"
   labels:
     app.kubernetes.io/instance: authentik
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: postgresql
@@ -416,21 +462,26 @@
   namespace: "default"
   labels:
     helm.sh/chart: "authentik-2026.2.3"
     app.kubernetes.io/name: "authentik"
     app.kubernetes.io/instance: "authentik"
     app.kubernetes.io/component: "server"
     app.kubernetes.io/managed-by: "Helm"
     app.kubernetes.io/part-of: "authentik"
     app.kubernetes.io/version: "2026.2.3"
 spec:
-  replicas: 1
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
+  replicas: 2
   revisionHistoryLimit: 3
   selector:
     matchLabels:
       app.kubernetes.io/name: "authentik"
       app.kubernetes.io/instance: "authentik"
       app.kubernetes.io/component: "server"
   template:
     metadata:
       labels:
         helm.sh/chart: "authentik-2026.2.3"
@@ -524,32 +575,44 @@
             
             failureThreshold: 60
             httpGet:
               path: '/-/health/live/'
               port: http
             initialDelaySeconds: 5
             periodSeconds: 10
             successThreshold: 1
             timeoutSeconds: 3
           resources:
-            {}
+            limits:
+              cpu: 1000m
+              memory: 1Gi
+            requests:
+              cpu: 200m
+              memory: 512Mi
       affinity:
         podAntiAffinity:
           preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 100
-              podAffinityTerm:
-                labelSelector:
-                  matchLabels:
-                    app.kubernetes.io/name: "authentik"
-                    app.kubernetes.io/instance: "authentik"
-                    app.kubernetes.io/component: "server"
-                topologyKey: kubernetes.io/hostname
+          - podAffinityTerm:
+              labelSelector:
+                matchLabels:
+                  app.kubernetes.io/component: server
+                  app.kubernetes.io/name: authentik
+              topologyKey: kubernetes.io/hostname
+            weight: 100
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            app.kubernetes.io/component: server
+            app.kubernetes.io/name: authentik
+        maxSkew: 1
+        topologyKey: kubernetes.io/hostname
+        whenUnsatisfiable: ScheduleAnyway
       volumes:
         - name: secrets
           secret:
             secretName: authentik-key
         - name: pgsql-creds
           secret:
             secretName: authentik-pgsql-creds
       enableServiceLinks: true
 ---
 # Source: authentik/charts/authentik/templates/worker/deployment.yaml
@@ -560,21 +623,26 @@
   namespace: "default"
   labels:
     helm.sh/chart: "authentik-2026.2.3"
     app.kubernetes.io/name: "authentik"
     app.kubernetes.io/instance: "authentik"
     app.kubernetes.io/component: "worker"
     app.kubernetes.io/managed-by: "Helm"
     app.kubernetes.io/part-of: "authentik"
     app.kubernetes.io/version: "2026.2.3"
 spec:
-  replicas: 1
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
+  replicas: 2
   revisionHistoryLimit: 3
   selector:
     matchLabels:
       app.kubernetes.io/name: "authentik"
       app.kubernetes.io/instance: "authentik"
       app.kubernetes.io/component: "worker"
   template:
     metadata:
       labels:
         helm.sh/chart: "authentik-2026.2.3"
@@ -669,32 +737,44 @@
             exec:
               command:
               - ak
               - healthcheck
             failureThreshold: 60
             initialDelaySeconds: 30
             periodSeconds: 10
             successThreshold: 1
             timeoutSeconds: 3
           resources:
-            {}
+            limits:
+              cpu: 500m
+              memory: 512Mi
+            requests:
+              cpu: 100m
+              memory: 256Mi
       affinity:
         podAntiAffinity:
           preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 100
-              podAffinityTerm:
-                labelSelector:
-                  matchLabels:
-                    app.kubernetes.io/name: "authentik"
-                    app.kubernetes.io/instance: "authentik"
-                    app.kubernetes.io/component: "worker"
-                topologyKey: kubernetes.io/hostname
+          - podAffinityTerm:
+              labelSelector:
+                matchLabels:
+                  app.kubernetes.io/component: worker
+                  app.kubernetes.io/name: authentik
+              topologyKey: kubernetes.io/hostname
+            weight: 100
+      topologySpreadConstraints:
+      - labelSelector:
+          matchLabels:
+            app.kubernetes.io/component: worker
+            app.kubernetes.io/name: authentik
+        maxSkew: 1
+        topologyKey: kubernetes.io/hostname
+        whenUnsatisfiable: ScheduleAnyway
       volumes:
         - name: secrets
           secret:
             secretName: authentik-key
         - name: pgsql-creds
           secret:
             secretName: authentik-pgsql-creds
         - name: blueprints-cm-authentik-blueprints
           configMap:
             name: authentik-blueprints
 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant