diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 53e57386..736e77c2 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -54,7 +54,7 @@ RUN curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/i | sh -s -- -b /usr/local/bin ${GOLANGCI_LINT_VERSION} # Set working directory -WORKDIR /workspace +WORKDIR /workspaces # Create godev group for shared Go development directory access # This allows both root (during build) and vscode user (during dev) to write to /go @@ -128,8 +128,8 @@ RUN groupadd -f docker && usermod -aG docker vscode # Ensure vscode user can write to workspace (empty, so fast) # Note: /go permissions are already set in CI stage and preserved here -RUN chown -R vscode:vscode /workspace && \ - chmod -R 755 /workspace +RUN chown -R vscode:vscode /workspaces && \ + chmod -R 755 /workspaces # Switch back to vscode user for development USER vscode @@ -138,4 +138,4 @@ USER vscode ENV DEBIAN_FRONTEND=dialog # Default command -CMD ["/bin/bash"] \ No newline at end of file +CMD ["/bin/bash"] diff --git a/.devcontainer/SETUP_CLUSTER_TROUBLESHOOTING.md b/.devcontainer/SETUP_CLUSTER_TROUBLESHOOTING.md new file mode 100644 index 00000000..84977e37 --- /dev/null +++ b/.devcontainer/SETUP_CLUSTER_TROUBLESHOOTING.md @@ -0,0 +1,72 @@ +# Troubleshooting `make setup-cluster` in DevContainer + +## Symptom + +`make setup-cluster` fails and Kind waits for the control-plane API server, with logs like: + +``` +Get "https://172.19.0.2:6443/livez?timeout=10s": dial tcp 172.19.0.2:6443: connect: connection refused +``` + +## Root cause (current setup) + +`test/e2e/kind/start-cluster.sh` generates `test/e2e/kind/cluster.ignore.yaml` from `HOST_PROJECT_PATH`. + +In the current devcontainer config, `HOST_PROJECT_PATH` is set from `${localWorkspaceFolder}`. 
+That produced: + +``` +hostPath: /home/simon/git/gitops-reverser2/test/e2e/kind/audit +``` + +But that mounted directory exists and is empty in the container, while the real audit files are under: + +``` +/workspaces/gitops-reverser2/test/e2e/kind/audit +``` + +Because the mount source is wrong/empty, kube-apiserver cannot read: + +- `/etc/kubernetes/audit/policy.yaml` +- `/etc/kubernetes/audit/webhook-config.yaml` + +Then kube-apiserver fails startup, and Kind reports API server connection refused on `:6443`. + +## Why this happens + +The path strategy differs by Docker mode: + +- Host Docker socket mode: daemon needs host-visible paths. +- Docker-in-Docker mode: daemon needs container-visible paths. + +Your current config mixes modes and path assumptions, so Kind mount path resolution is inconsistent. + +## Fix options + +1. Use Docker-in-Docker only (recommended) +- Remove host socket mount from `.devcontainer/devcontainer.json`. +- Set `HOST_PROJECT_PATH` to container workspace path (for example `/workspaces/${localWorkspaceFolderBasename}`). + +2. Use host Docker socket only +- Remove `docker-in-docker` feature. +- Keep `HOST_PROJECT_PATH` as host path. + +## Quick verification + +Before running `make setup-cluster`, verify generated config points to a path with files: + +```bash +cat test/e2e/kind/cluster.ignore.yaml +ls -la +``` + +Expected: `policy.yaml` and `webhook-config.yaml` are present. 
+ +## Immediate workaround + +Run setup with a container-visible path explicitly: + +```bash +HOST_PROJECT_PATH=/workspaces/$(basename "$PWD") make setup-cluster +``` + diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 15540575..570704a6 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -5,21 +5,20 @@ "context": "..", "target": "dev" }, + "workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/${localWorkspaceFolderBasename},type=bind", + "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}", "features": { + "ghcr.io/devcontainers/features/docker-outside-of-docker:1": {}, "ghcr.io/devcontainers/features/common-utils:2": { "userUid": "automatic", "userGid": "automatic", "username": "vscode" }, - "ghcr.io/devcontainers/features/docker-in-docker:2": { - "moby": false, - "dockerDashComposeVersion": "v2" - }, "ghcr.io/devcontainers/features/git:1": {} }, "runArgs": [ - "--network=host", - "--group-add=docker" + "--group-add=docker", + "--add-host=host.docker.internal:host-gateway" ], "forwardPorts": [ 13000, @@ -58,17 +57,21 @@ "golang.go", "ms-kubernetes-tools.vscode-kubernetes-tools", "ms-azuretools.vscode-docker", - "kilocode.kilo-code" + "openai.chatgpt" ] } }, - "postCreateCommand": "sudo chmod 666 /var/run/docker.sock || true && docker network create -d=bridge --subnet=172.19.0.0/24 kind || true && sudo chown -R vscode:vscode /workspace || true", + "postCreateCommand": "bash .devcontainer/post-create.sh '${containerWorkspaceFolder}'", "remoteUser": "vscode", "mounts": [ - "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" + "source=ghconfig,target=/home/vscode/.config/gh,type=volume", + "source=${localEnv:HOME}${localEnv:USERPROFILE}/.gitconfig,target=/home/vscode/.gitconfig-host,type=bind,readonly,consistency=cached", + "source=gomodcache,target=/go/pkg/mod,type=volume", + "source=gobuildcache,target=/home/vscode/.cache/go-build,type=volume", + 
"source=codexconfig,target=/home/vscode/.codex,type=volume" ], "containerEnv": { "HOST_PROJECT_PATH": "${localWorkspaceFolder}", - "DOCKER_API_VERSION": "1.44" + "PROJECT_PATH": "/workspaces/${localWorkspaceFolderBasename}" } -} \ No newline at end of file +} diff --git a/.devcontainer/post-create.sh b/.devcontainer/post-create.sh new file mode 100644 index 00000000..7f30b16d --- /dev/null +++ b/.devcontainer/post-create.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -euo pipefail + +log() { + echo "[post-create] $*" +} + +# Resolve workspace path in a way that works both inside and outside +# VS Code-specific shell variable injection. +workspace_dir="${1:-${containerWorkspaceFolder:-${WORKSPACE_FOLDER:-$(pwd)}}}" +log "Using workspace directory: ${workspace_dir}" + +# Keep ~/.gitconfig writable inside the container while still importing host settings. +if [ -f /home/vscode/.gitconfig-host ]; then + log "Configuring git to include /home/vscode/.gitconfig-host" + touch /home/vscode/.gitconfig + if git config --global --get-all include.path | grep -Fxq "/home/vscode/.gitconfig-host"; then + log "Host gitconfig include already present" + else + git config --global --add include.path /home/vscode/.gitconfig-host + log "Added host gitconfig include" + fi +fi + +# Ensure Go-related caches exist and are writable by vscode +log "Ensuring Go cache directories exist" +sudo mkdir -p \ + /home/vscode/.cache/go-build \ + /home/vscode/.cache/goimports \ + /home/vscode/.cache/golangci-lint + +# Fix ownership for workspace and cache roots used by tooling +if [ -d "${workspace_dir}" ]; then + log "Fixing ownership for workspace and cache directories" + sudo chown -R vscode:vscode "${workspace_dir}" /home/vscode || true +else + log "Workspace directory not found; fixing ownership for cache only" + sudo chown -R vscode:vscode /home/vscode || true +fi + +log "post-create completed" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d91994e1..d8c1d0f4 100644 --- 
a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,6 +13,8 @@ env: IMAGE_NAME: configbutler/gitops-reverser IMAGE_TAG: ci-${{ github.sha }} CHART_REGISTRY: ghcr.io/configbutler/charts + REPO_NAME: ${{ github.event.repository.name }} + CI_WORKDIR: /workspaces/${{ github.event.repository.name }} permissions: contents: write @@ -125,7 +127,7 @@ jobs: " lint-helm: - name: Lint Helm Chart + name: Lint and build Helm Chart (and generate single-file installer) runs-on: ubuntu-latest needs: build-ci-container container: @@ -155,9 +157,23 @@ jobs: --set image.repository=test/image \ --set image.tag=test - - name: Validate packaged chart + - name: Generate install.yaml from Helm chart run: | - helm package charts/gitops-reverser --destination /tmp + make build-installer + + - name: Package Helm chart + run: | + helm package charts/gitops-reverser --destination . + mv gitops-reverser-*.tgz gitops-reverser.tgz + + - name: Upload release bundle artifact + uses: actions/upload-artifact@v6 + with: + name: release-bundle + path: | + dist/install.yaml + gitops-reverser.tgz + if-no-files-found: error lint: name: Lint Go Code @@ -245,7 +261,7 @@ jobs: - name: Generate Kind cluster config from template env: - HOST_PROJECT_PATH: ${{ github.workspace }} # docker in docker, so we need to pass the host path + HOST_PROJECT_PATH: ${{ github.workspace }} run: | echo "πŸ”§ Generating cluster config with HOST_PROJECT_PATH=${HOST_PROJECT_PATH}" envsubst < test/e2e/kind/cluster-template.yaml > test/e2e/kind/cluster.yaml @@ -256,8 +272,7 @@ jobs: uses: helm/kind-action@v1.13.0 with: cluster_name: ${{ env.KIND_CLUSTER }} - version: v0.30.0 - kubectl_version: v1.32.3 + version: v0.31.0 config: test/e2e/kind/cluster.yaml wait: 5m @@ -271,33 +286,93 @@ jobs: run: | echo "${{ secrets.GITHUB_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ github.actor }} --password-stdin - - name: Pull and load image to Kind - run: | - echo "Pulling image: ${{ env.PROJECT_IMAGE }}" - docker pull ${{ 
env.PROJECT_IMAGE }} - kind load docker-image ${{ env.PROJECT_IMAGE }} --name ${{ env.KIND_CLUSTER }} - - name: Run E2E tests in CI container run: | docker run --rm \ --network host \ - -v ${{ github.workspace }}:/workspace \ - -v $HOME/.kube:/root/.kube \ - -w /workspace \ + -v "${GITHUB_WORKSPACE}:${{ env.CI_WORKDIR }}" \ + -v "$HOME/.kube:/root/.kube" \ + -w "${{ env.CI_WORKDIR }}" \ -e PROJECT_IMAGE=${{ env.PROJECT_IMAGE }} \ -e KIND_CLUSTER=${{ env.KIND_CLUSTER }} \ ${{ env.CI_CONTAINER }} \ bash -c " - git config --global --add safe.directory /workspace + git config --global --add safe.directory ${{ env.CI_WORKDIR }} make test-e2e " + e2e-install-smoke: + name: E2E Install Smoke (${{ matrix.scenario }}) + runs-on: ubuntu-latest + needs: [build-ci-container, docker-build, lint-helm] + strategy: + matrix: + scenario: [helm, manifest] + env: + PROJECT_IMAGE: ${{ needs.docker-build.outputs.image }} + KIND_CLUSTER: gitops-reverser-test-e2e-smoke-${{ matrix.scenario }} + CI_CONTAINER: ${{ needs.build-ci-container.outputs.image }} + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Download tested release bundle artifact + uses: actions/download-artifact@v7 + with: + name: release-bundle + path: . 
+ + - name: Generate Kind cluster config from template + env: + HOST_PROJECT_PATH: ${{ github.workspace }} + run: | + echo "πŸ”§ Generating cluster config with HOST_PROJECT_PATH=${HOST_PROJECT_PATH}" + envsubst < test/e2e/kind/cluster-template.yaml > test/e2e/kind/cluster.yaml + echo "βœ… Generated configuration:" + cat test/e2e/kind/cluster.yaml + + - name: Set up Kind cluster with audit webhook support + uses: helm/kind-action@v1.13.0 + with: + cluster_name: ${{ env.KIND_CLUSTER }} + version: v0.31.0 + config: test/e2e/kind/cluster.yaml + wait: 5m + + - name: Verify cluster setup + run: | + kubectl cluster-info + kubectl get nodes + echo "βœ… Kind cluster is ready" + + - name: Login to Docker registry + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ github.actor }} --password-stdin + + - name: Run install smoke test in CI container + run: | + TARGET="test-e2e-install-${{ matrix.scenario }}" + docker run --rm \ + --network host \ + -v "${GITHUB_WORKSPACE}:${{ env.CI_WORKDIR }}" \ + -v "$HOME/.kube:/root/.kube" \ + -w "${{ env.CI_WORKDIR }}" \ + -e PROJECT_IMAGE=${{ env.PROJECT_IMAGE }} \ + -e KIND_CLUSTER=${{ env.KIND_CLUSTER }} \ + -e HELM_CHART_SOURCE="./gitops-reverser.tgz" \ + ${{ env.CI_CONTAINER }} \ + bash -c " + git config --global --add safe.directory ${{ env.CI_WORKDIR }} + make ${TARGET} + " + # Release job only runs on push to main after tests pass release-please: name: Release Please runs-on: ubuntu-latest if: github.event_name == 'push' && github.ref == 'refs/heads/main' - needs: [lint-helm, lint, test, e2e-test, validate-devcontainer] + needs: [lint-helm, lint, test, e2e-test, e2e-install-smoke, validate-devcontainer] outputs: release_created: ${{ steps.release.outputs.release_created }} tag_name: ${{ steps.release.outputs.tag_name }} @@ -438,7 +513,7 @@ jobs: publish-helm: name: Publish Helm Chart runs-on: ubuntu-latest - needs: [build-ci-container, release-please] + needs: [build-ci-container, 
e2e-install-smoke, release-please] if: needs.release-please.outputs.release_created == 'true' container: image: ${{ needs.build-ci-container.outputs.image }} @@ -452,21 +527,19 @@ jobs: - name: Configure Git safe directory run: git config --global --add safe.directory /__w/gitops-reverser/gitops-reverser - - name: Generate install.yaml from Helm chart (also does helm-sync) - run: | - make build-installer + - name: Download tested release bundle artifact + uses: actions/download-artifact@v7 + with: + name: release-bundle + path: . - name: Login to GitHub Container Registry run: | echo "${{ secrets.GITHUB_TOKEN }}" | helm registry login ${{ env.REGISTRY }} --username ${{ github.actor }} --password-stdin - - name: Package Helm chart - run: | - helm package charts/gitops-reverser --destination .helm-charts - - name: Push Helm chart to GHCR run: | - helm push .helm-charts/gitops-reverser-${{ needs.release-please.outputs.version }}.tgz oci://${{ env.CHART_REGISTRY }} + helm push ./gitops-reverser.tgz oci://${{ env.CHART_REGISTRY }} - name: Upload install.yaml as release asset uses: softprops/action-gh-release@v2 diff --git a/Dockerfile b/Dockerfile index 13a23307..a3f13fb3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM golang:1.25.6 AS builder ARG TARGETOS ARG TARGETARCH -WORKDIR /workspace +WORKDIR /workspaces # Copy the Go Modules manifests COPY go.mod go.sum ./ @@ -25,7 +25,7 @@ RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o manager cmd/ # Refer to https://github.com/GoogleContainerTools/distroless for more details FROM gcr.io/distroless/static:debug WORKDIR / -COPY --from=builder /workspace/manager . +COPY --from=builder /workspaces/manager . USER 65532:65532 ENTRYPOINT ["/manager"] diff --git a/Makefile b/Makefile index 6b9ed0e2..326c4eb5 100644 --- a/Makefile +++ b/Makefile @@ -43,10 +43,12 @@ help: ## Display this help. 
.PHONY: manifests manifests: ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. + @rm -f config/crd/bases/*.yaml $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases .PHONY: helm-sync helm-sync: ## Sync CRDs and roles from config/crd/bases to Helm chart crds directory (for packaging) + @rm -f charts/gitops-reverser/crds/*.yaml @cp config/crd/bases/*.yaml charts/gitops-reverser/crds/ @cp config/rbac/role.yaml charts/gitops-reverser/config @@ -67,6 +69,7 @@ test: manifests generate fmt vet setup-envtest ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(shell pwd)/bin -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out KIND_CLUSTER ?= gitops-reverser-test-e2e +E2E_LOCAL_IMAGE ?= gitops-reverser:e2e-local .PHONY: setup-cluster setup-cluster: ## Set up a Kind cluster for e2e tests if it does not exist @@ -78,11 +81,40 @@ setup-cluster: ## Set up a Kind cluster for e2e tests if it does not exist .PHONY: cleanup-cluster cleanup-cluster: ## Tear down the Kind cluster used for e2e tests - @$(KIND) delete cluster --name $(KIND_CLUSTER) + @if $(KIND) get clusters 2>/dev/null | grep -q "^$(KIND_CLUSTER)$$"; then \ + echo "🧹 Deleting Kind cluster '$(KIND_CLUSTER)'"; \ + $(KIND) delete cluster --name $(KIND_CLUSTER); \ + else \ + echo "ℹ️ Kind cluster '$(KIND_CLUSTER)' does not exist; skipping cleanup"; \ + fi + +.PHONY: e2e-build-load-image +e2e-build-load-image: ## Build local image and load it into the Kind cluster used by local e2e flows + @if [ -n "$(PROJECT_IMAGE)" ]; then \ + echo "🐳 Building local image $(PROJECT_IMAGE)"; \ + $(CONTAINER_TOOL) build -t $(PROJECT_IMAGE) .; \ + echo "πŸ“¦ Loading image $(PROJECT_IMAGE) into Kind cluster $(KIND_CLUSTER)"; \ + $(KIND) load docker-image $(PROJECT_IMAGE) --name $(KIND_CLUSTER); \ + else \ + echo "🐳 Building local image $(E2E_LOCAL_IMAGE)"; \ + $(CONTAINER_TOOL) build 
-t $(E2E_LOCAL_IMAGE) .; \ + echo "πŸ“¦ Loading image $(E2E_LOCAL_IMAGE) into Kind cluster $(KIND_CLUSTER)"; \ + $(KIND) load docker-image $(E2E_LOCAL_IMAGE) --name $(KIND_CLUSTER); \ + fi .PHONY: test-e2e test-e2e: setup-cluster cleanup-webhook setup-e2e manifests setup-port-forwards ## Run end-to-end tests in Kind cluster, note that vet, fmt and generate are not run! - KIND_CLUSTER=$(KIND_CLUSTER) go test ./test/e2e/ -v -ginkgo.v + @echo "ℹ️ test-e2e reuses the existing Kind cluster (no cluster cleanup in this target)"; \ + if [ -n "$(PROJECT_IMAGE)" ]; then \ + echo "ℹ️ Entry point selected pre-built image (CI-friendly): $(PROJECT_IMAGE)"; \ + echo "ℹ️ Skipping local image build/load for pre-built image path"; \ + KIND_CLUSTER=$(KIND_CLUSTER) PROJECT_IMAGE="$(PROJECT_IMAGE)" go test ./test/e2e/ -v -ginkgo.v; \ + else \ + echo "ℹ️ Entry point selected local fallback image: $(E2E_LOCAL_IMAGE)"; \ + echo "ℹ️ Building/loading local image into existing cluster"; \ + $(MAKE) e2e-build-load-image KIND_CLUSTER=$(KIND_CLUSTER); \ + KIND_CLUSTER=$(KIND_CLUSTER) PROJECT_IMAGE="$(E2E_LOCAL_IMAGE)" go test ./test/e2e/ -v -ginkgo.v; \ + fi .PHONY: cleanup-webhook cleanup-webhook: ## Preventive cleanup of ValidatingWebhookConfiguration potenially left by previous test runs @@ -145,6 +177,7 @@ build-installer: manifests helm-sync ## Generate a consolidated YAML from Helm c @$(HELM) template gitops-reverser charts/gitops-reverser \ --namespace gitops-reverser \ --set labels.managedBy=kubectl \ + --set createNamespace=true \ --include-crds > dist/install.yaml @echo "βœ… Generated dist/install.yaml ($(shell wc -l < dist/install.yaml) lines)" @@ -160,12 +193,12 @@ uninstall: manifests ## Uninstall CRDs from the K8s cluster specified in ~/.kube .PHONY: deploy deploy: manifests ## Deploy controller to the K8s cluster specified in ~/.kube/config. 
- cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} - $(KUSTOMIZE) build config/default | $(KUBECTL) apply -f - + cd config && $(KUSTOMIZE) edit set image gitops-reverser=${IMG} + $(KUSTOMIZE) build config | $(KUBECTL) apply -f - .PHONY: undeploy undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. - $(KUSTOMIZE) build config/default | $(KUBECTL) delete --ignore-not-found=true -f - + $(KUSTOMIZE) build config | $(KUBECTL) delete --ignore-not-found=true -f - ##@ Dependencies @@ -183,7 +216,7 @@ ENVTEST_K8S_VERSION ?= $(shell go list -m -f "{{ .Version }}" k8s.io/api | awk - # Gitea E2E Configuration GITEA_NAMESPACE ?= gitea-e2e - GITEA_CHART_VERSION ?= 12.5.0 # https://gitea.com/gitea/helm-gitea +GITEA_CHART_VERSION ?= 12.5.0 # https://gitea.com/gitea/helm-gitea .PHONY: setup-envtest setup-envtest: ## Setup envtest binaries for unit tests @@ -246,3 +279,46 @@ cleanup-prometheus-e2e: ## Clean up Prometheus e2e environment .PHONY: setup-e2e setup-e2e: setup-cert-manager setup-gitea-e2e setup-prometheus-e2e ## Setup all e2e test infrastructure @echo "βœ… E2E infrastructure initialized" + +.PHONY: wait-cert-manager +wait-cert-manager: setup-cert-manager ## Wait for cert-manager pods to become ready + @$(KUBECTL) wait --for=condition=ready pod -l app.kubernetes.io/instance=cert-manager -n cert-manager --timeout=300s + +## Smoke test: install from local Helm chart and verify rollout +.PHONY: test-e2e-install +test-e2e-install: ## Smoke test install with E2E_INSTALL_MODE=helm|manifest + @MODE="$(E2E_INSTALL_MODE)"; \ + if [ "$$MODE" != "helm" ] && [ "$$MODE" != "manifest" ]; then \ + echo "❌ Invalid E2E_INSTALL_MODE='$$MODE' (expected: helm|manifest)"; \ + exit 1; \ + fi; \ + PROJECT_IMAGE_VALUE="$(PROJECT_IMAGE)"; \ + if [ -n "$$PROJECT_IMAGE_VALUE" ]; then \ + echo "ℹ️ Entry point selected pre-built image (probably running in CI): 
$$PROJECT_IMAGE_VALUE"; \ + echo "ℹ️ Skipping cluster cleanup for pre-built image path"; \ + KIND_CLUSTER=$(KIND_CLUSTER) $(MAKE) setup-cluster setup-e2e wait-cert-manager; \ + else \ + PROJECT_IMAGE_VALUE="$(E2E_LOCAL_IMAGE)"; \ + echo "🧹 Local fallback path: cleaning cluster to test a clean install"; \ + KIND_CLUSTER=$(KIND_CLUSTER) $(MAKE) cleanup-cluster; \ + echo "ℹ️ Entry point selected local fallback image: $$PROJECT_IMAGE_VALUE"; \ + KIND_CLUSTER=$(KIND_CLUSTER) PROJECT_IMAGE="$$PROJECT_IMAGE_VALUE" $(MAKE) setup-cluster setup-e2e wait-cert-manager e2e-build-load-image; \ + fi; \ + echo "ℹ️ Running install smoke mode: $$MODE"; \ + PROJECT_IMAGE="$$PROJECT_IMAGE_VALUE" bash test/e2e/scripts/install-smoke.sh "$$MODE"; \ + +## Smoke test: install from local Helm chart and verify rollout +.PHONY: test-e2e-install-helm +test-e2e-install-helm: + @$(MAKE) test-e2e-install E2E_INSTALL_MODE=helm PROJECT_IMAGE="$(PROJECT_IMAGE)" KIND_CLUSTER="$(KIND_CLUSTER)" + +## Smoke test: install from generated dist/install.yaml and verify rollout +.PHONY: test-e2e-install-manifest +test-e2e-install-manifest: + @if [ -n "$(PROJECT_IMAGE)" ]; then \ + echo "ℹ️ test-e2e-install-manifest using existing artifact (PROJECT_IMAGE set, CI/pre-built path)"; \ + else \ + echo "ℹ️ test-e2e-install-manifest local path: regenerating dist/install.yaml via build-installer"; \ + $(MAKE) build-installer; \ + fi + @$(MAKE) test-e2e-install E2E_INSTALL_MODE=manifest PROJECT_IMAGE="$(PROJECT_IMAGE)" KIND_CLUSTER="$(KIND_CLUSTER)" diff --git a/README.md b/README.md index e2498971..3a0491f9 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ Reverse GitOps gives you both: the interactivity of the Kubernetes API with Git' 🚨 This is early stage software. CRDs and behavior may change; not recommended for production yet. Feedback and contributions are very welcome! +Current limitation: GitOps Reverser must run as a single pod (`replicas=1`). Multi-pod/HA operation is not supported yet. 
+ ### Use of AI I have been thinking about the idea behind GitOps Reverser for several years (I've given up my fulltime job to work on it). Some of the hardest parts, especially writing to Git efficiently and safely under load, were designed and implemented manually. The rest is vibe coded, and needs more refinement before I would run it in production. @@ -156,16 +158,15 @@ Avoid infinite loops: Do not point GitOps (Argo CD/Flux) and GitOps Reverser at - Drift detection (use commits as alert inputs) - Hybrid (traditional GitOps for infra; Reverser for app/config changes) -## Known limitations - -- Avoid multiple GitProvider configurations pointing at the same repo to prevent queue collisions (see [`docs/TODO.md`](docs/TODO.md)). -- Queue collisions are possible when multiple configs target the same repository; mitigation is planned. +## Known limitations / design choices -## Monitoring +- GitOps Reverser currently supports only a single controller pod (no multi-pod/HA yet). +- `Secret` resources (`core/v1`, `secrets`) are intentionally ignored and never written to Git, even if a `WatchRule` includes `secrets` or `*`. +- Avoid multiple GitProvider configurations pointing at the same repo to prevent queue collisions. +- Queue collisions are possible when multiple configs target the same repository (so don't do that). -Exposes basic OpenTelemetry metrics. See `config/prometheus/` for example manifests. 
-## Other options to consider +## Other applications to consider | **Tool** | **How it Works** | **Key Differences** | |---|---|---| diff --git a/charts/gitops-reverser/README.md b/charts/gitops-reverser/README.md index 800f1931..d3cd37a9 100644 --- a/charts/gitops-reverser/README.md +++ b/charts/gitops-reverser/README.md @@ -72,36 +72,33 @@ kubectl apply -f https://github.com/ConfigButler/gitops-reverser/releases/latest ## Architecture -### High Availability Setup +### Deployment Topology -The chart deploys 2 replicas by default with leader election: +The chart deploys 1 replica by default: ``` β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ Kubernetes API Server β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ - β”‚ webhook requests + β”‚ webhook + audit + metrics requests β–Ό β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ gitops-reverser-leader-only (Service) β”‚ -β”‚ Routes to: role=leader β”‚ +β”‚ gitops-reverser (Service) β”‚ +β”‚ Ports: admission(9443), audit(9444), metrics(8080) | β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β–Ό β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Pod 1 β”‚ β”‚ Pod 2 β”‚ -β”‚ LEADER │◄───────── STANDBY β”‚ -β”‚ Active β”‚ electionβ”‚ Ready β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Pod 1 β”‚ + β”‚ Controller β”‚ + β”‚ Active β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` **Key Features:** -- **Leader-only service**: Routes webhook traffic only to the active leader pod -- **Automatic 
failover**: Standby pod takes over if leader fails -- **Pod anti-affinity**: Pods spread across different nodes -- **Pod disruption budget**: Ensures at least 1 pod available during maintenance +- **Single-pod operation**: Minimal moving parts while HA work is deferred +- **Single Service topology**: admission, audit, and metrics on one Service ## Configuration @@ -109,13 +106,12 @@ The chart deploys 2 replicas by default with leader election: #### Minimal (Testing/Development) -Single replica, no HA: +Single replica: ```yaml # minimal-values.yaml replicaCount: 1 controllerManager: - leaderElection: false podDisruptionBudget: enabled: false affinity: {} @@ -131,15 +127,15 @@ helm install gitops-reverser \ #### Production (Recommended) -Enhanced HA with 3 replicas: +Hardened single-replica deployment: ```yaml # production-values.yaml -replicaCount: 3 +replicaCount: 1 podDisruptionBudget: enabled: true - minAvailable: 2 + minAvailable: 1 resources: requests: @@ -178,12 +174,26 @@ webhook: | Parameter | Description | Default | |-----------|-------------|---------| -| `namespaceCreation.enabled` | Create namespace automatically | `true` | -| `replicaCount` | Number of controller replicas | `2` | -| `leaderOnlyService.enabled` | Create service routing to leader only | `true` | +| `replicaCount` | Number of controller replicas (can't be higher than 1 for now, sorry) | `1` | | `image.repository` | Container image repository | `ghcr.io/configbutler/gitops-reverser` | -| `controllerManager.leaderElection` | Enable leader election | `true` | | `webhook.validating.failurePolicy` | Webhook failure policy (Ignore/Fail) | `Ignore` | +| `servers.admission.tls.enabled` | Serve admission webhook with TLS (disable only for local/testing) | `true` | +| `servers.admission.tls.secretName` | Secret name for admission TLS cert/key | `-admission-server-cert` | +| `servers.audit.port` | Audit container port | `9444` | +| `servers.audit.tls.enabled` | Serve audit ingress with TLS | `true` | 
+| `servers.audit.maxRequestBodyBytes` | Max accepted audit request size | `10485760` | +| `servers.audit.timeouts.read` | Audit-server read timeout | `15s` | +| `servers.audit.timeouts.write` | Audit-server write timeout | `30s` | +| `servers.audit.timeouts.idle` | Audit-server idle timeout | `60s` | +| `servers.audit.tls.secretName` | Secret name for audit TLS cert/key | `-audit-server-cert` | +| `servers.metrics.bindAddress` | Metrics listener bind address | `:8080` | +| `servers.metrics.tls.enabled` | Serve metrics with TLS | `false` | +| `servers.metrics.tls.certPath` | Metrics TLS certificate mount path | `/tmp/k8s-metrics-server/metrics-server-certs` | +| `servers.metrics.tls.secretName` | Secret name for metrics TLS cert/key | `-metrics-server-cert` | +| `service.clusterIP` | Optional fixed ClusterIP for single controller Service | `""` | +| `service.ports.admission` | Service port for admission webhook | `9443` | +| `service.ports.audit` | Service port for audit ingress | `9444` | +| `service.ports.metrics` | Service port for metrics | `8080` | | `certificates.certManager.enabled` | Use cert-manager for certificates | `true` | | `podDisruptionBudget.enabled` | Enable PodDisruptionBudget | `true` | | `resources.requests.cpu` | CPU request | `10m` | @@ -193,6 +203,14 @@ webhook: See [`values.yaml`](values.yaml) for complete configuration options. +### Audit Webhook URL Contract + +Source clusters must target: + +`https://:9444/audit-webhook/` + +The bare path `/audit-webhook` is rejected. Use a non-empty cluster ID segment. 
+ ## Custom Resource Definitions (CRDs) This chart automatically manages the following CRDs: @@ -221,7 +239,7 @@ kubectl delete crd gitrepoconfigs.configbutler.ai watchrules.configbutler.ai ### Verify Installation ```bash -# Check pods (should see 2 replicas) +# Check pods (should see 1 replica) kubectl get pods -n gitops-reverser-system # Check CRDs @@ -230,8 +248,6 @@ kubectl get crd | grep configbutler # Check webhook kubectl get validatingwebhookconfiguration -l app.kubernetes.io/name=gitops-reverser -# Check leader election -kubectl get lease -n gitops-reverser-system ``` ### View Logs @@ -240,15 +256,15 @@ kubectl get lease -n gitops-reverser-system # All pods kubectl logs -n gitops-reverser-system -l app.kubernetes.io/name=gitops-reverser -f -# Leader pod only -kubectl logs -n gitops-reverser-system -l role=leader -f ``` ### Access Metrics ```bash -kubectl port-forward -n gitops-reverser-system svc/gitops-reverser-metrics-service 8080:8080 +kubectl port-forward -n gitops-reverser-system svc/gitops-reverser 8080:8080 curl http://localhost:8080/metrics +# If metrics TLS is enabled: +# curl -k https://localhost:8080/metrics ``` ## Upgrading @@ -274,7 +290,7 @@ helm upgrade gitops-reverser \ If upgrading from earlier chart versions: -- Default replicas changed from 1 to 2 (adjust `replicaCount` if needed) +- Single-replica is the default during the current simplified topology phase - Leader election now enabled by default (required for HA) - Health probe port changed to 8081 - Certificate secret names are auto-generated @@ -300,7 +316,7 @@ Check certificate status: ```bash kubectl get certificate -n gitops-reverser-system -kubectl describe certificate gitops-reverser-webhook-server-tls-cert -n gitops-reverser-system +kubectl describe certificate gitops-reverser-admission-server-cert -n gitops-reverser-system ``` If cert-manager is not working: @@ -313,20 +329,6 @@ kubectl logs -n cert-manager -l app=cert-manager -f kubectl rollout restart deployment cert-manager 
-n cert-manager ``` -### Leader Election Issues - -Check which pod is the leader: - -```bash -# View lease -kubectl get lease -n gitops-reverser-system - -# View pod labels -kubectl get pods -n gitops-reverser-system --show-labels - -# Leader should have label: role=leader -``` - ### Pods Not Scheduling If pods are pending due to anti-affinity rules: @@ -335,7 +337,7 @@ If pods are pending due to anti-affinity rules: # Check node count kubectl get nodes -# If you have only 1 node, reduce replicas or disable affinity +# If you have only 1 node, keep a single replica or disable affinity helm upgrade gitops-reverser \ oci://ghcr.io/configbutler/charts/gitops-reverser \ --namespace gitops-reverser-system \ @@ -384,33 +386,12 @@ webhook: Create certificate secret manually: ```bash -kubectl create secret tls webhook-server-cert \ +kubectl create secret tls gitops-reverser-admission-server-cert \ --cert=path/to/tls.crt \ --key=path/to/tls.key \ -n gitops-reverser-system ``` -### Network Policies - -Enable network policies for additional security: - -```yaml -networkPolicy: - enabled: true - ingress: - - from: - - namespaceSelector: {} - ports: - - protocol: TCP - port: 9443 # webhook port - egress: - - to: - - namespaceSelector: {} - ports: - - protocol: TCP - port: 443 # Kubernetes API -``` - ### Custom Resource Limits For clusters with high resource usage: diff --git a/charts/gitops-reverser/templates/validating-webhook.yaml b/charts/gitops-reverser/templates/admission-webhook.yaml similarity index 90% rename from charts/gitops-reverser/templates/validating-webhook.yaml rename to charts/gitops-reverser/templates/admission-webhook.yaml index 7ea831f6..ec2b6a97 100644 --- a/charts/gitops-reverser/templates/validating-webhook.yaml +++ b/charts/gitops-reverser/templates/admission-webhook.yaml @@ -7,15 +7,16 @@ metadata: {{- include "gitops-reverser.labels" . 
| nindent 4 }} {{- if .Values.certificates.certManager.enabled }} annotations: - cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "gitops-reverser.fullname" . }}-serving-cert + cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "gitops-reverser.fullname" . }}-admission-server-cert {{- end }} webhooks: - admissionReviewVersions: - v1 clientConfig: service: - name: {{ include "gitops-reverser.fullname" . }}-leader-only + name: {{ include "gitops-reverser.fullname" . }} namespace: {{ .Release.Namespace }} + port: {{ .Values.service.ports.admission }} path: /process-validating-webhook {{- if not .Values.certificates.certManager.enabled }} caBundle: {{ .Values.webhook.caBundle | b64enc }} diff --git a/charts/gitops-reverser/templates/certificates.yaml b/charts/gitops-reverser/templates/certificates.yaml index 235e492a..8e6ad918 100644 --- a/charts/gitops-reverser/templates/certificates.yaml +++ b/charts/gitops-reverser/templates/certificates.yaml @@ -15,22 +15,69 @@ spec: apiVersion: cert-manager.io/v1 kind: Certificate metadata: - name: {{ include "gitops-reverser.fullname" . }}-serving-cert + name: {{ include "gitops-reverser.fullname" . }}-admission-server-cert namespace: {{ .Release.Namespace }} labels: {{- include "gitops-reverser.labels" . | nindent 4 }} spec: -{{- if .Values.audit.clusterIP }} + dnsNames: + - {{ include "gitops-reverser.fullname" . }}.{{ .Release.Namespace }}.svc + - {{ include "gitops-reverser.fullname" . 
}}.{{ .Release.Namespace }}.svc.cluster.local + issuerRef: + kind: {{ .Values.certificates.certManager.issuer.kind }} + name: {{ .Values.certificates.certManager.issuer.name }} + secretName: {{ .Values.servers.admission.tls.secretName | default (printf "%s-admission-server-cert" (include "gitops-reverser.fullname" .)) }} + usages: + - digital signature + - key encipherment + - server auth + privateKey: + rotationPolicy: Always +{{- if .Values.servers.audit.tls.enabled }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "gitops-reverser.fullname" . }}-audit-server-cert + namespace: {{ .Release.Namespace }} + labels: + {{- include "gitops-reverser.labels" . | nindent 4 }} +spec: +{{- if .Values.service.clusterIP }} ipAddresses: - - {{ .Values.audit.clusterIP }} + - {{ .Values.service.clusterIP }} +{{- end }} + dnsNames: + - {{ include "gitops-reverser.fullname" . }}.{{ .Release.Namespace }}.svc + - {{ include "gitops-reverser.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local + issuerRef: + kind: {{ .Values.certificates.certManager.issuer.kind }} + name: {{ .Values.certificates.certManager.issuer.name }} + secretName: {{ .Values.servers.audit.tls.secretName | default (printf "%s-audit-server-cert" (include "gitops-reverser.fullname" .)) }} + usages: + - digital signature + - key encipherment + - server auth + privateKey: + rotationPolicy: Always {{- end }} +{{- if .Values.servers.metrics.tls.enabled }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "gitops-reverser.fullname" . }}-metrics-server-cert + namespace: {{ .Release.Namespace }} + labels: + {{- include "gitops-reverser.labels" . | nindent 4 }} +spec: dnsNames: - - {{ include "gitops-reverser.fullname" . }}-leader-only.{{ .Release.Namespace }}.svc - - {{ include "gitops-reverser.fullname" . }}-leader-only.{{ .Release.Namespace }}.svc.cluster.local + - {{ include "gitops-reverser.fullname" . 
}}.{{ .Release.Namespace }}.svc + - {{ include "gitops-reverser.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local issuerRef: kind: {{ .Values.certificates.certManager.issuer.kind }} name: {{ .Values.certificates.certManager.issuer.name }} - secretName: {{ include "gitops-reverser.fullname" . }}-webhook-server-tls-cert + secretName: {{ .Values.servers.metrics.tls.secretName | default (printf "%s-metrics-server-cert" (include "gitops-reverser.fullname" .)) }} usages: - digital signature - key encipherment @@ -38,3 +85,4 @@ spec: privateKey: rotationPolicy: Always {{- end }} +{{- end }} diff --git a/charts/gitops-reverser/templates/configmap.yaml b/charts/gitops-reverser/templates/configmap.yaml index 3d653265..619a24b9 100644 --- a/charts/gitops-reverser/templates/configmap.yaml +++ b/charts/gitops-reverser/templates/configmap.yaml @@ -12,16 +12,13 @@ data: health: healthProbeBindAddress: {{ .Values.controllerManager.healthProbe.bindAddress }} metrics: - bindAddress: {{ .Values.controllerManager.metrics.bindAddress }} + bindAddress: {{ .Values.servers.metrics.bindAddress }} webhook: - port: {{ .Values.webhook.server.port }} - leaderElection: - leaderElect: {{ .Values.controllerManager.leaderElection }} - resourceName: 9ed3440e.configbutler.ai + port: {{ .Values.servers.admission.port }} {{- if .Values.logging }} logging: level: {{ .Values.logging.level | default "info" }} development: {{ .Values.logging.development | default false }} encoder: {{ .Values.logging.encoder | default "json" }} stacktraceLevel: {{ .Values.logging.stacktraceLevel | default "error" }} - {{- end }} \ No newline at end of file + {{- end }} diff --git a/charts/gitops-reverser/templates/deployment.yaml b/charts/gitops-reverser/templates/deployment.yaml index 5adf3cf1..49e8a379 100644 --- a/charts/gitops-reverser/templates/deployment.yaml +++ b/charts/gitops-reverser/templates/deployment.yaml @@ -11,6 +11,8 @@ metadata: control-plane: controller-manager spec: replicas: {{ 
.Values.replicaCount }} + strategy: + {{- toYaml .Values.deploymentStrategy | nindent 4 }} selector: matchLabels: {{- include "gitops-reverser.selectorLabels" . | nindent 6 }} @@ -31,7 +33,7 @@ spec: serviceAccountName: {{ include "gitops-reverser.serviceAccountName" . }} securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} - terminationGracePeriodSeconds: 20 # Shutting down the leaders requires leader transfer to be completed before we shut down the pod (can take some time). + terminationGracePeriodSeconds: 20 containers: - name: manager securityContext: @@ -39,15 +41,35 @@ spec: image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} args: - {{- if .Values.controllerManager.leaderElection }} - - --leader-elect - {{- end }} - --health-probe-bind-address=:8081 - - --metrics-bind-address=:8080 - - --metrics-secure=false - - --webhook-cert-path={{ .Values.webhook.server.certPath }} - - --webhook-cert-name={{ .Values.webhook.server.certName }} - - --webhook-cert-key={{ .Values.webhook.server.certKey }} + - --metrics-bind-address={{ .Values.servers.metrics.bindAddress }} + {{- if not .Values.servers.metrics.tls.enabled }} + - --metrics-insecure + {{- else }} + - --metrics-cert-path={{ .Values.servers.metrics.tls.certPath }} + - --metrics-cert-name={{ .Values.servers.metrics.tls.certName }} + - --metrics-cert-key={{ .Values.servers.metrics.tls.certKey }} + {{- end }} + {{- if .Values.servers.admission.tls.enabled }} + - --webhook-cert-path={{ .Values.servers.admission.tls.certPath }} + - --webhook-cert-name={{ .Values.servers.admission.tls.certName }} + - --webhook-cert-key={{ .Values.servers.admission.tls.certKey }} + {{- else }} + - --webhook-insecure + {{- end }} + {{- if .Values.servers.audit.tls.enabled }} + - --audit-cert-path={{ .Values.servers.audit.tls.certPath }} + - --audit-cert-name={{ .Values.servers.audit.tls.certName }} + - --audit-cert-key={{ 
.Values.servers.audit.tls.certKey }} + {{- else }} + - --audit-insecure + {{- end }} + - --audit-listen-address={{ .Values.servers.audit.listenAddress }} + - --audit-port={{ .Values.servers.audit.port }} + - --audit-max-request-body-bytes={{ int64 .Values.servers.audit.maxRequestBodyBytes }} + - --audit-read-timeout={{ .Values.servers.audit.timeouts.read }} + - --audit-write-timeout={{ .Values.servers.audit.timeouts.write }} + - --audit-idle-timeout={{ .Values.servers.audit.timeouts.idle }} {{- if .Values.logging.level }} - --zap-log-level={{ .Values.logging.level }} {{- end }} @@ -64,11 +86,14 @@ spec: - --audit-dump-path=/var/run/audit-dumps {{- end }} ports: - - name: webhook-server - containerPort: {{ .Values.webhook.server.port }} + - name: admission + containerPort: {{ .Values.servers.admission.port }} + protocol: TCP + - name: audit + containerPort: {{ .Values.servers.audit.port }} protocol: TCP - name: metrics - containerPort: {{ .Values.controllerManager.metrics.port }} + containerPort: {{ .Values.servers.metrics.port }} protocol: TCP env: - name: POD_NAME @@ -103,9 +128,21 @@ spec: {{- end }} - name: tmp-dir mountPath: /tmp - - name: cert - mountPath: {{ .Values.webhook.server.certPath }} + {{- if .Values.servers.admission.tls.enabled }} + - name: admission-cert + mountPath: {{ .Values.servers.admission.tls.certPath }} + readOnly: true + {{- end }} + {{- if .Values.servers.metrics.tls.enabled }} + - name: metrics-cert + mountPath: {{ .Values.servers.metrics.tls.certPath }} + readOnly: true + {{- end }} + {{- if .Values.servers.audit.tls.enabled }} + - name: audit-cert + mountPath: {{ .Values.servers.audit.tls.certPath }} readOnly: true + {{- end }} {{- with .Values.volumeMounts }} {{- toYaml . 
| nindent 12 }} {{- end }} @@ -116,10 +153,24 @@ spec: {{- end }} - name: tmp-dir emptyDir: {} - - name: cert + {{- if .Values.servers.admission.tls.enabled }} + - name: admission-cert + secret: + secretName: {{ .Values.servers.admission.tls.secretName | default (printf "%s-admission-server-cert" (include "gitops-reverser.fullname" .)) }} + defaultMode: 420 + {{- end }} + {{- if .Values.servers.metrics.tls.enabled }} + - name: metrics-cert secret: - secretName: {{ include "gitops-reverser.fullname" . }}-webhook-server-tls-cert + secretName: {{ .Values.servers.metrics.tls.secretName | default (printf "%s-metrics-server-cert" (include "gitops-reverser.fullname" .)) }} defaultMode: 420 + {{- end }} + {{- if .Values.servers.audit.tls.enabled }} + - name: audit-cert + secret: + secretName: {{ .Values.servers.audit.tls.secretName | default (printf "%s-audit-server-cert" (include "gitops-reverser.fullname" .)) }} + defaultMode: 420 + {{- end }} {{- with .Values.volumes }} {{- toYaml . | nindent 8 }} {{- end }} diff --git a/charts/gitops-reverser/templates/namespace.yaml b/charts/gitops-reverser/templates/namespace.yaml new file mode 100644 index 00000000..54e579d7 --- /dev/null +++ b/charts/gitops-reverser/templates/namespace.yaml @@ -0,0 +1,8 @@ +{{- if .Values.createNamespace }} +apiVersion: v1 +kind: Namespace +metadata: + name: {{ .Release.Namespace }} + labels: + {{- include "gitops-reverser.labels" . | nindent 4 }} +{{- end }} diff --git a/charts/gitops-reverser/templates/rbac.yaml b/charts/gitops-reverser/templates/rbac.yaml index 1ecde19f..a37fba09 100644 --- a/charts/gitops-reverser/templates/rbac.yaml +++ b/charts/gitops-reverser/templates/rbac.yaml @@ -72,61 +72,6 @@ roleRef: kind: ClusterRole name: {{ include "gitops-reverser.fullname" . }}-proxy-role subjects: -- kind: ServiceAccount - name: {{ include "gitops-reverser.serviceAccountName" . 
}} - namespace: {{ .Release.Namespace }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: {{ include "gitops-reverser.fullname" . }}-leader-election-role - namespace: {{ .Release.Namespace }} - labels: - {{- include "gitops-reverser.labels" . | nindent 4 }} -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - "" - resources: - - events - verbs: - - create ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: {{ include "gitops-reverser.fullname" . }}-leader-election-rolebinding - namespace: {{ .Release.Namespace }} - labels: - {{- include "gitops-reverser.labels" . | nindent 4 }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: {{ include "gitops-reverser.fullname" . }}-leader-election-role -subjects: - kind: ServiceAccount name: {{ include "gitops-reverser.serviceAccountName" . }} namespace: {{ .Release.Namespace }} diff --git a/charts/gitops-reverser/templates/services.yaml b/charts/gitops-reverser/templates/services.yaml index 76a6ab1b..063c787d 100644 --- a/charts/gitops-reverser/templates/services.yaml +++ b/charts/gitops-reverser/templates/services.yaml @@ -1,42 +1,31 @@ -# We don't have a 'normal' API yet (a very simple one is running for our health probing): so there is no need yet for a generic service ---- apiVersion: v1 kind: Service metadata: - name: {{ include "gitops-reverser.fullname" . }}-leader-only + name: {{ include "gitops-reverser.fullname" . }} namespace: {{ .Release.Namespace }} labels: {{- include "gitops-reverser.labels" . 
| nindent 4 }} - app.kubernetes.io/component: leader-only + app.kubernetes.io/component: controller + prometheus.io/scrape: "true" + prometheus.io/scheme: {{ ternary "https" "http" .Values.servers.metrics.tls.enabled | quote }} + prometheus.io/port: "{{ .Values.service.ports.metrics }}" spec: type: ClusterIP - {{- if .Values.audit.clusterIP }} - clusterIP: {{ .Values.audit.clusterIP }} + {{- if .Values.service.clusterIP }} + clusterIP: {{ .Values.service.clusterIP }} {{- end }} ports: - - name: webhook-server - port: 443 - targetPort: {{ .Values.webhook.server.port }} + - name: admission + port: {{ .Values.service.ports.admission }} + targetPort: {{ .Values.servers.admission.port }} + protocol: TCP + - name: audit + port: {{ .Values.service.ports.audit }} + targetPort: {{ .Values.servers.audit.port }} protocol: TCP - selector: - {{- include "gitops-reverser.selectorLabels" . | nindent 4 }} - role: leader # Pods get this label from within the operator source code: the Kube API lease mechanism is used to always have one active leader. ---- -apiVersion: v1 -kind: Service -metadata: - name: {{ include "gitops-reverser.fullname" . }}-metrics - namespace: {{ .Release.Namespace }} - labels: - {{- include "gitops-reverser.labels" . | nindent 4 }} - prometheus.io/scrape: "true" - prometheus.io/port: "{{ .Values.controllerManager.metrics.port }}" -spec: - type: ClusterIP - ports: - name: metrics - port: {{ .Values.controllerManager.metrics.port }} - targetPort: {{ .Values.controllerManager.metrics.port }} + port: {{ .Values.service.ports.metrics }} + targetPort: {{ .Values.servers.metrics.port }} protocol: TCP selector: {{- include "gitops-reverser.selectorLabels" . 
| nindent 4 }} diff --git a/charts/gitops-reverser/templates/validate-replica-count.yaml b/charts/gitops-reverser/templates/validate-replica-count.yaml new file mode 100644 index 00000000..d12b2e34 --- /dev/null +++ b/charts/gitops-reverser/templates/validate-replica-count.yaml @@ -0,0 +1,3 @@ +{{- if gt (int .Values.replicaCount) 1 -}} +{{- fail "gitops-reverser does not support HA yet (Sorry I feel your pain: but it can't be perfect from the start). Set .Values.replicaCount to 1." -}} +{{- end -}} diff --git a/charts/gitops-reverser/values.yaml b/charts/gitops-reverser/values.yaml index ef42416d..0a9f2fb9 100644 --- a/charts/gitops-reverser/values.yaml +++ b/charts/gitops-reverser/values.yaml @@ -4,12 +4,11 @@ # High Availability configuration - runs 1 replicas by default (HA support is not good enough yet) replicaCount: 1 - -# Leader-only service configuration -leaderOnlyService: - # When true, creates a dedicated service that routes traffic only to the leader pod - # This is critical for HA deployments to ensure consistent processing of incomming API server events - enabled: true +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 image: repository: ghcr.io/configbutler/gitops-reverser @@ -19,6 +18,9 @@ image: imagePullSecrets: [] nameOverride: "" fullnameOverride: "" +# Create the release namespace as part of chart rendering (useful for install.yaml workflows). +# Keep false for standard Helm installs where --create-namespace is preferred. 
+createNamespace: false serviceAccount: # Specifies whether a service account should be created @@ -50,28 +52,54 @@ securityContext: # Controller manager configuration controllerManager: - # Enable leader election for controller manager (required for HA) - leaderElection: true # Health probe configuration healthProbe: bindAddress: :8081 - # Metrics configuration - metrics: - port: 8080 - bindAddress: 127.0.0.1:8080 # Enable HTTP/2 (disabled by default for security) enableHTTP2: false -# Webhook configuration -webhook: - enabled: true - # Webhook server configuration - server: +# HTTPS servers +servers: + admission: port: 9443 - certPath: "/tmp/k8s-webhook-server/serving-certs" - certName: "tls.crt" - certKey: "tls.key" + tls: + # Controls webhook TLS wiring in the controller process. + # Keep enabled for normal Kubernetes webhook operation. + enabled: true + certPath: "/tmp/k8s-admission-server/admission-server-certs" + certName: "tls.crt" + certKey: "tls.key" + secretName: "" + audit: + listenAddress: 0.0.0.0 + port: 9444 + tls: + # Serve audit ingress over HTTPS when true, HTTP when false. + enabled: true + certPath: "/tmp/k8s-audit-server/audit-server-certs" + certName: "tls.crt" + certKey: "tls.key" + secretName: "" + timeouts: + read: "15s" + write: "30s" + idle: "60s" + maxRequestBodyBytes: 10485760 + + metrics: + bindAddress: :8080 + port: 8080 + tls: + # Serve metrics over HTTPS when true, HTTP when false. 
+ enabled: false + certPath: "/tmp/k8s-metrics-server/metrics-server-certs" + certName: "tls.crt" + certKey: "tls.key" + secretName: "" + +# Webhook behavior +webhook: audit: # Set to true if you want to write events to /var/run/audit-dumps debugDumps: false @@ -110,10 +138,6 @@ certificates: kind: Issuer create: true -# Settings for receiving audit events from the kubernetes api (the best way to run this) -audit: - clusterIP: 10.43.200.200 # Make sure that it's free, most clusters won't have a problem with this (it's in the default range and it's a high number) - # RBAC configuration rbac: create: true @@ -161,9 +185,20 @@ monitoring: path: /metrics # Port name on the metrics Service (see templates/services.yaml) port: metrics - # Plain HTTP by default (--metrics-secure=false) + # Must match the effective metrics transport: + # - https when servers.metrics.tls.enabled=true + # - http when servers.metrics.tls.enabled=false scheme: http +# Service exposure +service: + # Optional fixed ClusterIP (useful for Kind/bootstrap environments before DNS is ready) + clusterIP: "" + ports: + admission: 9443 + audit: 9444 + metrics: 8080 + # Logging configuration logging: level: info diff --git a/cmd/main.go b/cmd/main.go index 3892e0c3..6b2315c2 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -20,9 +20,15 @@ package main import ( "context" "crypto/tls" + "errors" "flag" + "fmt" + "net" + "net/http" "os" "path/filepath" + "strconv" + "strings" "time" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) 
@@ -46,7 +52,6 @@ import ( "github.com/ConfigButler/gitops-reverser/internal/controller" "github.com/ConfigButler/gitops-reverser/internal/correlation" "github.com/ConfigButler/gitops-reverser/internal/git" - "github.com/ConfigButler/gitops-reverser/internal/leader" "github.com/ConfigButler/gitops-reverser/internal/metrics" "github.com/ConfigButler/gitops-reverser/internal/reconcile" "github.com/ConfigButler/gitops-reverser/internal/rulestore" @@ -62,8 +67,15 @@ var ( const ( // Correlation store configuration. - correlationMaxEntries = 10000 - correlationTTL = 5 * time.Minute + correlationMaxEntries = 10000 + correlationTTL = 5 * time.Minute + flagParseFailureExitCode = 2 + defaultAuditPort = 9444 + defaultAuditMaxBodyBytes = int64(10 * 1024 * 1024) + defaultAuditReadTimeout = 15 * time.Second + defaultAuditWriteTimeout = 30 * time.Second + defaultAuditIdleTimeout = 60 * time.Second + defaultAuditShutdownTimeout = 10 * time.Second ) func init() { @@ -81,7 +93,9 @@ func main() { // Log metrics configuration for debugging setupLog.Info("Metrics configuration", "metrics-bind-address", cfg.metricsAddr, - "metrics-secure", cfg.secureMetrics) + "metrics-insecure", cfg.metricsInsecure, + "webhook-insecure", cfg.webhookInsecure, + "audit-insecure", cfg.auditInsecure) // Initialize metrics setupCtx := ctrl.SetupSignalHandler() @@ -93,19 +107,17 @@ func main() { // Servers and cert watchers webhookServer, webhookCertWatcher := initWebhookServer( + !cfg.webhookInsecure, cfg.webhookCertPath, cfg.webhookCertName, cfg.webhookCertKey, tlsOpts, ) metricsServerOptions, metricsCertWatcher := buildMetricsServerOptions( - cfg.metricsAddr, cfg.secureMetrics, + cfg.metricsAddr, !cfg.metricsInsecure, cfg.metricsCertPath, cfg.metricsCertName, cfg.metricsCertKey, tlsOpts, ) // Manager - mgr := newManager(metricsServerOptions, webhookServer, cfg.probeAddr, cfg.enableLeaderElection) - - // Leader labeler (if elected) - addLeaderPodLabeler(mgr, cfg.enableLeaderElection) + mgr := 
newManager(metricsServerOptions, webhookServer, cfg.probeAddr) // Initialize rule store for watch rules ruleStore := rulestore.NewStore() @@ -192,17 +204,27 @@ func main() { // Register experimental audit webhook for metrics collection auditHandler, err := webhookhandler.NewAuditHandler(webhookhandler.AuditHandlerConfig{ - DumpDir: cfg.auditDumpPath, + DumpDir: cfg.auditDumpPath, + MaxRequestBodyBytes: cfg.auditMaxRequestBodyBytes, }) fatalIfErr(err, "unable to create audit handler") - mgr.GetWebhookServer().Register("/audit-webhook", auditHandler) + + var auditCertWatcher *certwatcher.CertWatcher + + auditRunnable, watcher, initErr := initAuditServerRunnable(cfg, tlsOpts, auditHandler) + fatalIfErr(initErr, "unable to initialize audit ingress server") + auditCertWatcher = watcher + fatalIfErr(mgr.Add(auditRunnable), "unable to add audit ingress server runnable") + if cfg.auditDumpPath != "" { - setupLog.Info("Experimental audit webhook handler registered with file dumping", - "http-path", "/audit-webhook", - "dump-path", cfg.auditDumpPath) + setupLog.Info("Audit ingress server configured with file dumping", + "http-path", "/audit-webhook/{clusterID}", + "dump-path", cfg.auditDumpPath, + "address", buildAuditServerAddress(cfg.auditListenAddress, cfg.auditPort)) } else { - setupLog.Info("Experimental audit webhook handler registered (file dumping disabled)", - "http-path", "/audit-webhook") + setupLog.Info("Audit ingress server configured", + "http-path", "/audit-webhook/{clusterID}", + "address", buildAuditServerAddress(cfg.auditListenAddress, cfg.auditPort)) } // NOTE: Old git.Worker has been replaced by WorkerManager + BranchWorker architecture @@ -233,7 +255,7 @@ func main() { // +kubebuilder:scaffold:builder // Cert watchers - addCertWatchersToManager(mgr, metricsCertWatcher, webhookCertWatcher) + addCertWatchersToManager(mgr, metricsCertWatcher, webhookCertWatcher, auditCertWatcher) // Health checks addHealthChecks(mgr) @@ -245,64 +267,140 @@ func main() { // 
appConfig holds parsed CLI flags and logging options. type appConfig struct { - metricsAddr string - metricsCertPath string - metricsCertName string - metricsCertKey string - webhookCertPath string - webhookCertName string - webhookCertKey string - enableLeaderElection bool - probeAddr string - secureMetrics bool - enableHTTP2 bool - auditDumpPath string - zapOpts zap.Options + metricsAddr string + metricsCertPath string + metricsCertName string + metricsCertKey string + webhookCertPath string + webhookCertName string + webhookCertKey string + probeAddr string + metricsInsecure bool + webhookInsecure bool + enableHTTP2 bool + auditDumpPath string + auditListenAddress string + auditPort int + auditCertPath string + auditCertName string + auditCertKey string + auditInsecure bool + auditMaxRequestBodyBytes int64 + auditReadTimeout time.Duration + auditWriteTimeout time.Duration + auditIdleTimeout time.Duration + zapOpts zap.Options } // parseFlags parses CLI flags and returns the application configuration. func parseFlags() appConfig { + cfg, err := parseFlagsWithArgs(flag.CommandLine, os.Args[1:]) + if err != nil { + setupLog.Error(err, "unable to parse flags") + os.Exit(flagParseFailureExitCode) + } + return cfg +} + +func parseFlagsWithArgs(fs *flag.FlagSet, args []string) (appConfig, error) { var cfg appConfig - flag.StringVar(&cfg.metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ + fs.StringVar(&cfg.metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") - flag.StringVar(&cfg.probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") - flag.BoolVar(&cfg.enableLeaderElection, "leader-elect", false, - "Enable leader election for controller manager. 
"+ - "Enabling this will ensure there is only one active controller manager.") - flag.BoolVar(&cfg.secureMetrics, "metrics-secure", true, - "If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.") - flag.StringVar( - &cfg.webhookCertPath, - "webhook-cert-path", - "", - "The directory that contains the webhook certificate.", - ) - flag.StringVar(&cfg.webhookCertName, "webhook-cert-name", "tls.crt", "The name of the webhook certificate file.") - flag.StringVar(&cfg.webhookCertKey, "webhook-cert-key", "tls.key", "The name of the webhook key file.") - flag.StringVar(&cfg.metricsCertPath, "metrics-cert-path", "", - "The directory that contains the metrics server certificate.") - flag.StringVar( + fs.StringVar(&cfg.probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + fs.BoolVar(&cfg.metricsInsecure, "metrics-insecure", false, + "If set, the metrics endpoint is served via HTTP instead of HTTPS.") + bindServerCertFlags(fs, "webhook", "webhook", &cfg.webhookCertPath, &cfg.webhookCertName, &cfg.webhookCertKey) + bindServerCertFlags( + fs, + "metrics", + "metrics server", + &cfg.metricsCertPath, &cfg.metricsCertName, - "metrics-cert-name", - "tls.crt", - "The name of the metrics server certificate file.", + &cfg.metricsCertKey, ) - flag.StringVar(&cfg.metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") - flag.BoolVar(&cfg.enableHTTP2, "enable-http2", false, + fs.BoolVar(&cfg.webhookInsecure, "webhook-insecure", false, + "If set, webhook server certificate watching and TLS wiring are disabled for local test/play usage.") + fs.BoolVar(&cfg.enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers") - flag.StringVar(&cfg.auditDumpPath, "audit-dump-path", "", + fs.StringVar(&cfg.auditDumpPath, "audit-dump-path", "", "Directory to write audit events for debugging. 
If empty, audit event file dumping is disabled.") + fs.StringVar(&cfg.auditListenAddress, "audit-listen-address", "0.0.0.0", + "IP address for the dedicated audit ingress HTTPS server.") + fs.IntVar(&cfg.auditPort, "audit-port", defaultAuditPort, "Port for the dedicated audit ingress HTTPS server.") + bindServerCertFlags(fs, "audit", "audit ingress TLS", &cfg.auditCertPath, &cfg.auditCertName, &cfg.auditCertKey) + fs.BoolVar(&cfg.auditInsecure, "audit-insecure", false, + "If set, the audit ingress endpoint is served via HTTP instead of HTTPS.") + fs.Int64Var(&cfg.auditMaxRequestBodyBytes, "audit-max-request-body-bytes", defaultAuditMaxBodyBytes, + "Maximum request body size in bytes accepted by the audit ingress handler.") + fs.DurationVar(&cfg.auditReadTimeout, "audit-read-timeout", defaultAuditReadTimeout, + "Read timeout for the dedicated audit ingress HTTPS server.") + fs.DurationVar(&cfg.auditWriteTimeout, "audit-write-timeout", defaultAuditWriteTimeout, + "Write timeout for the dedicated audit ingress HTTPS server.") + fs.DurationVar(&cfg.auditIdleTimeout, "audit-idle-timeout", defaultAuditIdleTimeout, + "Idle timeout for the dedicated audit ingress HTTPS server.") cfg.zapOpts = zap.Options{ Development: true, // Enable more detailed logging for debugging Level: zapcore.InfoLevel, // Change to DebugLevel for even more verbose output } - cfg.zapOpts.BindFlags(flag.CommandLine) + cfg.zapOpts.BindFlags(fs) - flag.Parse() - return cfg + if err := fs.Parse(args); err != nil { + return appConfig{}, err + } + applyAuditCertFallbacks(&cfg) + if err := validateAuditConfig(cfg); err != nil { + return appConfig{}, err + } + + return cfg, nil +} + +func bindServerCertFlags( + fs *flag.FlagSet, + prefix string, + component string, + certPath, certName, certKey *string, +) { + fs.StringVar(certPath, fmt.Sprintf("%s-cert-path", prefix), "", + fmt.Sprintf("The directory that contains the %s certificate.", component)) + fs.StringVar(certName, fmt.Sprintf("%s-cert-name", 
prefix), "tls.crt", + fmt.Sprintf("The name of the %s certificate file.", component)) + fs.StringVar(certKey, fmt.Sprintf("%s-cert-key", prefix), "tls.key", + fmt.Sprintf("The name of the %s key file.", component)) +} + +func applyAuditCertFallbacks(cfg *appConfig) { + if cfg.auditCertPath == "" { + cfg.auditCertPath = cfg.webhookCertPath + } + if cfg.auditCertName == "" { + cfg.auditCertName = cfg.webhookCertName + } + if cfg.auditCertKey == "" { + cfg.auditCertKey = cfg.webhookCertKey + } +} + +func validateAuditConfig(cfg appConfig) error { + if cfg.auditPort <= 0 { + return fmt.Errorf("audit-port must be > 0, got %d", cfg.auditPort) + } + if cfg.auditMaxRequestBodyBytes <= 0 { + return fmt.Errorf("audit-max-request-body-bytes must be > 0, got %d", cfg.auditMaxRequestBodyBytes) + } + if cfg.auditReadTimeout <= 0 { + return fmt.Errorf("audit-read-timeout must be > 0, got %s", cfg.auditReadTimeout) + } + if cfg.auditWriteTimeout <= 0 { + return fmt.Errorf("audit-write-timeout must be > 0, got %s", cfg.auditWriteTimeout) + } + if cfg.auditIdleTimeout <= 0 { + return fmt.Errorf("audit-idle-timeout must be > 0, got %s", cfg.auditIdleTimeout) + } + return nil } // fatalIfErr logs and exits the process if err is not nil. @@ -334,27 +432,16 @@ func buildTLSOptions(enableHTTP2 bool) []func(*tls.Config) { // initWebhookServer initializes the webhook server and, if configured, a cert watcher. func initWebhookServer( + tlsEnabled bool, certPath, certName, certKey string, baseTLS []func(*tls.Config), ) (webhook.Server, *certwatcher.CertWatcher) { - webhookTLSOpts := append([]func(*tls.Config){}, baseTLS...) 
- var webhookCertWatcher *certwatcher.CertWatcher - - if len(certPath) > 0 { - setupLog.Info("Initializing webhook certificate watcher using provided certificates", - "webhook-cert-path", certPath, //nolint:lll // Structured log with many fields - "webhook-cert-name", certName, "webhook-cert-key", certKey) - - var err error - webhookCertWatcher, err = certwatcher.New( - filepath.Join(certPath, certName), - filepath.Join(certPath, certKey), - ) - fatalIfErr(err, "Failed to initialize webhook certificate watcher") - - webhookTLSOpts = append(webhookTLSOpts, func(config *tls.Config) { - config.GetCertificate = webhookCertWatcher.GetCertificate - }) + webhookTLSOpts, webhookCertWatcher, err := buildTLSRuntime( + tlsEnabled, false, "webhook", certPath, certName, certKey, baseTLS, + ) + fatalIfErr(err, "failed to initialize webhook TLS runtime") + if !tlsEnabled { + setupLog.Info("Webhook insecure mode enabled; skipping webhook certificate watcher wiring") } server := webhook.NewServer(webhook.Options{TLSOpts: webhookTLSOpts}) @@ -368,10 +455,15 @@ func buildMetricsServerOptions( certPath, certName, certKey string, baseTLS []func(*tls.Config), ) (metricsserver.Options, *certwatcher.CertWatcher) { + tlsOpts, metricsCertWatcher, err := buildTLSRuntime( + secureMetrics, false, "metrics", certPath, certName, certKey, baseTLS, + ) + fatalIfErr(err, "failed to initialize metrics TLS runtime") + opts := metricsserver.Options{ BindAddress: metricsAddr, SecureServing: secureMetrics, - TLSOpts: baseTLS, + TLSOpts: tlsOpts, } if secureMetrics { @@ -382,25 +474,156 @@ func buildMetricsServerOptions( opts.FilterProvider = filters.WithAuthenticationAndAuthorization } - var metricsCertWatcher *certwatcher.CertWatcher - if len(certPath) > 0 { - setupLog.Info("Initializing metrics certificate watcher using provided certificates", - "metrics-cert-path", certPath, //nolint:lll // Structured log with many fields - "metrics-cert-name", certName, "metrics-cert-key", certKey) + return opts, 
metricsCertWatcher +} + +type auditServerRunnable struct { + server *http.Server + tlsEnabled bool +} - var err error - metricsCertWatcher, err = certwatcher.New( - filepath.Join(certPath, certName), - filepath.Join(certPath, certKey), - ) - fatalIfErr(err, "to initialize metrics certificate watcher", "error", err) +type serverTimeouts struct { + read time.Duration + write time.Duration + idle time.Duration +} - opts.TLSOpts = append(opts.TLSOpts, func(config *tls.Config) { - config.GetCertificate = metricsCertWatcher.GetCertificate - }) +func (r *auditServerRunnable) Start(ctx context.Context) error { + setupLog.Info("Starting dedicated audit ingress server", "address", r.server.Addr) + + shutdownDone := make(chan struct{}) + go func() { + defer close(shutdownDone) + <-ctx.Done() + shutdownCtx, cancel := context.WithTimeout(context.Background(), defaultAuditShutdownTimeout) + defer cancel() + if err := r.server.Shutdown(shutdownCtx); err != nil { + setupLog.Error(err, "Failed to shutdown dedicated audit ingress server") + } + }() + + var err error + if r.tlsEnabled { + err = r.server.ListenAndServeTLS("", "") + } else { + err = r.server.ListenAndServe() + } + <-shutdownDone + if errors.Is(err, http.ErrServerClosed) { + return nil } + return fmt.Errorf("audit ingress server failed: %w", err) +} - return opts, metricsCertWatcher +func initAuditServerRunnable( + cfg appConfig, + baseTLS []func(*tls.Config), + handler http.Handler, +) (*auditServerRunnable, *certwatcher.CertWatcher, error) { + tlsEnabled := !cfg.auditInsecure + tlsOpts, certWatcher, err := buildTLSRuntime( + tlsEnabled, true, "audit ingress", cfg.auditCertPath, cfg.auditCertName, cfg.auditCertKey, baseTLS, + ) + if err != nil { + return nil, nil, err + } + + var serverTLS *tls.Config + if tlsEnabled { + serverTLS = buildServerTLSConfig(tlsOpts) + } else { + setupLog.Info("Audit ingress TLS disabled; serving plain HTTP for audit ingress") + } + + mux := buildAuditServeMux(handler) + server := 
buildHTTPServer( + buildAuditServerAddress(cfg.auditListenAddress, cfg.auditPort), + mux, + serverTLS, + serverTimeouts{ + read: cfg.auditReadTimeout, + write: cfg.auditWriteTimeout, + idle: cfg.auditIdleTimeout, + }, + ) + + return &auditServerRunnable{server: server, tlsEnabled: tlsEnabled}, certWatcher, nil +} + +func buildAuditServeMux(handler http.Handler) *http.ServeMux { + mux := http.NewServeMux() + mux.Handle("/audit-webhook", handler) + mux.Handle("/audit-webhook/", handler) + return mux +} + +func buildAuditServerAddress(listenAddress string, port int) string { + if strings.TrimSpace(listenAddress) == "" { + return fmt.Sprintf(":%d", port) + } + return net.JoinHostPort(listenAddress, strconv.Itoa(port)) +} + +func buildServerTLSConfig(tlsOpts []func(*tls.Config)) *tls.Config { + serverTLS := &tls.Config{MinVersion: tls.VersionTLS12} + for _, opt := range tlsOpts { + opt(serverTLS) + } + return serverTLS +} + +func buildTLSRuntime( + tlsEnabled bool, + requireCert bool, + component string, + certPath, certName, certKey string, + baseTLS []func(*tls.Config), +) ([]func(*tls.Config), *certwatcher.CertWatcher, error) { + tlsOpts := append([]func(*tls.Config){}, baseTLS...) 
+ if !tlsEnabled { + return tlsOpts, nil, nil + } + + if strings.TrimSpace(certPath) == "" { + if requireCert { + return nil, nil, fmt.Errorf("%s-cert-path is required when %s TLS is enabled", component, component) + } + return tlsOpts, nil, nil + } + + setupLog.Info("Initializing certificate watcher using provided certificates", + "component", component, + "cert-path", certPath, + "cert-name", certName, + "cert-key", certKey) + + certWatcher, err := newCertWatcher(certPath, certName, certKey) + if err != nil { + return nil, nil, fmt.Errorf("failed to initialize %s certificate watcher: %w", component, err) + } + + tlsOpts = append(tlsOpts, func(config *tls.Config) { + config.GetCertificate = certWatcher.GetCertificate + }) + return tlsOpts, certWatcher, nil +} + +func buildHTTPServer(addr string, handler http.Handler, tlsConfig *tls.Config, timeouts serverTimeouts) *http.Server { + return &http.Server{ + Addr: addr, + Handler: handler, + TLSConfig: tlsConfig, + ReadTimeout: timeouts.read, + WriteTimeout: timeouts.write, + IdleTimeout: timeouts.idle, + } +} + +func newCertWatcher(certPath, certName, certKey string) (*certwatcher.CertWatcher, error) { + return certwatcher.New( + filepath.Join(certPath, certName), + filepath.Join(certPath, certKey), + ) } // newManager creates a new controller-runtime Manager with common options. 
@@ -408,16 +631,12 @@ func newManager( metricsOptions metricsserver.Options, webhookServer webhook.Server, probeAddr string, - enableLeaderElection bool, ) ctrl.Manager { mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme, Metrics: metricsOptions, WebhookServer: webhookServer, HealthProbeBindAddress: probeAddr, - LeaderElection: enableLeaderElection, - LeaderElectionID: "9ed3440e.configbutler.ai", - // LeaderElectionReleaseOnCancel: true, }) if err != nil { setupLog.Error(err, "unable to start manager") @@ -426,38 +645,26 @@ func newManager( return mgr } -// addLeaderPodLabeler adds the leader pod labeler runnable when leader election is enabled. -// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;update;patch -func addLeaderPodLabeler(mgr ctrl.Manager, enabled bool) { - if !enabled { - return - } - - podName := leader.GetPodName() - podNamespace := leader.GetPodNamespace() - if podName != "" && podNamespace != "" { - setupLog.Info("Adding leader pod labeler", "pod", podName, "namespace", podNamespace) - podLabeler := &leader.PodLabeler{ - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("leader-labeler"), - PodName: podName, - Namespace: podNamespace, - } - fatalIfErr(mgr.Add(podLabeler), "unable to add leader pod labeler") - } else { - setupLog.Info("POD_NAME or POD_NAMESPACE not set, skipping leader pod labeler") +// addCertWatchersToManager attaches optional certificate watchers to the manager. +func addCertWatchersToManager( + mgr ctrl.Manager, + metricsCertWatcher, webhookCertWatcher, auditCertWatcher *certwatcher.CertWatcher, +) { + watchers := []struct { + component string + watcher *certwatcher.CertWatcher + }{ + {component: "metrics", watcher: metricsCertWatcher}, + {component: "webhook", watcher: webhookCertWatcher}, + {component: "audit ingress", watcher: auditCertWatcher}, } -} -// addCertWatchersToManager attaches optional certificate watchers to the manager. 
-func addCertWatchersToManager(mgr ctrl.Manager, metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher) { - if metricsCertWatcher != nil { - setupLog.Info("Adding metrics certificate watcher to manager") - fatalIfErr(mgr.Add(metricsCertWatcher), "unable to add metrics certificate watcher to manager") - } - if webhookCertWatcher != nil { - setupLog.Info("Adding webhook certificate watcher to manager") - fatalIfErr(mgr.Add(webhookCertWatcher), "unable to add webhook certificate watcher to manager") + for _, item := range watchers { + if item.watcher == nil { + continue + } + setupLog.Info("Adding certificate watcher to manager", "component", item.component) + fatalIfErr(mgr.Add(item.watcher), "unable to add certificate watcher to manager", "component", item.component) } } diff --git a/cmd/main_audit_server_test.go b/cmd/main_audit_server_test.go new file mode 100644 index 00000000..a644eab9 --- /dev/null +++ b/cmd/main_audit_server_test.go @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2025 ConfigButler +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "flag" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParseFlagsWithArgs_Defaults(t *testing.T) { + fs := flag.NewFlagSet("test-defaults", flag.ContinueOnError) + + cfg, err := parseFlagsWithArgs(fs, []string{}) + require.NoError(t, err) + + assert.False(t, cfg.webhookInsecure) + assert.False(t, cfg.metricsInsecure) + assert.False(t, cfg.auditInsecure) + assert.Equal(t, "0.0.0.0", cfg.auditListenAddress) + assert.Equal(t, 9444, cfg.auditPort) + assert.Equal(t, int64(10485760), cfg.auditMaxRequestBodyBytes) + assert.Equal(t, 15*time.Second, cfg.auditReadTimeout) + assert.Equal(t, 30*time.Second, cfg.auditWriteTimeout) + assert.Equal(t, 60*time.Second, cfg.auditIdleTimeout) +} + +func TestParseFlagsWithArgs_AuditUnsecure(t *testing.T) { + fs := flag.NewFlagSet("test-audit-insecure", flag.ContinueOnError) + args := []string{ + "--audit-insecure", + } + + cfg, err := parseFlagsWithArgs(fs, args) + require.NoError(t, err) + assert.True(t, cfg.auditInsecure) +} + +func TestParseFlagsWithArgs_CustomAuditValues(t *testing.T) { + fs := flag.NewFlagSet("test-custom", flag.ContinueOnError) + args := []string{ + "--webhook-cert-path=/tmp/webhook-certs", + "--audit-listen-address=127.0.0.1", + "--audit-port=9555", + "--audit-cert-path=/tmp/audit-certs", + "--audit-cert-name=cert.pem", + "--audit-cert-key=key.pem", + "--audit-max-request-body-bytes=2048", + "--audit-read-timeout=5s", + "--audit-write-timeout=8s", + "--audit-idle-timeout=13s", + } + + cfg, err := parseFlagsWithArgs(fs, args) + require.NoError(t, err) + + assert.Equal(t, "127.0.0.1", cfg.auditListenAddress) + assert.Equal(t, 9555, cfg.auditPort) + assert.Equal(t, "/tmp/audit-certs", cfg.auditCertPath) + assert.Equal(t, "cert.pem", cfg.auditCertName) + assert.Equal(t, "key.pem", cfg.auditCertKey) + assert.Equal(t, int64(2048), cfg.auditMaxRequestBodyBytes) + assert.Equal(t, 
5*time.Second, cfg.auditReadTimeout) + assert.Equal(t, 8*time.Second, cfg.auditWriteTimeout) + assert.Equal(t, 13*time.Second, cfg.auditIdleTimeout) +} + +func TestParseFlagsWithArgs_FallsBackToWebhookCertPath(t *testing.T) { + fs := flag.NewFlagSet("test-fallback", flag.ContinueOnError) + args := []string{ + "--webhook-cert-path=/tmp/webhook-certs", + } + + cfg, err := parseFlagsWithArgs(fs, args) + require.NoError(t, err) + assert.Equal(t, "/tmp/webhook-certs", cfg.auditCertPath) +} + +func TestParseFlagsWithArgs_InvalidAuditSettings(t *testing.T) { + tests := []struct { + name string + args []string + }{ + { + name: "invalid port", + args: []string{"--audit-port=0"}, + }, + { + name: "invalid body size", + args: []string{"--audit-max-request-body-bytes=0"}, + }, + { + name: "invalid read timeout", + args: []string{"--audit-read-timeout=0s"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fs := flag.NewFlagSet("test-invalid", flag.ContinueOnError) + _, err := parseFlagsWithArgs(fs, tt.args) + require.Error(t, err) + }) + } +} + +func TestBuildAuditServeMux_RoutesAuditPaths(t *testing.T) { + handler := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusAccepted) + }) + + mux := buildAuditServeMux(handler) + + req := httptest.NewRequest(http.MethodPost, "/audit-webhook/cluster-a", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + assert.Equal(t, http.StatusAccepted, w.Code) + + req = httptest.NewRequest(http.MethodPost, "/audit-webhook", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + assert.Equal(t, http.StatusAccepted, w.Code) + + req = httptest.NewRequest(http.MethodPost, "/not-audit", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + assert.Equal(t, http.StatusNotFound, w.Code) +} + +func TestBuildAuditServerAddress(t *testing.T) { + assert.Equal(t, "0.0.0.0:9444", buildAuditServerAddress("0.0.0.0", 9444)) + assert.Equal(t, ":9444", 
buildAuditServerAddress("", 9444)) +} diff --git a/config/README.md b/config/README.md new file mode 100644 index 00000000..961026ae --- /dev/null +++ b/config/README.md @@ -0,0 +1,17 @@ +# config + +This folder contains simplified raw manifests used primarily for local development and testing, +especially end-to-end (e2e) test workflows. + +## Intended use +- Local cluster bring-up. +- E2E test deployments. +- Debugging and iteration with explicit manifests. + +## Production guidance +For production deployments, use the Helm chart in `charts/gitops-reverser`. +The Helm chart is the recommended installation and lifecycle management path for production. + +## Notes +- These manifests are opinionated toward the local/e2e setup. +- Keep them simple and explicit; avoid reintroducing heavy kustomize indirection here. diff --git a/config/certmanager/certificate-metrics.yaml b/config/certmanager/certificate-metrics.yaml deleted file mode 100644 index fb45c2c7..00000000 --- a/config/certmanager/certificate-metrics.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# The following manifests contain a self-signed issuer CR and a metrics certificate CR. -# More document can be found at https://docs.cert-manager.io -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: metrics-certs # this name should match the one appeared in kustomizeconfig.yaml - namespace: system -spec: - dnsNames: - # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize - # replacements in the config/default/kustomization.yaml file. 
- - SERVICE_NAME.SERVICE_NAMESPACE.svc - - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local - issuerRef: - kind: Issuer - name: selfsigned-issuer - secretName: metrics-server-cert - privateKey: - rotationPolicy: Always \ No newline at end of file diff --git a/config/certmanager/certificate-webhook.yaml b/config/certmanager/certificate-webhook.yaml deleted file mode 100644 index 01f0a793..00000000 --- a/config/certmanager/certificate-webhook.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# The following manifests contain a self-signed issuer CR and a certificate CR. -# More document can be found at https://docs.cert-manager.io -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: serving-cert # this name should match the one appeared in kustomizeconfig.yaml - namespace: system -spec: - # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize - # replacements in the config/default/kustomization.yaml file. - dnsNames: - - SERVICE_NAME.SERVICE_NAMESPACE.svc - - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local - issuerRef: - kind: Issuer - name: selfsigned-issuer - secretName: webhook-server-cert - privateKey: - rotationPolicy: Always \ No newline at end of file diff --git a/config/certmanager/issuer.yaml b/config/certmanager/issuer.yaml deleted file mode 100644 index 52d4dc75..00000000 --- a/config/certmanager/issuer.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# The following manifest contains a self-signed issuer CR. -# More information can be found at https://docs.cert-manager.io -# WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. 
-apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: selfsigned-issuer - namespace: system -spec: - selfSigned: {} diff --git a/config/certmanager/kustomization.yaml b/config/certmanager/kustomization.yaml deleted file mode 100644 index fcb7498e..00000000 --- a/config/certmanager/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ -resources: -- issuer.yaml -- certificate-webhook.yaml -- certificate-metrics.yaml - -configurations: -- kustomizeconfig.yaml diff --git a/config/certmanager/kustomizeconfig.yaml b/config/certmanager/kustomizeconfig.yaml deleted file mode 100644 index cf6f89e8..00000000 --- a/config/certmanager/kustomizeconfig.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# This configuration is for teaching kustomize how to update name ref substitution -nameReference: -- kind: Issuer - group: cert-manager.io - fieldSpecs: - - kind: Certificate - group: cert-manager.io - path: spec/issuerRef/name diff --git a/config/certs/certificates.yaml b/config/certs/certificates.yaml new file mode 100644 index 00000000..c580689c --- /dev/null +++ b/config/certs/certificates.yaml @@ -0,0 +1,37 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + labels: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: gitops-reverser + name: gitops-reverser-admission-server-cert + namespace: sut +spec: + dnsNames: + - gitops-reverser-service.sut.svc + - gitops-reverser-service.sut.svc.cluster.local + issuerRef: + kind: Issuer + name: gitops-reverser-selfsigned-issuer + privateKey: + rotationPolicy: Always + secretName: admission-server-cert +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + labels: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: gitops-reverser + name: gitops-reverser-audit-server-cert + namespace: sut +spec: + dnsNames: + - gitops-reverser-service.sut.svc + - gitops-reverser-service.sut.svc.cluster.local + 
issuerRef: + kind: Issuer + name: gitops-reverser-selfsigned-issuer + privateKey: + rotationPolicy: Always + secretName: audit-server-cert diff --git a/config/certs/issuer.yaml b/config/certs/issuer.yaml new file mode 100644 index 00000000..e760ddd4 --- /dev/null +++ b/config/certs/issuer.yaml @@ -0,0 +1,12 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + labels: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: gitops-reverser + name: gitops-reverser-selfsigned-issuer + namespace: sut +spec: + selfSigned: {} + + diff --git a/config/certs/kustomization.yaml b/config/certs/kustomization.yaml new file mode 100644 index 00000000..58816ce6 --- /dev/null +++ b/config/certs/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - issuer.yaml + - certificates.yaml diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index 378fe339..2b954c79 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -1,19 +1,7 @@ -# This kustomization.yaml is not intended to be run by itself, -# since it depends on service name and namespace that are out of this kustomize package. -# It should be run by config/default +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization resources: - - bases/configbutler.ai_watchrules.yaml - bases/configbutler.ai_clusterwatchrules.yaml - bases/configbutler.ai_gitproviders.yaml - bases/configbutler.ai_gittargets.yaml -# +kubebuilder:scaffold:crdkustomizeresource - -patches: [] -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. -# patches here are for enabling the conversion webhook for each CRD -# +kubebuilder:scaffold:crdkustomizewebhookpatch - -# [WEBHOOK] To enable webhook, uncomment the following section -# the following config is for teaching kustomize how to do kustomization for CRDs. 
-#configurations: -#- kustomizeconfig.yaml + - bases/configbutler.ai_watchrules.yaml diff --git a/config/crd/kustomizeconfig.yaml b/config/crd/kustomizeconfig.yaml deleted file mode 100644 index ec5c150a..00000000 --- a/config/crd/kustomizeconfig.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# This file is for teaching kustomize how to substitute name and namespace reference in CRD -nameReference: -- kind: Service - version: v1 - fieldSpecs: - - kind: CustomResourceDefinition - version: v1 - group: apiextensions.k8s.io - path: spec/conversion/webhook/clientConfig/service/name - -namespace: -- kind: CustomResourceDefinition - version: v1 - group: apiextensions.k8s.io - path: spec/conversion/webhook/clientConfig/service/namespace - create: false - -varReference: -- path: metadata/annotations diff --git a/config/default/cert_metrics_manager_patch.yaml b/config/default/cert_metrics_manager_patch.yaml deleted file mode 100644 index d9750155..00000000 --- a/config/default/cert_metrics_manager_patch.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# This patch adds the args, volumes, and ports to allow the manager to use the metrics-server certs. 
- -# Add the volumeMount for the metrics-server certs -- op: add - path: /spec/template/spec/containers/0/volumeMounts/- - value: - mountPath: /tmp/k8s-metrics-server/metrics-certs - name: metrics-certs - readOnly: true - -# Add the --metrics-cert-path argument for the metrics server -- op: add - path: /spec/template/spec/containers/0/args/- - value: --metrics-cert-path=/tmp/k8s-metrics-server/metrics-certs - -# Add the metrics-server certs volume configuration -- op: add - path: /spec/template/spec/volumes/- - value: - name: metrics-certs - secret: - secretName: metrics-server-cert - optional: false - items: - - key: ca.crt - path: ca.crt - - key: tls.crt - path: tls.crt - - key: tls.key - path: tls.key diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml deleted file mode 100644 index 7b47b5f8..00000000 --- a/config/default/kustomization.yaml +++ /dev/null @@ -1,240 +0,0 @@ -# Adds namespace to all resources. -namespace: sut - -# Value of this field is prepended to the -# names of all resources, e.g. a deployment named -# "wordpress" becomes "alices-wordpress". -# Note that it should also match with the prefix (text before '-') of the namespace -# field above. -namePrefix: gitops-reverser- - -# Labels to add to all resources and selectors. -#labels: -#- includeSelectors: true -# pairs: -# someName: someValue - -resources: -- ../crd -- ../rbac -- ../manager -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in -# crd/kustomization.yaml -- ../webhook -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. -- ../certmanager -# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. -#- ../prometheus -# [METRICS] Expose the controller manager metrics service. -- metrics_service.yaml -# [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy. 
-# Only Pod(s) running a namespace labeled with 'metrics: enabled' will be able to gather the metrics. -# Only CR(s) which requires webhooks and are applied on namespaces labeled with 'webhooks: enabled' will -# be able to communicate with the Webhook Server. -#- ../network-policy - -# Uncomment the patches line if you enable Metrics -patches: -# [METRICS] The following patch will enable the metrics endpoint using HTTPS and the port :8443. -# More info: https://book.kubebuilder.io/reference/metrics -- path: manager_metrics_patch.yaml - target: - kind: Deployment - -# [AUDIT-WEBHOOK] Set fixed ClusterIP for webhook service so kube-apiserver can connect before CoreDNS is ready -- path: webhook_service_fixed_ip_patch.yaml - target: - kind: Service - name: webhook-service - -# Uncomment the patches line if you enable Metrics and CertManager -# [METRICS-WITH-CERTS] To enable metrics protected with certManager, uncomment the following line. -# This patch will protect the metrics with certManager self-signed certs. -#- path: cert_metrics_manager_patch.yaml -# target: -# kind: Deployment - -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in -# crd/kustomization.yaml -- path: manager_webhook_patch.yaml - target: - kind: Deployment - -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. -# Uncomment the following replacements to add the cert-manager CA injection annotations -replacements: -# - source: # Uncomment the following block to enable certificates for metrics -# kind: Service -# version: v1 -# name: controller-manager-metrics-service -# fieldPath: metadata.name -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: metrics-certs -# fieldPaths: -# - spec.dnsNames.0 -# - spec.dnsNames.1 -# options: -# delimiter: '.' 
-# index: 0 -# create: true -# - select: # Uncomment the following to set the Service name for TLS config in Prometheus ServiceMonitor -# kind: ServiceMonitor -# group: monitoring.coreos.com -# version: v1 -# name: controller-manager-metrics-monitor -# fieldPaths: -# - spec.endpoints.0.tlsConfig.serverName -# options: -# delimiter: '.' -# index: 0 -# create: true - -# - source: -# kind: Service -# version: v1 -# name: controller-manager-metrics-service -# fieldPath: metadata.namespace -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: metrics-certs -# fieldPaths: -# - spec.dnsNames.0 -# - spec.dnsNames.1 -# options: -# delimiter: '.' -# index: 1 -# create: true -# - select: # Uncomment the following to set the Service namespace for TLS in Prometheus ServiceMonitor -# kind: ServiceMonitor -# group: monitoring.coreos.com -# version: v1 -# name: controller-manager-metrics-monitor -# fieldPaths: -# - spec.endpoints.0.tlsConfig.serverName -# options: -# delimiter: '.' -# index: 1 -# create: true - - - source: # Uncomment the following block if you have any webhook - kind: Service - version: v1 - name: webhook-service - fieldPath: .metadata.name # Name of the service - targets: - - select: - kind: Certificate - group: cert-manager.io - version: v1 - name: serving-cert - fieldPaths: - - .spec.dnsNames.0 - - .spec.dnsNames.1 - options: - delimiter: '.' - index: 0 - create: true - - source: - kind: Service - version: v1 - name: webhook-service - fieldPath: .metadata.namespace # Namespace of the service - targets: - - select: - kind: Certificate - group: cert-manager.io - version: v1 - name: serving-cert - fieldPaths: - - .spec.dnsNames.0 - - .spec.dnsNames.1 - options: - delimiter: '.' 
- index: 1 - create: true - - - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation) - kind: Certificate - group: cert-manager.io - version: v1 - name: serving-cert # This name should match the one in certificate.yaml - fieldPath: .metadata.namespace # Namespace of the certificate CR - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true - - source: - kind: Certificate - group: cert-manager.io - version: v1 - name: serving-cert - fieldPath: .metadata.name - targets: - - select: - kind: ValidatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true - - - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting ) - kind: Certificate - group: cert-manager.io - version: v1 - name: serving-cert - fieldPath: .metadata.namespace # Namespace of the certificate CR - targets: - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 0 - create: true - - source: - kind: Certificate - group: cert-manager.io - version: v1 - name: serving-cert - fieldPath: .metadata.name - targets: - - select: - kind: MutatingWebhookConfiguration - fieldPaths: - - .metadata.annotations.[cert-manager.io/inject-ca-from] - options: - delimiter: '/' - index: 1 - create: true - -# - source: # Uncomment the following block if you have a ConversionWebhook (--conversion) -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert -# fieldPath: .metadata.namespace # Namespace of the certificate CR -# targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. 
-# +kubebuilder:scaffold:crdkustomizecainjectionns -# - source: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert -# fieldPath: .metadata.name -# targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. -# +kubebuilder:scaffold:crdkustomizecainjectionname diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml deleted file mode 100644 index 2aaef653..00000000 --- a/config/default/manager_metrics_patch.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# This patch adds the args to allow exposing the metrics endpoint using HTTPS -- op: add - path: /spec/template/spec/containers/0/args/0 - value: --metrics-bind-address=:8443 diff --git a/config/default/manager_webhook_patch.yaml b/config/default/manager_webhook_patch.yaml deleted file mode 100644 index 963c8a4c..00000000 --- a/config/default/manager_webhook_patch.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# This patch ensures the webhook certificates are properly mounted in the manager container. -# It configures the necessary arguments, volumes, volume mounts, and container ports. 
- -# Add the --webhook-cert-path argument for configuring the webhook certificate path -- op: add - path: /spec/template/spec/containers/0/args/- - value: --webhook-cert-path=/tmp/k8s-webhook-server/serving-certs - -# Add the volumeMount for the webhook certificates -- op: add - path: /spec/template/spec/containers/0/volumeMounts/- - value: - mountPath: /tmp/k8s-webhook-server/serving-certs - name: webhook-certs - readOnly: true - -# Add the port configuration for the webhook server -- op: add - path: /spec/template/spec/containers/0/ports/- - value: - containerPort: 9443 - name: webhook-server - protocol: TCP - -# Add the volume configuration for the webhook certificates -- op: add - path: /spec/template/spec/volumes/- - value: - name: webhook-certs - secret: - secretName: webhook-server-cert diff --git a/config/default/metrics_service.yaml b/config/default/metrics_service.yaml deleted file mode 100644 index 4dff8901..00000000 --- a/config/default/metrics_service.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-service - namespace: system -spec: - ports: - - name: https - port: 8443 - protocol: TCP - targetPort: 8443 - selector: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser diff --git a/config/default/webhook_service_fixed_ip_patch.yaml b/config/default/webhook_service_fixed_ip_patch.yaml deleted file mode 100644 index 3e647bfe..00000000 --- a/config/default/webhook_service_fixed_ip_patch.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# Patch to set fixed ClusterIP for webhook service -# This is required for audit webhook to work because kube-apiserver -# starts before CoreDNS, so DNS resolution (.svc.cluster.local) fails. -# Using a fixed IP allows kube-apiserver to connect on startup. 
-apiVersion: v1 -kind: Service -metadata: - name: webhook-service - namespace: system -spec: - clusterIP: 10.96.200.200 \ No newline at end of file diff --git a/config/deployment.yaml b/config/deployment.yaml new file mode 100644 index 00000000..eddd54f5 --- /dev/null +++ b/config/deployment.yaml @@ -0,0 +1,102 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: gitops-reverser + control-plane: controller-manager + name: gitops-reverser-controller-manager + namespace: sut +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: gitops-reverser + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + app.kubernetes.io/name: gitops-reverser + control-plane: controller-manager + spec: + containers: + - args: + - --metrics-bind-address=:8443 + - --metrics-insecure + - --health-probe-bind-address=:8081 + - --webhook-cert-path=/tmp/k8s-admission-server/admission-server-certs + - --audit-cert-path=/tmp/k8s-audit-server/audit-server-certs + command: + - /manager + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + image: gitops-reverser:latest + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + ports: + - containerPort: 9443 + name: admission + protocol: TCP + - containerPort: 9444 + name: audit + protocol: TCP + - containerPort: 8443 + name: metrics + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + volumeMounts: + - mountPath: /tmp + name: tmp-dir + - 
mountPath: /tmp/k8s-admission-server/admission-server-certs + name: webhook-certs + readOnly: true + - mountPath: /tmp/k8s-audit-server/audit-server-certs + name: audit-webhook-certs + readOnly: true + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + serviceAccountName: gitops-reverser-controller-manager + terminationGracePeriodSeconds: 10 + volumes: + - emptyDir: {} + name: tmp-dir + - name: webhook-certs + secret: + secretName: admission-server-cert + - name: audit-webhook-certs + secret: + secretName: audit-server-cert diff --git a/config/kustomization.yaml b/config/kustomization.yaml new file mode 100644 index 00000000..570f21c2 --- /dev/null +++ b/config/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- namespace.yaml +- crd +- rbac +- service.yaml +- deployment.yaml +- certs +- webhook.yaml +images: +- name: gitops-reverser + newName: gitops-reverser + newTag: e2e-local diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml deleted file mode 100644 index d2e4a38c..00000000 --- a/config/manager/kustomization.yaml +++ /dev/null @@ -1,8 +0,0 @@ -resources: -- manager.yaml -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -images: -- name: controller - newName: example.com/gitops-reverser - newTag: v0.0.1 diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml deleted file mode 100644 index c94b390c..00000000 --- a/config/manager/manager.yaml +++ /dev/null @@ -1,112 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: system ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system - labels: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize -spec: - selector: - 
matchLabels: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser - replicas: 2 - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser - spec: - # TODO(user): Uncomment the following code to configure the nodeAffinity expression - # according to the platforms which are supported by your solution. - # It is considered best practice to support multiple architectures. You can - # build your manager image using the makefile target docker-buildx. - # affinity: - # nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: kubernetes.io/arch - # operator: In - # values: - # - amd64 - # - arm64 - # - ppc64le - # - s390x - # - key: kubernetes.io/os - # operator: In - # values: - # - linux - securityContext: - # Projects are configured by default to adhere to the "restricted" Pod Security Standards. - # This ensures that deployments meet the highest security requirements for Kubernetes. 
- # For more details, see: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - containers: - - command: - - /manager - args: - - --leader-elect - - --health-probe-bind-address=:8081 - image: controller:latest - name: manager - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - ports: [] - securityContext: - readOnlyRootFilesystem: true - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - # TODO(user): Configure the resources accordingly based on the project requirements. - # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 10m - memory: 64Mi - volumeMounts: - - name: tmp-dir - mountPath: /tmp - volumes: - - name: tmp-dir - emptyDir: {} - serviceAccountName: controller-manager - terminationGracePeriodSeconds: 10 diff --git a/config/namespace.yaml b/config/namespace.yaml new file mode 100644 index 00000000..14e972db --- /dev/null +++ b/config/namespace.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Namespace +metadata: + labels: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: gitops-reverser + control-plane: controller-manager + name: sut diff --git a/config/network-policy/allow-metrics-traffic.yaml b/config/network-policy/allow-metrics-traffic.yaml deleted file mode 100644 index 35673a36..00000000 --- a/config/network-policy/allow-metrics-traffic.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# This NetworkPolicy allows ingress traffic -# with Pods running on namespaces labeled with 'metrics: enabled'. 
Only Pods on those -# namespaces are able to gather data from the metrics endpoint. -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: allow-metrics-traffic - namespace: system -spec: - podSelector: - matchLabels: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser - policyTypes: - - Ingress - ingress: - # This allows ingress traffic from any namespace with the label metrics: enabled - - from: - - namespaceSelector: - matchLabels: - metrics: enabled # Only from namespaces with this label - ports: - - port: 8443 - protocol: TCP diff --git a/config/network-policy/allow-webhook-traffic.yaml b/config/network-policy/allow-webhook-traffic.yaml deleted file mode 100644 index 169327bf..00000000 --- a/config/network-policy/allow-webhook-traffic.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# This NetworkPolicy allows ingress traffic to your webhook server running -# as part of the controller-manager from specific namespaces and pods. 
CR(s) which uses webhooks -# will only work when applied in namespaces labeled with 'webhook: enabled' -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: allow-webhook-traffic - namespace: system -spec: - podSelector: - matchLabels: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser - policyTypes: - - Ingress - ingress: - # This allows ingress traffic from any namespace with the label webhook: enabled - - from: - - namespaceSelector: - matchLabels: - webhook: enabled # Only from namespaces with this label - ports: - - port: 443 - protocol: TCP diff --git a/config/network-policy/kustomization.yaml b/config/network-policy/kustomization.yaml deleted file mode 100644 index 0872bee1..00000000 --- a/config/network-policy/kustomization.yaml +++ /dev/null @@ -1,3 +0,0 @@ -resources: -- allow-webhook-traffic.yaml -- allow-metrics-traffic.yaml diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml deleted file mode 100644 index fdc5481b..00000000 --- a/config/prometheus/kustomization.yaml +++ /dev/null @@ -1,11 +0,0 @@ -resources: -- monitor.yaml - -# [PROMETHEUS-WITH-CERTS] The following patch configures the ServiceMonitor in ../prometheus -# to securely reference certificates created and managed by cert-manager. -# Additionally, ensure that you uncomment the [METRICS WITH CERTMANAGER] patch under config/default/kustomization.yaml -# to mount the "metrics-server-cert" secret in the Manager Deployment. 
-#patches: -# - path: monitor_tls_patch.yaml -# target: -# kind: ServiceMonitor diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml deleted file mode 100644 index c21839b3..00000000 --- a/config/prometheus/monitor.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Prometheus Monitor Service (Metrics) -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-monitor - namespace: system -spec: - endpoints: - - path: /metrics - port: https # Ensure this is the name of the port that exposes HTTPS metrics - scheme: https - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - tlsConfig: - # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables - # certificate verification, exposing the system to potential man-in-the-middle attacks. - # For production environments, it is recommended to use cert-manager for automatic TLS certificate management. - # To apply this configuration, enable cert-manager and use the patch located at config/prometheus/servicemonitor_tls_patch.yaml, - # which securely references the certificate from the 'metrics-server-cert' secret. 
- insecureSkipVerify: true - selector: - matchLabels: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser diff --git a/config/prometheus/monitor_tls_patch.yaml b/config/prometheus/monitor_tls_patch.yaml deleted file mode 100644 index 5bf84ce0..00000000 --- a/config/prometheus/monitor_tls_patch.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Patch for Prometheus ServiceMonitor to enable secure TLS configuration -# using certificates managed by cert-manager -- op: replace - path: /spec/endpoints/0/tlsConfig - value: - # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize - serverName: SERVICE_NAME.SERVICE_NAMESPACE.svc - insecureSkipVerify: false - ca: - secret: - name: metrics-server-cert - key: ca.crt - cert: - secret: - name: metrics-server-cert - key: tls.crt - keySecret: - name: metrics-server-cert - key: tls.key diff --git a/config/rbac/service_account.yaml b/config/rbac/gitops-reverser-controller-manager.yaml similarity index 70% rename from config/rbac/service_account.yaml rename to config/rbac/gitops-reverser-controller-manager.yaml index 62fc0094..d726578e 100644 --- a/config/rbac/service_account.yaml +++ b/config/rbac/gitops-reverser-controller-manager.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ServiceAccount metadata: labels: - app.kubernetes.io/name: gitops-reverser app.kubernetes.io/managed-by: kustomize - name: controller-manager - namespace: system + app.kubernetes.io/name: gitops-reverser + name: gitops-reverser-controller-manager + namespace: sut diff --git a/config/rbac/test_user_role_binding.yaml b/config/rbac/gitops-reverser-demo-jane-access.yaml similarity index 85% rename from config/rbac/test_user_role_binding.yaml rename to config/rbac/gitops-reverser-demo-jane-access.yaml index 5c0a390f..9d02eb7c 100644 --- a/config/rbac/test_user_role_binding.yaml +++ b/config/rbac/gitops-reverser-demo-jane-access.yaml @@ -1,7 +1,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: 
demo-jane-access + name: gitops-reverser-demo-jane-access roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole diff --git a/config/rbac/gitops-reverser-manager-role.yaml b/config/rbac/gitops-reverser-manager-role.yaml new file mode 100644 index 00000000..ca36f738 --- /dev/null +++ b/config/rbac/gitops-reverser-manager-role.yaml @@ -0,0 +1,54 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: gitops-reverser-manager-role +rules: +- apiGroups: + - "" + resources: + - namespaces + - secrets + verbs: + - get + - list + - watch +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch +- apiGroups: + - configbutler.ai + resources: + - clusterwatchrules + - gitproviders + - gittargets + - watchrules + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - configbutler.ai + resources: + - clusterwatchrules/status + - gitproviders/status + - gittargets/status + - watchrules/status + verbs: + - get + - patch + - update +- apiGroups: + - configbutler.ai + resources: + - gitproviders/finalizers + verbs: + - update diff --git a/config/rbac/role_binding.yaml b/config/rbac/gitops-reverser-manager-rolebinding.yaml similarity index 66% rename from config/rbac/role_binding.yaml rename to config/rbac/gitops-reverser-manager-rolebinding.yaml index 75c9905c..5eb0f83e 100644 --- a/config/rbac/role_binding.yaml +++ b/config/rbac/gitops-reverser-manager-rolebinding.yaml @@ -2,14 +2,14 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: labels: - app.kubernetes.io/name: gitops-reverser app.kubernetes.io/managed-by: kustomize - name: manager-rolebinding + app.kubernetes.io/name: gitops-reverser + name: gitops-reverser-manager-rolebinding roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: manager-role + name: gitops-reverser-manager-role subjects: - kind: ServiceAccount - name: controller-manager - namespace: system + name: 
gitops-reverser-controller-manager + namespace: sut diff --git a/config/rbac/gitprovider_admin_role.yaml b/config/rbac/gitprovider_admin_role.yaml deleted file mode 100644 index d4c77df4..00000000 --- a/config/rbac/gitprovider_admin_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# This rule is not used by the project gitops-reverser itself. -# It is provided to allow the cluster admin to help manage permissions for users. -# -# Grants full permissions ('*') over configbutler.ai. -# This role is intended for users authorized to modify roles and bindings within the cluster, -# enabling them to delegate specific permissions to other users or groups as needed. - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: gitprovider-admin-role -rules: -- apiGroups: - - configbutler.ai - resources: - - gitproviders - verbs: - - '*' -- apiGroups: - - configbutler.ai - resources: - - gitproviders/status - verbs: - - get diff --git a/config/rbac/gitprovider_editor_role.yaml b/config/rbac/gitprovider_editor_role.yaml deleted file mode 100644 index 69d0d35e..00000000 --- a/config/rbac/gitprovider_editor_role.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# This rule is not used by the project gitops-reverser itself. -# It is provided to allow the cluster admin to help manage permissions for users. -# -# Grants permissions to create, update, and delete resources within the configbutler.ai. -# This role is intended for users who need to manage these resources -# but should not control RBAC or manage permissions for others. 
- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: gitprovider-editor-role -rules: -- apiGroups: - - configbutler.ai - resources: - - gitproviders - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - configbutler.ai - resources: - - gitproviders/status - verbs: - - get diff --git a/config/rbac/gitprovider_viewer_role.yaml b/config/rbac/gitprovider_viewer_role.yaml deleted file mode 100644 index 027012d4..00000000 --- a/config/rbac/gitprovider_viewer_role.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# This rule is not used by the project gitops-reverser itself. -# It is provided to allow the cluster admin to help manage permissions for users. -# -# Grants read-only access to configbutler.ai resources. -# This role is intended for users who need visibility into these resources -# without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: gitprovider-viewer-role -rules: -- apiGroups: - - configbutler.ai - resources: - - gitproviders - verbs: - - get - - list - - watch -- apiGroups: - - configbutler.ai - resources: - - gitproviders/status - verbs: - - get diff --git a/config/rbac/gittarget_admin_role.yaml b/config/rbac/gittarget_admin_role.yaml deleted file mode 100644 index 0122ca4c..00000000 --- a/config/rbac/gittarget_admin_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# This rule is not used by the project gitops-reverser itself. -# It is provided to allow the cluster admin to help manage permissions for users. -# -# Grants full permissions ('*') over configbutler.ai. 
-# This role is intended for users authorized to modify roles and bindings within the cluster, -# enabling them to delegate specific permissions to other users or groups as needed. - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: gittarget-admin-role -rules: -- apiGroups: - - configbutler.ai - resources: - - gittargets - verbs: - - '*' -- apiGroups: - - configbutler.ai - resources: - - gittargets/status - verbs: - - get diff --git a/config/rbac/gittarget_editor_role.yaml b/config/rbac/gittarget_editor_role.yaml deleted file mode 100644 index 6adbedb5..00000000 --- a/config/rbac/gittarget_editor_role.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# This rule is not used by the project gitops-reverser itself. -# It is provided to allow the cluster admin to help manage permissions for users. -# -# Grants permissions to create, update, and delete resources within the configbutler.ai. -# This role is intended for users who need to manage these resources -# but should not control RBAC or manage permissions for others. - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: gittarget-editor-role -rules: -- apiGroups: - - configbutler.ai - resources: - - gittargets - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - configbutler.ai - resources: - - gittargets/status - verbs: - - get diff --git a/config/rbac/gittarget_viewer_role.yaml b/config/rbac/gittarget_viewer_role.yaml deleted file mode 100644 index b6285b70..00000000 --- a/config/rbac/gittarget_viewer_role.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# This rule is not used by the project gitops-reverser itself. -# It is provided to allow the cluster admin to help manage permissions for users. 
-# -# Grants read-only access to configbutler.ai resources. -# This role is intended for users who need visibility into these resources -# without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: gittarget-viewer-role -rules: -- apiGroups: - - configbutler.ai - resources: - - gittargets - verbs: - - get - - list - - watch -- apiGroups: - - configbutler.ai - resources: - - gittargets/status - verbs: - - get diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 86645ef5..f88dc735 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -1,39 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization resources: - # All RBAC will be applied under this service account in - # the deployment namespace. You may comment out this resource - # if your manager will use a service account that exists at - # runtime. Be sure to update RoleBinding and ClusterRoleBinding - # subjects if changing service account names. - - service_account.yaml - - role.yaml - - role_binding.yaml - - test_user_role_binding.yaml - - leader_election_role.yaml - - leader_election_role_binding.yaml - # The following RBAC configurations are used to protect - # the metrics endpoint with authn/authz. These configurations - # ensure that only authorized users and service accounts - # can access the metrics endpoint. Comment the following - # permissions if you want to disable this protection. - # More info: https://book.kubebuilder.io/reference/metrics.html - - metrics_auth_role.yaml - - metrics_auth_role_binding.yaml - - metrics_reader_role.yaml - # For each CRD, "Admin", "Editor" and "Viewer" roles are scaffolded by - # default, aiding admins in cluster management. 
Those roles are - # not used by the gitops-reverser itself. You can comment the following lines - # if you do not want those helpers be installed with your Project. - - watchrule_admin_role.yaml - - watchrule_editor_role.yaml - - watchrule_viewer_role.yaml - # For each CRD, "Admin", "Editor" and "Viewer" roles are scaffolded by - # default, aiding admins in cluster management. Those roles are - # not used by the gitops-reverser itself. You can comment the following lines - # if you do not want those helpers be installed with your Project. - - gittarget_admin_role.yaml - - gittarget_editor_role.yaml - - gittarget_viewer_role.yaml - - gitprovider_admin_role.yaml - - gitprovider_editor_role.yaml - - gitprovider_viewer_role.yaml - + # Service account used by the controller manager deployment. + - gitops-reverser-controller-manager.yaml + # Main runtime permissions for the controller. + - gitops-reverser-manager-role.yaml + - gitops-reverser-manager-rolebinding.yaml + # E2E-only helper: allows impersonated user jane@acme.com writes in tests. + - gitops-reverser-demo-jane-access.yaml diff --git a/config/rbac/leader_election_role.yaml b/config/rbac/leader_election_role.yaml deleted file mode 100644 index a12b473f..00000000 --- a/config/rbac/leader_election_role.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# permissions to do leader election. 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: leader-election-role -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch diff --git a/config/rbac/leader_election_role_binding.yaml b/config/rbac/leader_election_role_binding.yaml deleted file mode 100644 index ca6debb0..00000000 --- a/config/rbac/leader_election_role_binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: leader-election-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: leader-election-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/metrics_auth_role.yaml b/config/rbac/metrics_auth_role.yaml deleted file mode 100644 index 32d2e4ec..00000000 --- a/config/rbac/metrics_auth_role.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: metrics-auth-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create diff --git a/config/rbac/metrics_auth_role_binding.yaml b/config/rbac/metrics_auth_role_binding.yaml deleted file mode 100644 index e775d67f..00000000 --- a/config/rbac/metrics_auth_role_binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: metrics-auth-rolebinding 
-roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: metrics-auth-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/metrics_reader_role.yaml b/config/rbac/metrics_reader_role.yaml deleted file mode 100644 index 51a75db4..00000000 --- a/config/rbac/metrics_reader_role.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: metrics-reader -rules: -- nonResourceURLs: - - "/metrics" - verbs: - - get diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 0de108eb..c4e3993e 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -13,16 +13,6 @@ rules: - get - list - watch -- apiGroups: - - "" - resources: - - pods - verbs: - - get - - list - - patch - - update - - watch - apiGroups: - '*' resources: diff --git a/config/rbac/watchrule_admin_role.yaml b/config/rbac/watchrule_admin_role.yaml deleted file mode 100644 index c98c941f..00000000 --- a/config/rbac/watchrule_admin_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# This rule is not used by the project gitops-reverser itself. -# It is provided to allow the cluster admin to help manage permissions for users. -# -# Grants full permissions ('*') over configbutler.ai. -# This role is intended for users authorized to modify roles and bindings within the cluster, -# enabling them to delegate specific permissions to other users or groups as needed. 
- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: watchrule-admin-role -rules: -- apiGroups: - - configbutler.ai - resources: - - watchrules - verbs: - - '*' -- apiGroups: - - configbutler.ai - resources: - - watchrules/status - verbs: - - get diff --git a/config/rbac/watchrule_editor_role.yaml b/config/rbac/watchrule_editor_role.yaml deleted file mode 100644 index 02dd472f..00000000 --- a/config/rbac/watchrule_editor_role.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# This rule is not used by the project gitops-reverser itself. -# It is provided to allow the cluster admin to help manage permissions for users. -# -# Grants permissions to create, update, and delete resources within the configbutler.ai. -# This role is intended for users who need to manage these resources -# but should not control RBAC or manage permissions for others. - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: watchrule-editor-role -rules: -- apiGroups: - - configbutler.ai - resources: - - watchrules - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - configbutler.ai - resources: - - watchrules/status - verbs: - - get diff --git a/config/rbac/watchrule_viewer_role.yaml b/config/rbac/watchrule_viewer_role.yaml deleted file mode 100644 index 48770b9e..00000000 --- a/config/rbac/watchrule_viewer_role.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# This rule is not used by the project gitops-reverser itself. -# It is provided to allow the cluster admin to help manage permissions for users. -# -# Grants read-only access to configbutler.ai resources. -# This role is intended for users who need visibility into these resources -# without permissions to modify them. 
It is ideal for monitoring purposes and limited-access viewing. - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: watchrule-viewer-role -rules: -- apiGroups: - - configbutler.ai - resources: - - watchrules - verbs: - - get - - list - - watch -- apiGroups: - - configbutler.ai - resources: - - watchrules/status - verbs: - - get diff --git a/config/samples/clusterwatchrule.yaml b/config/samples/clusterwatchrule.yaml deleted file mode 100644 index d8d08a2b..00000000 --- a/config/samples/clusterwatchrule.yaml +++ /dev/null @@ -1,33 +0,0 @@ -apiVersion: configbutler.ai/v1alpha1 -kind: ClusterWatchRule -metadata: - name: clusterwatchrule-sample -spec: - gitProviderRef: - name: sample - namespace: gitops-reverser-system - rules: - # Rule 1: Watch cluster-scoped resources (Nodes) - - scope: Cluster - operations: [CREATE, UPDATE, DELETE] - apiGroups: [""] - resources: [nodes] - - # Rule 2: Watch cluster-scoped RBAC resources - - scope: Cluster - apiGroups: [rbac.authorization.k8s.io] - resources: [clusterroles, clusterrolebindings] - - # Rule 3: Watch Deployments in ALL namespaces - - scope: Namespaced - apiGroups: [apps] - resources: [deployments] - # No namespaceSelector = all namespaces - - # Rule 4: Watch Secrets only in PCI-compliant namespaces - - scope: Namespaced - apiGroups: [""] - resources: [secrets] - namespaceSelector: - matchLabels: - compliance: pci diff --git a/config/samples/gitprovider.yaml b/config/samples/gitprovider.yaml deleted file mode 100644 index e67c71ed..00000000 --- a/config/samples/gitprovider.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: configbutler.ai/v1alpha1 -kind: GitProvider -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: sample -spec: - repoUrl: "http://gitea-http.gitea-e2e.svc.cluster.local:13000/testorg/testrepo.git" - allowedBranches: - - 
"main" - secretRef: - name: "git-creds" - push: - interval: "1m" - maxCommits: 20 diff --git a/config/samples/gittarget.yaml b/config/samples/gittarget.yaml deleted file mode 100644 index f7a18024..00000000 --- a/config/samples/gittarget.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: configbutler.ai/v1alpha1 -kind: GitTarget -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: sample - namespace: default -spec: - gitProviderRef: - name: gitrepoconfig-sample - branch: main - baseFolder: clusters/default diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml deleted file mode 100644 index eacc5f8d..00000000 --- a/config/samples/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ -## Append samples of your project ## -resources: - - clusterwatchrule.yaml - - watchrule.yaml - - gittarget.yaml - - gitprovider.yaml -# +kubebuilder:scaffold:manifestskustomizesamples diff --git a/config/samples/watchrule.yaml b/config/samples/watchrule.yaml deleted file mode 100644 index 116bdaa3..00000000 --- a/config/samples/watchrule.yaml +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: configbutler.ai/v1alpha1 -kind: WatchRule -metadata: - name: watchrule-sample - namespace: default -spec: - # Reference to GitRepoConfig - # If namespace is not specified, defaults to WatchRule's namespace - gitRepoConfigRef: - name: gitrepoconfig-sample - # namespace: default # Optional - defaults to WatchRule's namespace - - # Optional: Filter resources by labels - # This example includes resources with app=production and excludes those with ignore label - objectSelector: - matchExpressions: - - key: app - operator: In - values: [production] - - key: gitops-reverser.io/ignore - operator: DoesNotExist - - # Define which resources to watch (logical OR - matching ANY rule triggers watch) - rules: - # Watch config resources on CREATE and UPDATE (ignore DELETE) - - operations: [CREATE, UPDATE] - apiGroups: [""] # Core API 
group - apiVersions: ["v1"] - resources: [configmaps, secrets] - - # Watch all operations for app resources - - operations: [CREATE, UPDATE, DELETE] - apiGroups: [apps] - apiVersions: ["v1"] - resources: [deployments, statefulsets] - - # Watch custom resources (all operations, all versions) - - apiGroups: [example.com] - resources: [myapps] diff --git a/config/service.yaml b/config/service.yaml new file mode 100644 index 00000000..8636109e --- /dev/null +++ b/config/service.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/name: gitops-reverser + name: gitops-reverser-service + namespace: sut +spec: + clusterIP: 10.96.200.200 # This is required because kube-apiserver starts before CoreDNS (so we use a fixed address) + ports: + - name: admission + port: 9443 + protocol: TCP + targetPort: 9443 + - name: audit + port: 9444 + protocol: TCP + targetPort: 9444 + - name: metrics + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + app.kubernetes.io/name: gitops-reverser + control-plane: controller-manager diff --git a/config/webhook.yaml b/config/webhook.yaml new file mode 100644 index 00000000..8d3afdb4 --- /dev/null +++ b/config/webhook.yaml @@ -0,0 +1,30 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + annotations: + cert-manager.io/inject-ca-from: sut/gitops-reverser-admission-server-cert + name: gitops-reverser-validating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: gitops-reverser-service + namespace: sut + port: 9443 + path: /process-validating-webhook + failurePolicy: Ignore + name: gitops-reverser.configbutler.ai + rules: + - apiGroups: + - '*' + apiVersions: + - '*' + operations: + - CREATE + - UPDATE + - DELETE + resources: + - '*' + sideEffects: None + diff --git a/config/webhook/kustomization.yaml b/config/webhook/kustomization.yaml deleted file mode 100644 
index 9cf26134..00000000 --- a/config/webhook/kustomization.yaml +++ /dev/null @@ -1,6 +0,0 @@ -resources: -- manifests.yaml -- service.yaml - -configurations: -- kustomizeconfig.yaml diff --git a/config/webhook/kustomizeconfig.yaml b/config/webhook/kustomizeconfig.yaml deleted file mode 100644 index 206316e5..00000000 --- a/config/webhook/kustomizeconfig.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# the following config is for teaching kustomize where to look at when substituting nameReference. -# It requires kustomize v2.1.0 or newer to work properly. -nameReference: -- kind: Service - version: v1 - fieldSpecs: - - kind: MutatingWebhookConfiguration - group: admissionregistration.k8s.io - path: webhooks/clientConfig/service/name - - kind: ValidatingWebhookConfiguration - group: admissionregistration.k8s.io - path: webhooks/clientConfig/service/name - -namespace: -- kind: MutatingWebhookConfiguration - group: admissionregistration.k8s.io - path: webhooks/clientConfig/service/namespace - create: true -- kind: ValidatingWebhookConfiguration - group: admissionregistration.k8s.io - path: webhooks/clientConfig/service/namespace - create: true diff --git a/config/webhook/service.yaml b/config/webhook/service.yaml deleted file mode 100644 index 3a39ec12..00000000 --- a/config/webhook/service.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/name: gitops-reverser - app.kubernetes.io/managed-by: kustomize - name: webhook-service - namespace: system -spec: - ports: - - port: 443 - protocol: TCP - targetPort: 9443 - selector: - control-plane: controller-manager - app.kubernetes.io/name: gitops-reverser - role: leader diff --git a/docs/SOPS_ENCRYPTION_PLAN.md b/docs/SOPS_ENCRYPTION_PLAN.md new file mode 100644 index 00000000..45115c40 --- /dev/null +++ b/docs/SOPS_ENCRYPTION_PLAN.md @@ -0,0 +1,248 @@ +# SOPS Encryption Plan For Git Writes + +## Goal + +Encrypt sensitive Kubernetes resources (initially `Secret`) with SOPS 
before they are written to the Git worktree, so commits contain encrypted payloads instead of plaintext `data`/`stringData`. + +## Scope + +- In scope: + - Encrypt on write path (watch event -> sanitize -> git file write). + - Support SOPS execution strategy for encryption (external binary first iteration). + - Add runtime configuration for enablement, policy, and SOPS invocation. + - Support standard SOPS key backends through mounted credentials/config. + - Tests, docs, and Helm wiring. +- Out of scope (first iteration): + - Decryption in controller runtime. + - Re-encrypting existing historical commits. + - Complex per-namespace/per-rule encryption policies. + +## Current Baseline (Why this is needed) + +- `internal/sanitize/sanitize.go` preserves `data` and `binaryData`. +- `internal/watch/informers.go` enqueues sanitized objects as-is. +- `internal/git/git.go` writes YAML generated from event object directly to disk. +- Result: if a `WatchRule` includes `secrets` (or `*`), secret payloads are committed in plaintext. + +## High-Level Design + +### 1. Encryption Hook Point + +Add encryption at the final write stage in `internal/git/git.go` inside `handleCreateOrUpdateOperation`: + +1. Generate ordered YAML from sanitized object (existing behavior). +2. Apply encryption policy: + - If resource should be encrypted, run SOPS encryption. + - If not, keep plaintext YAML. +3. Continue with existing file compare/write/stage logic. + +This keeps upstream watch/sanitize flow unchanged and centralizes git-output guarantees. + +### 2. Encryption Policy + +Introduce explicit policy config (controller process-level first): + +- `disabled` (default for backward compatibility). +- `secretsOnly` (recommended default when enabled). +- `matchResources` (future): configurable list of `(group, version, resource)` patterns. + +Initial policy decision: +- Encrypt only Kubernetes `Secret` resources (`group=""`, `version="v1"`, `resource="secrets"`). + +### 3. 
SOPS Invocation Model + +Use external SOPS binary for first implementation, invoked by the manager process. + +Proposed approach: + +1. Write plaintext YAML to a secure temp file in `/tmp`. +2. Run SOPS command to produce encrypted YAML. +3. Read encrypted output and remove temp files. +4. Write encrypted output to repo path. + +Command strategy: +- Prefer `.sops.yaml`-driven encryption rules. +- Allow optional explicit args passthrough only through an allowlist (for example output/input type and config path), not arbitrary raw flags. + +Failure behavior (configurable): +- `failClosed` (recommended): do not write/commit if encryption fails. +- `failOpen` (optional): log error and write plaintext (not recommended for production). + +### 3a. Architecture Choice: External Binary vs Embedded Library + +This should be an explicit engineering decision, not a hidden assumption. + +Option A: External SOPS binary (first iteration) +- Pros: + - Reuses upstream SOPS behavior exactly (CLI parity with existing workflows). + - Faster implementation and lower maintenance in this codebase. + - Keeps cloud KMS/age/PGP backend behavior aligned with standard SOPS usage. +- Cons: + - Extra process spawn overhead per encrypted object. + - Runtime dependency management (binary presence, version pinning, CVE tracking). + +Option B: Embed encryption implementation directly in `gitops-reverser` +- Pros: + - No external process execution; simpler runtime dependency surface. + - Potentially better performance and tighter observability hooks. +- Cons: + - Higher implementation and long-term maintenance cost. + - Risk of behavior drift from upstream SOPS semantics and config handling. + - More complex support burden across key backends. + +Decision for this plan: +- Implement Option A first (external binary), behind feature flags. +- Keep abstraction boundary (`Encryptor` interface) so Option B can be added later without reworking git write flow. 
+ +Revisit triggers: +- Encryption latency becomes a measurable bottleneck. +- Operational burden from binary distribution/versioning is high. +- There is a strong requirement for in-process crypto execution. + +### 4. Runtime Config Model + +Add manager flags + Helm values for encryption: + +- `--encryption-enabled` +- `--encryption-policy=secretsOnly|disabled` +- `--encryption-provider=sops` +- `--sops-binary-path=/usr/local/bin/sops` +- `--sops-config-path=/etc/sops/.sops.yaml` (optional) +- `--encryption-failure-policy=failClosed|failOpen` + +Configuration precedence: +- `--encryption-enabled=false` always disables encryption regardless of policy value. +- `--encryption-enabled=true` requires a non-`disabled` policy. +- Invalid combinations should fail startup with a clear validation error. + +Helm values section proposal: + +```yaml +encryption: + enabled: false + policy: secretsOnly + failurePolicy: failClosed + sops: + binaryPath: /usr/local/bin/sops + configPath: /etc/sops/.sops.yaml +``` + +### 5. Key Material / Backend Configuration + +Do not invent key management inside the operator. Reuse native SOPS backends: + +- `age` via mounted secret and `SOPS_AGE_KEY_FILE`. +- cloud KMS via workload identity / IAM env (AWS/GCP/Azure). +- PGP if needed (lower priority). + +Helm should support: + +- Extra volume mounts for key files and `.sops.yaml`. +- Extra env vars for SOPS backend configuration. + +## Implementation Phases + +## Phase 1: Core plumbing (code-only, no encryption yet) + +- Add `EncryptionConfig` struct and wire it from `cmd/main.go` into git worker path. +- Add policy evaluator utility (`shouldEncrypt(event)`). +- Add unit tests for policy decisions. + +Deliverable: +- Feature-flagged no-op framework merged. + +## Phase 2: SOPS binary integration + +- Implement `SOPSEncryptor` (interface + concrete implementation). +- Integrate into `handleCreateOrUpdateOperation` before file write. +- Implement temp-file execution with strict permissions. 
+- Add structured logging and metrics: + - encrypt attempts + - encrypt success/failure + - fail-open count + +Deliverable: +- Functional encryption when enabled and policy matches. + +## Phase 3: Packaging and Helm configuration + +- Update `Dockerfile` multi-stage build: + - Add stage to fetch pinned SOPS release binary. + - Copy binary into final distroless image (e.g. `/usr/local/bin/sops`). +- Update chart: + - New `encryption.*` values. + - Add manager args from values. + - Document volume/env examples for keys and `.sops.yaml`. + +Deliverable: +- Deployable encrypted workflow via Helm settings. + +## Phase 4: Test coverage + +- Unit tests: + - `Secret` gets encrypted. + - non-secret not encrypted (policy `secretsOnly`). + - encryption failure with `failClosed` blocks write. + - encryption failure with `failOpen` writes plaintext and emits warning metric. + - invalid flag combinations are rejected at config validation time. +- Integration tests (git operations): + - verify resulting file contains SOPS envelope fields and no raw secret values. +- Optional e2e: + - run with local age key and assert encrypted commits. + +Deliverable: +- CI coverage for happy path and failure modes. + +## Phase 5: Documentation and migration + +- Update `README.md` and chart README with: + - enabling encryption + - key backend setup examples + - operational caveats +- Add migration note: + - existing plaintext history remains in git; requires manual history rewrite if needed. + +Deliverable: +- Operator docs for secure rollout. + +## Security Considerations + +- Default to `failClosed` when encryption is enabled. +- Treat `failOpen` as development-only or break-glass behavior. +- Ensure temp files are `0600` and cleaned up. +- Ensure temp-file cleanup runs on both success and failure paths. +- Avoid logging plaintext content. +- Prefer `age` or cloud KMS over static PGP workflows. 
+- Recommend separate repos/branches for encrypted outputs when integrating with downstream GitOps tools. + +## Operational Considerations + +- Performance: + - SOPS process spawn per encrypted object adds overhead. + - Mitigation: keep policy narrow (`secretsOnly`) and batch commit behavior unchanged. +- Determinism: + - SOPS metadata may vary; deduplication currently happens pre-write on sanitized plaintext. + - This is acceptable for first iteration but should be documented. +- Compatibility: + - Downstream consumers (Flux/Argo) must be configured for SOPS decryption if they deploy encrypted files. + +## Proposed Acceptance Criteria + +- When enabled with `secretsOnly`, committed Secret manifests are SOPS-encrypted and plaintext secret values never appear in repo files. +- Non-secret resources continue to be committed as before. +- If SOPS is missing or misconfigured: + - `failClosed`: write is rejected and error surfaced. + - `failOpen`: plaintext write proceeds with explicit warning/metric (non-production only). +- If invalid encryption configuration is provided, manager startup fails with actionable error output. +- Helm users can: + - enable encryption + - mount key/config material + - point to SOPS binary/config path without rebuilding chart templates manually. + +## Suggested Rollout + +1. Merge framework + binary integration behind feature flag (disabled by default). +2. Run in staging with `enabled=true`, `policy=secretsOnly`, `failClosed`. +3. Validate commit contents and operational metrics. +4. Roll to production. +5. Optionally extend policy beyond `Secret` after proving stability. 
diff --git a/docs/audit-setup/cluster/audit/webhook-config.yaml b/docs/audit-setup/cluster/audit/webhook-config.yaml index 413e3bd4..1c25c18b 100644 --- a/docs/audit-setup/cluster/audit/webhook-config.yaml +++ b/docs/audit-setup/cluster/audit/webhook-config.yaml @@ -6,7 +6,7 @@ clusters: - name: audit-webhook cluster: # Use the ClusterIP, but with HTTPS - server: https://10.43.200.200:443/audit-webhook + server: https://10.43.200.200:443/audit-webhook/my-cluster # We could also configure an network wide ingress with it's own cert, but it's better if it has a small surface insecure-skip-tls-verify: true # base64content = kubectl get secret -n -o jsonpath='{.data.ca\.crt}' diff --git a/docs/ci/CI_NON_ROOT_USER_ANALYSIS.md b/docs/ci/CI_NON_ROOT_USER_ANALYSIS.md index b2476fad..39e1366d 100644 --- a/docs/ci/CI_NON_ROOT_USER_ANALYSIS.md +++ b/docs/ci/CI_NON_ROOT_USER_ANALYSIS.md @@ -61,7 +61,7 @@ ```bash # Current CI workflow mounts: -v $HOME/.kube:/root/.kube # ← Would need to change to non-root home --v ${{ github.workspace }}:/workspace # ← Ownership mismatches +-v ${{ github.workspace }}:/__w/... 
# ← Ownership mismatches ``` ❌ **GitHub Actions Checkout Complications** @@ -71,14 +71,14 @@ ❌ **Docker-in-Docker Challenges** ```yaml -# E2E tests use Docker socket ---network host --v $HOME/.kube:/root/.kube # ← Root path assumptions +# E2E tests use Docker access and kubeconfig paths +-v /var/run/docker.sock:/var/run/docker.sock +-v $HOME/.kube:/root/.kube # ← Root-home assumptions ``` ❌ **Cache and Artifact Permissions** - Go module cache (`/go/pkg/mod`) -- Build artifacts in `/workspace` +- Build artifacts in mounted workspace paths - GitHub Actions cache restoration - All would need careful permission management @@ -291,4 +291,4 @@ Switch to non-root CI only if: - [`GO_MODULE_PERMISSIONS.md`](GO_MODULE_PERMISSIONS.md) - How we solved dev container permissions - [`WINDOWS_DEVCONTAINER_SETUP.md`](WINDOWS_DEVCONTAINER_SETUP.md) - Windows-specific permission handling - [`.devcontainer/Dockerfile`](../.devcontainer/Dockerfile) - Current implementation -- [`.github/workflows/ci.yml`](../.github/workflows/ci.yml) - CI pipeline configuration \ No newline at end of file +- [`.github/workflows/ci.yml`](../.github/workflows/ci.yml) - CI pipeline configuration diff --git a/docs/ci/E2E_IMAGE_ARTIFACT_REUSE_DESIGN.md b/docs/ci/E2E_IMAGE_ARTIFACT_REUSE_DESIGN.md new file mode 100644 index 00000000..d4c8857d --- /dev/null +++ b/docs/ci/E2E_IMAGE_ARTIFACT_REUSE_DESIGN.md @@ -0,0 +1,154 @@ +# E2E Image and Artifact Reuse Design + +## Status +- State: Implemented +- Scope: CI e2e, CI install smoke (helm + manifest), devcontainer/local e2e, IDE direct e2e runs + +## Problem Statement +We need one behavior model that satisfies these constraints: +- CI must reuse artifacts built earlier in the pipeline (image, packaged Helm chart, generated `dist/install.yaml`). +- Local/devcontainer runs should be easy (`make test-e2e`) and should auto-build a local image when no prebuilt image is provided. 
+- IDE/debugger runs (`go test ./test/e2e/...`) should remain usable without manual pre-steps. +- Image selection logic should be centralized and avoid duplicated implementation. + +## Goals +- Use a single decision input: `PROJECT_IMAGE`. +- If `PROJECT_IMAGE` is set: reuse it and do not rebuild. +- If `PROJECT_IMAGE` is not set: build/load local image once per run path. +- Keep orchestration primarily in `Makefile`. +- Keep Go `BeforeSuite` as IDE fallback only. +- Keep cluster behavior explicit: + - `test-e2e` reuses existing cluster state (fast path). + - install smoke local fallback path performs clean install validation. + +## Non-Goals +- No digest-aware image override logic for Helm values beyond repository/tag split. +- No new CI jobs or artifact formats. +- No changes to release publishing. + +## Decision Model +`PROJECT_IMAGE` is the source of truth: +- `PROJECT_IMAGE` present: + - Treat as prebuilt image. + - Skip local image build/load steps. + - Skip cluster cleanup in install smoke. + - Inject into test/install flows. +- `PROJECT_IMAGE` absent: + - Use local fallback image `$(E2E_LOCAL_IMAGE)` (`gitops-reverser:e2e-local` by default). + - Build image locally and load it into Kind. + - For install smoke, clean cluster first to validate clean install behavior. + - Use that image for test/install flows. + +## Execution Flows + +### 1) CI: e2e test suite +- Workflow passes `PROJECT_IMAGE` from `docker-build` output. +- `make test-e2e` sees `PROJECT_IMAGE` and skips rebuild/load. +- Go tests run with that exact image. +- Cluster is reused (no cleanup in this target). + +Outcome: +- No duplicate image builds in CI. + +### 2) CI: install smoke (`helm` and `manifest`) +- Workflow reuses release bundle artifact (`gitops-reverser.tgz`, `dist/install.yaml`). +- Workflow passes the same prebuilt `PROJECT_IMAGE`. +- `make test-e2e-install-helm` and `make test-e2e-install-manifest` skip cluster cleanup and local image rebuild/load. 
+- Helm mode injects repository/tag via `--set image.repository` and `--set image.tag`. +- Manifest mode applies `dist/install.yaml`, then overrides deployment image via `kubectl set image`. +- `test-e2e-install-manifest` does not regenerate `dist/install.yaml` when `PROJECT_IMAGE` is set. + +Outcome: +- Reuse of both chart/manifest artifacts and prebuilt image in CI. + +### 3) Devcontainer/local: full e2e via Make +- Run `make test-e2e` with no `PROJECT_IMAGE`. +- Make reuses existing cluster. +- Make auto-builds and Kind-loads `$(E2E_LOCAL_IMAGE)`, then runs tests with it. + +Outcome: +- Single command, no manual image prep. + +### 4) Devcontainer/local: install smoke via Make +- Run `make test-e2e-install-helm` or `make test-e2e-install-manifest` with no `PROJECT_IMAGE`. +- Make cleans cluster first (clean-install validation), then sets up e2e infra. +- Make auto-builds and Kind-loads `$(E2E_LOCAL_IMAGE)`, then runs smoke install using that image. +- For `test-e2e-install-manifest`, `build-installer` is run first to regenerate `dist/install.yaml`. + +Outcome: +- Automatic local behavior with explicit clean-install validation for smoke tests. + +### 5) IDE/debugger direct Go run +- Run `go test ./test/e2e/...` directly (no Make entrypoint). +- `BeforeSuite` checks `PROJECT_IMAGE`. +- If missing, it calls Make targets to prepare cluster + local image. + +Outcome: +- IDE path works without requiring developers to remember pre-steps. + +## Implementation Mapping + +### Makefile +- `E2E_LOCAL_IMAGE`: single local fallback image variable. +- `e2e-build-load-image`: local image build + Kind load. +- `test-e2e`: reuses cluster; branches image behavior based on `PROJECT_IMAGE`. +- `test-e2e-install`: shared install-smoke entry with `PROJECT_IMAGE` branching: + - prebuilt image path: skip cleanup. + - local fallback path: cleanup cluster, setup infra, build/load local image. +- `test-e2e-install-helm`: wrapper to `test-e2e-install`. 
+- `test-e2e-install-manifest`: + - local path: run `build-installer` first. + - prebuilt path: use existing manifest artifact. + +### Go (`test/e2e/e2e_suite_test.go`) +- `BeforeSuite`: + - if `PROJECT_IMAGE` is set: no prep + - else: call Make for cluster/image prep (IDE fallback) + +### Kind cluster bootstrap (`test/e2e/kind/start-cluster.sh`) +- Reuses existing Kind cluster if present (no delete/recreate in script). +- Creates cluster only when missing. +- Still exports/re-writes kubeconfig endpoint for devcontainer networking. + +### Smoke script (`test/e2e/scripts/install-smoke.sh`) +- Helm mode: + - parse `PROJECT_IMAGE` into repo/tag and override chart values. +- Manifest mode: + - apply `dist/install.yaml` + - if `PROJECT_IMAGE` set, patch deployment image with `kubectl set image`. +- Readiness/diagnostics selector: + - derive pod selector dynamically from `deployment/gitops-reverser` `.spec.selector.matchLabels`. + - avoid hardcoded label assumptions across helm/manifest paths. + +## Why This Split +- Makefile remains the main orchestration layer. +- Go keeps a minimal safety-net role for IDE/direct execution. +- CI avoids redundant work by honoring prebuilt artifacts and prebuilt image. + +## Tradeoffs +- We keep a small amount of orchestration in two places (Make + Go fallback), but avoid duplicated image build logic. +- Manifest image override happens post-apply (`kubectl set image`) rather than regenerating `dist/install.yaml` per image. 
+ +## Command Matrix +- CI e2e: `PROJECT_IMAGE= make test-e2e` +- CI smoke helm: `PROJECT_IMAGE= make test-e2e-install-helm` +- CI smoke manifest: `PROJECT_IMAGE= make test-e2e-install-manifest` +- Local full e2e: `make test-e2e` +- Local smoke helm: `make test-e2e-install-helm` +- Local smoke manifest: `make test-e2e-install-manifest` +- IDE direct: `go test ./test/e2e/...` + +## Failure Modes and Diagnostics +- Wrong image in pods: + - Check deployment image: `kubectl -n gitops-reverser get deploy gitops-reverser -o yaml | rg image:` +- Image pull failures in Kind: + - Ensure local build/load ran or `PROJECT_IMAGE` points to reachable registry. +- Manifest smoke using stale image: + - Local path: verify `build-installer` ran before smoke target. + - CI/prebuilt path: verify artifact source and `kubectl set image` override message. +- Pod readiness says "no matching resources": + - Verify selector in smoke logs (`Pod selector: ...`) and deployment selector labels. + +## Future Improvements +- Add a small shared Make macro/helper to reduce repeated `PROJECT_IMAGE` branching across e2e entrypoints. +- Optionally add an explicit `E2E_AUTO_PREPARE_IMAGE=false` switch for strict mode in advanced local workflows. diff --git a/docs/ci/FINDINGS.md b/docs/ci/FINDINGS.md new file mode 100644 index 00000000..2df67b0c --- /dev/null +++ b/docs/ci/FINDINGS.md @@ -0,0 +1,107 @@ +## CI/Devcontainer Findings (Current Baseline) + +Last updated: 2026-02-13 + +This folder documents why the repository uses its current devcontainer and CI behavior, especially around Go caches, workspace paths, and Kind access from inside the container. + +### 1) Workspace path model + +Current devcontainer intentionally uses: + +- `workspaceMount`: `source=${localWorkspaceFolder},target=/workspaces/${localWorkspaceFolderBasename},type=bind` +- `workspaceFolder`: `/workspaces/${localWorkspaceFolderBasename}` + +Implications: + +- Active source tree is `/workspaces/`. 
+- `/workspace` may exist in image layers, but it is not the active bind mount for day-to-day development in this repo. + +### 2) Post-create ownership model + +`devcontainer.json` runs: + +```json +"postCreateCommand": "bash .devcontainer/post-create.sh '${containerWorkspaceFolder}'" +``` + +The script resolves the workspace path dynamically and fixes ownership for: + +- the mounted workspace +- `/home/vscode` cache areas used by tools + +This avoids hardcoded path assumptions and keeps Linux/macOS/Windows setups more consistent. + +### 3) Go cache persistence model + +The repository persists heavy Go caches using named Docker volumes: + +- `/go/pkg/mod` (`gomodcache`) +- `/home/vscode/.cache/go-build` (`gobuildcache`) + +Why: + +- Faster rebuild/reopen cycles +- Stable module/build caching independent of repo bind mount +- Fewer permission regressions than putting caches in the workspace tree + +### 4) Kind + kubectl access model inside devcontainer + +The current working model is: + +- Devcontainer does **not** use `--network=host` +- Devcontainer run args include: + - `--group-add=docker` + - `--add-host=host.docker.internal:host-gateway` +- Kind cluster config sets: + - `networking.apiServerAddress: "0.0.0.0"` +- `test/e2e/kind/start-cluster.sh` rewrites kubeconfig server endpoints from + `127.0.0.1|localhost|0.0.0.0` to `host.docker.internal:<port>` and sets + `tls-server-name=localhost` + +Why this is required: + +- If Docker publishes Kind API server on host loopback (`127.0.0.1`), it is not reachable via `host.docker.internal` from the container. +- Binding on `0.0.0.0` plus kubeconfig rewrite makes in-container `kubectl` stable without host networking. 
+ +### 5) CI root vs non-root stance + +Current recommendation remains: + +- CI build containers can run as root (ephemeral build context) +- Production runtime must run non-root (already implemented) + +Rationale: + +- Keeps CI simpler and less fragile +- Avoids unnecessary permission workarounds +- Preserves security boundary at runtime where it matters most + +### 6) Git safe.directory note + +`safe.directory` in CI is a normal response to UID mismatch between checkout ownership and container process user. This is not, by itself, evidence that CI must be non-root. + +### 7) Practical verification checklist + +After devcontainer rebuild/reopen: + +```bash +# 1) Kind setup +make setup-cluster + +# 2) Confirm API publish bind (expected 0.0.0.0 or ::) +docker inspect gitops-reverser-test-e2e-control-plane --format '{{json .NetworkSettings.Ports}}' + +# 3) Confirm kubeconfig server rewrite +kubectl config view --minify | sed -n '/server:/p;/tls-server-name:/p' + +# 4) Confirm cluster access +kubectl get nodes +``` + +### 8) Related docs in this folder + +- `KUBECTL_TLS_DEBUG_REPORT.md` - incident timeline and final fix +- `GO_MODULE_PERMISSIONS.md` - why `/go` permissions are managed with shared group + ACLs +- `WINDOWS_DEVCONTAINER_SETUP.md` - Windows-specific mount behavior and expected differences +- `CI_NON_ROOT_USER_ANALYSIS.md` - tradeoffs for CI user model +- `GIT_SAFE_DIRECTORY_EXPLAINED.md` - why `safe.directory` is required in containerized CI diff --git a/docs/ci/GIT_SAFE_DIRECTORY_EXPLAINED.md b/docs/ci/GIT_SAFE_DIRECTORY_EXPLAINED.md index 45c58e43..4d6726fb 100644 --- a/docs/ci/GIT_SAFE_DIRECTORY_EXPLAINED.md +++ b/docs/ci/GIT_SAFE_DIRECTORY_EXPLAINED.md @@ -168,7 +168,7 @@ git config --global --add safe.directory '*' **Better:** Explicitly list trusted paths ```bash -git config --global --add safe.directory /workspace +git config --global --add safe.directory /workspaces/ git config --global --add safe.directory /__w/gitops-reverser/gitops-reverser ``` 
@@ -290,12 +290,14 @@ jobs: **devcontainer.json:** ```json { - "remoteUser": "root", - "postCreateCommand": "git config --global --add safe.directory /workspace" + "remoteUser": "vscode", + "postCreateCommand": "git config --global --add safe.directory ${containerWorkspaceFolder}" } ``` -**Why:** VS Code mounts workspace (owned by host user) into container (running as root) +**Why:** VS Code mounts host workspace into container and ownership can differ from the active user. + +In this repository, local devcontainer flows usually run as `vscode` and often do not need manual `safe.directory`. CI container jobs are the primary place where this setting is required. ### Example 3: Docker Compose Development @@ -305,10 +307,10 @@ services: dev: image: golang:1.25 volumes: - - .:/workspace # Host files β†’ container + - .:/workspaces/ # Host files β†’ container command: | sh -c " - git config --global --add safe.directory /workspace + git config --global --add safe.directory /workspaces/ make test " ``` @@ -333,11 +335,11 @@ services: ```bash # Test in container -docker run --rm -v $(pwd):/workspace golang:1.25 sh -c " - cd /workspace +docker run --rm -v $(pwd):/workspaces/ golang:1.25 sh -c " + cd /workspaces/ git status # Should fail - git config --global --add safe.directory /workspace + git config --global --add safe.directory /workspaces/ git status # Should work " ``` @@ -349,7 +351,7 @@ docker run --rm -v $(pwd):/workspace golang:1.25 sh -c " git config --global --get-all safe.directory # Output example: -/workspace +/workspaces/ /__w/gitops-reverser/gitops-reverser ``` @@ -357,7 +359,7 @@ git config --global --get-all safe.directory ```bash # Remove specific directory -git config --global --unset-all safe.directory /workspace +git config --global --unset-all safe.directory /workspaces/ # Remove all git config --global --remove-section safe @@ -418,4 +420,4 @@ To add an exception for this directory, call: This tells Git: "I trust this specific repository despite 
the UID mismatch, because I know it's safe in this ephemeral CI container environment." -**It's a pragmatic security trade-off that makes sense in containerized workflows!** \ No newline at end of file +**It's a pragmatic security trade-off that makes sense in containerized workflows!** diff --git a/docs/ci/GO_MODULE_PERMISSIONS.md b/docs/ci/GO_MODULE_PERMISSIONS.md index 600fe7db..7badcbcc 100644 --- a/docs/ci/GO_MODULE_PERMISSIONS.md +++ b/docs/ci/GO_MODULE_PERMISSIONS.md @@ -11,11 +11,11 @@ **If you're on Windows and experiencing permission issues with the workspace directory**, see [`WINDOWS_DEVCONTAINER_SETUP.md`](WINDOWS_DEVCONTAINER_SETUP.md) for Windows-specific guidance. -The ACL solution described in this document works perfectly for the `/go` directory (container filesystem) but **does not apply to the `/workspace` directory when mounted from Windows**. Windows filesystems don't support Linux ACLs, so a different approach is needed for the workspace. +The ACL solution described in this document works perfectly for the `/go` directory (container filesystem) but **does not apply to the `/workspaces/` directory when mounted from Windows**. Windows filesystems don't support Linux ACLs, so a different approach is needed for the workspace. **TL;DR for Windows users:** - The `/go` directory (Go modules cache) works fine with ACLs βœ… -- The `/workspace` directory (your code) needs the `postCreateCommand` fix βœ… +- The `/workspaces/` directory (your code) relies on the post-create ownership fix βœ… - Best solution: Use WSL2 and clone the repo inside WSL2 for full Linux compatibility ## Correct Implementation Order @@ -302,4 +302,4 @@ drwxrwsr-x+ root godev /go/pkg/mod/newdir # Files created in newdir will now correctly inherit godev group ``` -This is why the solution requires **both** setgid and ACLs working together. \ No newline at end of file +This is why the solution requires **both** setgid and ACLs working together. 
diff --git a/docs/ci/KUBECTL_TLS_DEBUG_REPORT.md b/docs/ci/KUBECTL_TLS_DEBUG_REPORT.md new file mode 100644 index 00000000..d5e07a99 --- /dev/null +++ b/docs/ci/KUBECTL_TLS_DEBUG_REPORT.md @@ -0,0 +1,81 @@ +# kubectl/Kind Connectivity Debug Report (Devcontainer) + +Date: 2026-02-13 +Scope: Why `kubectl get nodes` failed inside devcontainer after Kind cluster creation, and what fixed it. + +## Symptom + +Inside devcontainer, after `make setup-cluster`: + +```bash +kubectl get nodes +``` + +failed with connection errors to `host.docker.internal:<port>`. + +## What We Observed + +1. Kind cluster creation succeeded and reported healthy control plane. +2. Kubeconfig rewrite logic changed server endpoint from `127.0.0.1:<port>` to `host.docker.internal:<port>`. +3. Docker port publish for Kind control-plane showed loopback-only host bind: + +```text +"6443/tcp":[{"HostIp":"127.0.0.1","HostPort":"<port>"}] +``` + +## Root Cause + +The kubeconfig rewrite alone was not enough. + +When Kind publishes API server on host loopback (`127.0.0.1`), that port is reachable from the host itself but not from another container via `host.docker.internal`. + +So the devcontainer tried to connect to `host.docker.internal:<port>`, but host had that port bound only to loopback, resulting in connection refused. + +## Final Fix Applied + +### A) Devcontainer networking model + +In `.devcontainer/devcontainer.json`: + +- removed `--network=host` +- kept `--group-add=docker` +- added `--add-host=host.docker.internal:host-gateway` + +### B) Kind API server bind address + +In `test/e2e/kind/cluster-template.yaml`: + +```yaml +networking: + apiServerAddress: "0.0.0.0" +``` + +This ensures host publish is reachable from devcontainer via `host.docker.internal`. 
+ +### C) Kubeconfig rewrite in cluster setup script + +In `test/e2e/kind/start-cluster.sh`, after `kind export kubeconfig`: + +- detect kubeconfig server host in `{127.0.0.1, localhost, 0.0.0.0}` +- rewrite to `https://host.docker.internal:<port>` +- set `tls-server-name=localhost` + +## Verification Steps + +```bash +make setup-cluster +docker inspect gitops-reverser-test-e2e-control-plane --format '{{json .NetworkSettings.Ports}}' +kubectl config view --minify | sed -n '/server:/p;/tls-server-name:/p' +kubectl get nodes +``` + +Expected: + +- Docker `HostIp` for `6443/tcp` is `0.0.0.0` or `::` +- kubeconfig server points to `https://host.docker.internal:<port>` +- `tls-server-name: localhost` is set +- `kubectl get nodes` succeeds inside devcontainer + +## Why We Kept This Design + +This removes the need for `--network=host` while keeping Kind management from inside the devcontainer working reliably. It is easier to reason about, more explicit, and avoids host-network side effects. diff --git a/docs/ci/PROMETHEUS_E2E_HELM_EVALUATION.md b/docs/ci/PROMETHEUS_E2E_HELM_EVALUATION.md new file mode 100644 index 00000000..4d607b35 --- /dev/null +++ b/docs/ci/PROMETHEUS_E2E_HELM_EVALUATION.md @@ -0,0 +1,75 @@ +# Prometheus E2E Helm/ServiceMonitor Evaluation + +## Status +- Date: 2026-02-13 +- Decision: **Do not migrate now** +- Scope: `test/e2e` Prometheus setup only + +## Context +Current e2e Prometheus setup is manifest/script based: +- setup entrypoint: `Makefile` target `setup-prometheus-e2e` +- deploy script: `test/e2e/scripts/setup-prometheus.sh` +- manifests: `test/e2e/prometheus/deployment.yaml`, `test/e2e/prometheus/rbac.yaml` + +Current tests also assume specific Prometheus naming/labels and scrape job naming: +- pod label wait: `app=prometheus` +- service port-forward target: `svc/prometheus:19090` +- PromQL assertions: `job='gitops-reverser-metrics'` + +## Evaluated Plan + +### Option A: Move to standalone Prometheus Helm chart +Use `prometheus-community/prometheus` 
with pinned chart version, e2e values file, and Helm lifecycle (`upgrade`, revision history, rollback). + +Required changes: +- Replace manifest apply/delete flow in `Makefile` Prometheus targets with Helm install/uninstall. +- Replace `setup-prometheus.sh` behavior with Helm-driven setup. +- Add e2e values file and pin chart version. +- Update port-forward/pod-ready checks that currently assume manual names/labels. + +### Option B: Add ServiceMonitor-based scraping +Use an operator-based chart (typically `kube-prometheus-stack`) because ServiceMonitor discovery is provided by Prometheus Operator, not by standalone Prometheus chart. + +Required changes (in addition to Option A-level migration): +- Install operator CRDs/controllers via Helm in e2e. +- Ensure ServiceMonitor exists for both install paths: + - Helm install smoke path (chart templated ServiceMonitor already exists behind values flag). + - Kustomize e2e path (`make deploy`) requires separate ServiceMonitor manifest. +- Update PromQL tests to avoid hardcoded `job='gitops-reverser-metrics'` assumptions. + +## Pros and Cons + +### Pros of migrating to Helm +- Native `helm upgrade` workflow and revision/rollback history. +- Better consistency with existing e2e dependency setup style (similar to Gitea). +- Centralized values-based configuration. + +### Pros of adding ServiceMonitor path +- Cleaner scrape target management than static scrape config. +- Better alignment with common Kubernetes monitoring practices. +- Reuses chart-level `monitoring.serviceMonitor` support where applicable. + +### Cons / Risks +- Standalone Prometheus chart does **not** provide ServiceMonitor consumption. +- ServiceMonitor requires operator stack, increasing e2e complexity and startup time. +- Existing e2e scripts/tests are coupled to current names/labels/job-name; migration requires non-trivial refactors. +- Adds another chart/version dependency surface in CI and local flows. + +## Decision +We decided to **not do this now**. 
+ +Rationale: +- Current setup is stable and intentionally minimal for e2e signal validation. +- Migration introduces meaningful complexity (especially for ServiceMonitor support). +- The value is primarily operational ergonomics rather than test coverage expansion. + +## Revisit Criteria +Re-open this migration when at least one of the following becomes a priority: +- Need Helm revision/rollback behavior for routine e2e debugging. +- Need ServiceMonitor-driven discovery parity with production environments. +- Need more dynamic scrape target management across multiple test topologies. + +## Next Step (Deferred) +If revisited, prefer a two-phase approach: +1. Phase 1: standalone Prometheus Helm chart migration (no ServiceMonitor). +2. Phase 2: operator-based monitoring stack + ServiceMonitor migration and test query normalization. diff --git a/docs/ci/WINDOWS_DEVCONTAINER_SETUP.md b/docs/ci/WINDOWS_DEVCONTAINER_SETUP.md index cdb260e4..0d469cfc 100644 --- a/docs/ci/WINDOWS_DEVCONTAINER_SETUP.md +++ b/docs/ci/WINDOWS_DEVCONTAINER_SETUP.md @@ -1,148 +1,70 @@ -# Windows DevContainer Setup Guide +# Windows Devcontainer Setup -## Problem +Last updated: 2026-02-13 -On Windows, the devcontainer works differently than on Linux due to how Docker Desktop handles volume mounts: +## Why Windows behaves differently -1. **Container filesystem (`/go`)**: Full Linux filesystem with ACL support βœ… -2. **Mounted workspace (`/workspace`)**: Windows filesystem mounted via Docker, limited Unix permission support ❌ +When the repo is on the Windows filesystem and mounted into a Linux devcontainer, the mounted workspace does not behave exactly like a native Linux filesystem. 
-## Symptoms +Typical effects: -- Cannot write files in `/workspace` directory -- Permission denied errors when running `go mod tidy` or other commands -- ACL commands fail on mounted volumes +- ownership/permission friction on the mounted workspace +- slower file I/O than WSL-native storage +- Linux ACL-based fixes that work under `/go` do not fully apply to the mounted source tree -## Root Cause +## Current repo behavior -Windows uses NTFS/ReFS filesystems which don't support Linux ACLs. When Docker Desktop mounts a Windows directory into a Linux container, it uses a compatibility layer that: -- Simulates Unix permissions -- Cannot support `setfacl` or advanced ACLs -- May have permission mapping issues between Windows and Linux users +This repo uses: -## Solution +- active workspace path: `/workspaces/` +- `remoteUser`: `vscode` +- post-create hook: `.devcontainer/post-create.sh` (called with `${containerWorkspaceFolder}`) -The solution is to ensure the workspace directory has proper ownership and permissions for the `vscode` user, without relying on ACLs. +The post-create script attempts to normalize ownership for the mounted workspace and `/home/vscode` caches. -### Updated Dockerfile Approach +## Recommended setup (Windows) -The Dockerfile already handles this correctly: +1. Use WSL2. +2. Clone the repository inside the Linux filesystem (for example under `~/git/...` in Ubuntu). +3. Open from WSL in VS Code, then reopen in container. -```dockerfile -# In dev stage - ensure vscode user can write to workspace -RUN chown -R vscode:vscode /workspace && \ - chmod -R 755 /workspace -``` - -However, this only affects the **empty** `/workspace` directory in the image. When you mount your actual Windows workspace, it **overrides** this with the Windows filesystem. 
- -### Windows-Specific Configuration - -For Windows users, add this to your `.devcontainer/devcontainer.json`: - -```json -{ - "remoteUser": "vscode", - "containerEnv": { - "WORKSPACE_OWNER": "vscode" - }, - "postCreateCommand": "sudo chown -R vscode:vscode /workspace || true", - "mounts": [ - "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" - ] -} -``` - -The `postCreateCommand` runs **after** the workspace is mounted, ensuring proper ownership. +This is the most reliable and fastest setup. -### Alternative: Use WSL2 Backend - -For the best experience on Windows, use WSL2: - -1. **Install WSL2** with Ubuntu or Debian -2. **Clone the repository inside WSL2** (not in Windows filesystem) -3. **Open in VSCode** using the WSL extension -4. **Use the devcontainer** - it will work exactly like on Linux - -This approach: -- βœ… Full Linux filesystem support -- βœ… ACLs work properly -- βœ… Better performance -- βœ… No permission mapping issues - -### Quick Fix for Existing Setup - -If you're already in the devcontainer and experiencing permission issues: +## Quick checks inside devcontainer ```bash -# Run this inside the devcontainer -sudo chown -R vscode:vscode /workspace -sudo chmod -R 755 /workspace - -# Verify -ls -la /workspace -# Should show vscode:vscode ownership +pwd +ls -ld . +id ``` -## Why `/go` Works But `/workspace` Doesn't +Expected: -| Directory | Location | ACL Support | Why | -|-----------|----------|-------------|-----| -| `/go` | Container filesystem | βœ… Yes | Part of the Linux container image | -| `/workspace` | Mounted from Windows | ❌ No | Windows filesystem mounted via Docker | +- current directory under `/workspaces/` +- effective user is `vscode` +- workspace is writable by `vscode` -The `/go` directory (where Go modules are cached) uses the container's Linux filesystem, so ACLs work perfectly. The `/workspace` directory is mounted from your Windows filesystem, so it doesn't support Linux ACLs. 
+## If workspace is still not writable -## Recommended Setup for Windows Users - -### Option 1: WSL2 (Recommended) +Run: ```bash -# In WSL2 terminal -cd ~ -git clone -cd gitops-reverser -code . # Opens in VSCode with WSL extension -# Then reopen in container +bash .devcontainer/post-create.sh "${containerWorkspaceFolder:-$(pwd)}" ``` -### Option 2: Windows with Post-Create Fix - -Update `.devcontainer/devcontainer.json`: - -```json -{ - "postCreateCommand": "sudo chown -R vscode:vscode /workspace && sudo chmod -R 755 /workspace || true" -} -``` - -### Option 3: Run as Root (Not Recommended) - -Change `remoteUser` to `root` in `devcontainer.json`, but this is not recommended for security reasons. - -## Verification - -After setup, verify permissions: +Then verify: ```bash -# Check workspace ownership -ls -la /workspace -# Should show: drwxr-xr-x vscode vscode - -# Check you can write files -touch /workspace/test.txt -# Should succeed without errors - -# Check Go operations work +touch .permission-check && rm .permission-check go mod tidy -# Should complete without permission errors - -# Clean up test file -rm /workspace/test.txt ``` +## Notes about `/go` vs workspace + +- `/go` is container filesystem and uses Linux semantics; ACL/setgid strategy documented in `GO_MODULE_PERMISSIONS.md` applies there. +- `/workspaces/` is a bind mount from host; behavior depends on host filesystem and Docker Desktop integration. 
+ ## References -- [Docker Desktop WSL2 Backend](https://docs.docker.com/desktop/wsl/) -- [VSCode Remote - WSL](https://code.visualstudio.com/docs/remote/wsl) -- [Docker Volume Permissions](https://docs.docker.com/storage/bind-mounts/#configure-bind-propagation) \ No newline at end of file +- [Docker Desktop + WSL2](https://docs.docker.com/desktop/features/wsl/) +- [VS Code Remote - WSL](https://code.visualstudio.com/docs/remote/wsl) diff --git a/docs/config-kustomize-simplification-findings.md b/docs/config-kustomize-simplification-findings.md new file mode 100644 index 00000000..e1875b05 --- /dev/null +++ b/docs/config-kustomize-simplification-findings.md @@ -0,0 +1,202 @@ +# Config Kustomize Review: What Is Needed vs. What Can Be Simpler + +## Scope reviewed +- `config/default/kustomization.yaml` +- `config/default/manager_webhook_patch.yaml` +- `config/default/cert_metrics_manager_patch.yaml` +- `config/webhook/*` +- `config/certmanager/*` +- `cmd/main.go` +- `test/e2e/*` (especially namespace/cert assumptions) + +## Executive summary +- The certs are **already rendered into `sut`**, not `system`, when deploying via `config/default`. +- For webhook TLS + cert-manager CA injection, some kustomize wiring is genuinely required. +- There is also clear scaffolding/legacy complexity that can be reduced (especially commented replacement blocks and currently-unused metrics cert flow). + +## What is definitely useful / required + +### 1. `manager_webhook_patch.yaml` is required for current runtime behavior +Why: +- Your manager needs `--webhook-cert-path` and `--audit-cert-path`, plus mounted secrets and container ports (`9443`, `9444`). +- Without this patch, the cert secrets are not mounted where `cmd/main.go` expects them. + +References: +- `config/default/manager_webhook_patch.yaml:5` +- `config/default/manager_webhook_patch.yaml:27` +- `cmd/main.go:365` +- `cmd/main.go:522` + +### 2. 
Webhook kustomize namespace/name rewriting is required +Why: +- `ValidatingWebhookConfiguration` is cluster-scoped, but it embeds `clientConfig.service.name/namespace` fields. +- Kustomize needs explicit field specs to rewrite those embedded fields with your prefix/namespace. + +References: +- `config/webhook/kustomizeconfig.yaml:1` +- `config/webhook/webhook_service_name_patch.yaml:1` + +### 3. CA injection annotation wiring for cert-manager is required (if using cert-manager) +Why: +- API server must trust the serving cert for the validating webhook. +- `cert-manager.io/inject-ca-from` annotation on `ValidatingWebhookConfiguration` is the mechanism you currently use. + +References: +- `config/default/kustomization.yaml:187` +- Rendered output includes: `cert-manager.io/inject-ca-from: sut/gitops-reverser-admission-server-cert` + +### 4. `certmanager/kustomizeconfig.yaml` is required with `namePrefix` +Why: +- `namePrefix: gitops-reverser-` renames `Issuer` metadata.name. +- `Certificate.spec.issuerRef.name` must be rewritten to match, otherwise cert issuance breaks. + +References: +- `config/default/kustomization.yaml:9` +- `config/certmanager/kustomizeconfig.yaml:1` + +## What is currently over-complicated / likely removable + +### 1. Huge commented replacement blocks in `config/default/kustomization.yaml` +- Most of the metrics/servicemonitor replacement blocks are commented and unused in your current default/e2e flow. +- Keeping them bloats maintenance and confuses intent. + +Reference: +- `config/default/kustomization.yaml:53` + +### 2. Mutating webhook CA injection replacements appear unused +- You only have `ValidatingWebhookConfiguration` in `config/webhook/manifests.yaml`. +- Replacement entries targeting `MutatingWebhookConfiguration` look like kubebuilder scaffold leftovers. + +References: +- `config/default/kustomization.yaml:218` +- `config/webhook/manifests.yaml:3` + +### 3. 
Metrics certificate is created but not mounted by default +- `metrics-server-cert.yaml` is included in resources. +- But `cert_metrics_manager_patch.yaml` is commented out, so manager does not mount/use `metrics-server-cert` by default. +- E2E Prometheus scrape uses `insecure_skip_verify: true` anyway. + +References: +- `config/certmanager/kustomization.yaml:5` +- `config/default/kustomization.yaml:40` +- `test/e2e/prometheus/deployment.yaml:24` + +## Certificate flow (how certs are used today) + +### Admission webhook cert (`admission-server-cert` secret) +1. `Certificate` resource requests cert for service DNS. +2. cert-manager writes secret `admission-server-cert`. +3. Deployment mounts that secret and passes `--webhook-cert-path`. +4. admission-server listener serves TLS on `9443` using cert watcher. +5. cert-manager injects CA into `ValidatingWebhookConfiguration` annotation target. +6. kube-apiserver calls webhook via Service over TLS and trusts injected CA. + +Key refs: +- `config/certmanager/admission-server-cert.yaml:18` +- `config/default/manager_webhook_patch.yaml:52` +- `config/default/kustomization.yaml:187` + +### Audit ingress cert (`audit-server-cert` secret) +1. Separate `Certificate` resource issues audit cert. +2. Secret `audit-server-cert` is mounted. +3. Manager serves HTTPS audit endpoint on `9444` using `--audit-cert-path`. +4. In e2e, kube-apiserver audit webhook config uses `insecure-skip-tls-verify: true` (so CA pinning is not enforced in test). + +Key refs: +- `config/certmanager/audit-server-cert.yaml:17` +- `config/default/manager_webhook_patch.yaml:60` +- `test/e2e/kind/audit/webhook-config.yaml:14` + +### Metrics cert (`metrics-server-cert` secret) +- Issued by cert-manager, but only actively used if you also enable metrics cert patch and corresponding monitor TLS config. 
+ +Refs: +- `config/certmanager/metrics-server-cert.yaml:20` +- `config/default/cert_metrics_manager_patch.yaml:12` +- `config/prometheus/monitor_tls_patch.yaml:1` + +## Your namespace question: `sut` vs `system` + +Short answer: +- `system` is **not required** for kube-api webhooks. +- Certs should live in the same namespace as the workload/service that uses them. +- In your current default deployment, that namespace is effectively `sut`. + +Important detail: +- Source files still show `namespace: system` in some places, but `config/default/kustomization.yaml` applies `namespace: sut` globally. +- Rendered manifests confirm certs, issuer, service, deployment are in `sut`. + +## Recommended simplification plan (test-focused) + +### Phase 1 (safe cleanup, behavior unchanged) +1. Remove large commented blocks in `config/default/kustomization.yaml` (keep only active replacements). +2. Remove unused mutating-webhook replacement entries if you do not plan mutating webhooks. +3. Add a short comment block at top: "test profile: single service + validating webhook + audit ingress". + +### Phase 2 (decide metrics cert strategy) +Choose one: +1. Keep metrics cert end-to-end: enable `cert_metrics_manager_patch.yaml` and proper monitor TLS usage. +2. Or simplify: remove `metrics-server-cert.yaml` from `config/certmanager/kustomization.yaml` and stop waiting for `metrics-server-cert` in e2e helper. + +Given current e2e (`insecure_skip_verify: true`), option 2 is simpler and consistent. + +### Phase 3 (optional bigger simplification) +If these manifests are truly test-only and namespace/prefix are fixed: +1. Replace dynamic cert DNS replacements with explicit static DNS names. +2. Replace dynamic `inject-ca-from` replacements with static annotation value. + +Tradeoff: +- Less kustomize complexity, but less reusable/generic. + +## Extra note on fixed ClusterIP +- The fixed service ClusterIP (`10.96.200.200`) is coupled to Kind audit webhook bootstrap (API server before DNS). 
+- Keep it if you depend on that startup behavior in e2e. + +Refs: +- `config/webhook/service.yaml:10` +- `test/e2e/kind/audit/webhook-config.yaml:12` + +## Bold strategy (essence-first): freeze rendered output, delete most kustomize machinery + +### What you mean in practice +1. Render today’s desired install profile once (`kustomize build config/default`). +2. Split that output into plain, human-owned files by concern (for example `namespace.yaml`, `crds.yaml`, `rbac.yaml`, `deployment.yaml`, `service.yaml`, `certificates.yaml`, `webhook.yaml`). +3. Remove the current deep transformer/replacement structure from `config/`. +4. Keep either: + - no kustomize at all (apply a folder of plain YAML in order), or + - one tiny `kustomization.yaml` that just lists resources with zero patches/replacements. + +This is a valid strategy if your goal is readability and low cognitive overhead over portability. + +### Why this can be good +1. You get back to essentials: explicit manifests, no hidden transformations. +2. Refactoring confidence improves because object names/refs are visible directly. +3. New contributors can reason about install behavior without learning kustomize tricks. +4. Debugging production/test drift is easier because rendered state is source of truth. + +### What you lose +1. Easy rebasing of namespace/namePrefix/env variants. +2. Automatic reference rewriting (`Issuer` name, webhook service references, CA injection path assembly). +3. Scaffold compatibility with future kubebuilder regeneration patterns. + +### Where this can hurt later +1. If you later need a second profile (for example non-e2e namespace or no fixed ClusterIP), you will duplicate YAML or re-introduce templating. +2. If cert naming/service naming changes, all references must be manually updated everywhere. +3. Large CRD/regenerated sections can become noisy unless you keep strict ownership boundaries. + +### Guardrails to keep this maintainable +1. 
Declare one supported raw-manifest profile explicitly (for example: `sut` test profile). +2. Keep clear file boundaries: + - `config/raw/00-namespace.yaml` + - `config/raw/10-crds.yaml` + - `config/raw/20-rbac.yaml` + - `config/raw/30-manager.yaml` + - `config/raw/40-service.yaml` + - `config/raw/50-certificates.yaml` + - `config/raw/60-webhook.yaml` +3. If you keep minimal kustomize, allow only `resources:` entries (no `patches`, no `replacements`, no `configurations`). +4. Add a lightweight validation target (for example `kubectl apply --dry-run=server -f config/raw` in CI). + +### Recommendation for your repo +- If these manifests are primarily for e2e and internal testing, this essence-first model is reasonable and likely worth it. +- If you want `config/` to be a broadly reusable install path, keep some kustomize composition and instead prune it aggressively (not fully remove it). diff --git a/docs/design/best-practices-webhook-ingress.md b/docs/design/best-practices-webhook-ingress.md new file mode 100644 index 00000000..de387415 --- /dev/null +++ b/docs/design/best-practices-webhook-ingress.md @@ -0,0 +1,213 @@ +1) Mutating webhook from a Kubernetes Service: minimal settings to support +A. Listener + routing + +listenAddress / port (default 8443) + +path (e.g. /mutate), and optionally multiple paths if you’ll have multiple webhooks + +readTimeout / writeTimeout / idleTimeout + +maxRequestBodyBytes (defensive; AdmissionReview can be big with certain objects) + +B. TLS (this is non-negotiable in real clusters) + +Kubernetes expects HTTPS for webhooks (service or URL). 
Minimally support: + +Provide TLS cert + key + +Either via: tls.secretName (mounted secret) + +Or direct file paths (less β€œKubernetes-y”, but useful for dev) + +Provide CA bundle for the webhook configuration + +In practice you’ll set caBundle on the MutatingWebhookConfiguration (or let cert-manager inject it) + +Best practice: integrate with cert-manager and expose: + +certManager.enabled (bool) + +certManager.issuerRef (name/kind/group) + +dnsNames (at least service.namespace.svc and service.namespace.svc.cluster.local) + +rotation: rely on cert-manager renewal; your pod must reload certs (or restart on secret change) + +C. Webhook registration (what you control via config/helm values) + +Even if you generate the MutatingWebhookConfiguration from code/helm, you want these as configurable knobs: + +Per webhook: + +failurePolicy: Fail vs Ignore + +Default recommendation: Fail for security/consistency webhooks; Ignore only if mutation is β€œnice to have” + +timeoutSeconds: keep low (1–5s). Default 2–3s. + +sideEffects: usually None (and mean it) + +admissionReviewVersions: support v1 (and accept v1beta1 only if you must) + +matchPolicy: typically Equivalent + +reinvocationPolicy: consider IfNeeded if you mutate fields other mutators might touch + +Selectors + +namespaceSelector (exclude system namespaces by default) + +objectSelector (optional but great for opt-in via label) + +Rules + +resources + operations you mutate (keep tight) + +scope: cluster vs namespaced where relevant + +D. Runtime safety knobs + +Expose: + +concurrency (max in-flight) + +rateLimit (optional but helpful under thundering herd) + +metrics (Prometheus) + request duration histogram + +pprof optional (dev only) + +logLevel with request IDs and admission UID + +E. Leader election (only if you have shared mutable state) + +For pure stateless mutation, you can run multiple replicas with no leader election. 
+If you rely on a single writer (e.g., CRD-backed shared cache warmup, or you do coordinated external writes), support: + +leaderElection.enabled + +lease namespace/name + +2) Mutating webhook best practices (the stuff that prevents outages) + +Correctness & determinism + +Make patches deterministic (same input β†’ same output). + +Be idempotent (if called twice, you don’t double-apply). + +Respect dryRun (don’t create external side effects). + +Don’t depend on β€œlive GET” calls in the hot path unless cached; API calls add latency and can deadlock during API stress. + +Performance + +Keep p99 latency low; webhooks are on the API request path. + +Prefer fast local validation/mutation + cached lookups. + +Set tight timeoutSeconds and tune server timeouts accordingly. + +Safety + +Default namespaceSelector to exclude kube-system, kube-public, kube-node-lease, and your own operator namespace until you explicitly need them. + +Use objectSelector to allow opt-in (label) for risky mutations. + +Use failurePolicy=Fail only when you’re confident in HA + readiness + rollout strategy. + +Rollout strategy + +Run at least 2 replicas (or more, depending on API QPS). + +Use a PodDisruptionBudget. + +Ensure readinessProbe only goes ready when: + +certs are loaded + +any required caches are warm (if you depend on them) + +Prefer β€œversioned” webhook names/paths when doing breaking changes. + +Observability + +Log: admission UID, kind, namespace/name, userInfo, decision, latency + +Metrics: requests, rejections, patch size, errors, timeouts + +3) Should you β€œsupport the same settings” for audit webhook handling? + +Some overlap, yes (TLS/HA/observability), but don’t treat them as the same product surface. Audit has very different operational requirements. 
+ +What overlaps (you should support in both) + +HTTPS listener, cert management, rotation + +AuthN (ideally mTLS) and authorization/allowlisting + +Timeouts + max body size + +Concurrency limits and metrics + +What’s different (audit needs extra settings) + +Audit webhook backends can get a lot of traffic and the API server will retry under some failure modes, but you still need to assume: + +bursts + +duplicates + +out-of-order delivery + +occasional loss depending on audit config and backpressure + +So minimally for audit ingestion, add: + +queue.enabled + queue.size + +batching (optional, but very useful downstream) + +durability choice: + +memory queue (simple, lossy on restart) + +persistent queue (disk/DB/Kafka/etc.) + +Backpressure behavior + +what happens when full: drop / block / shed by priority + +Deduplication keying (best-effort): use audit event IDs if present + +Separate endpoint / separate Deployment strongly recommended + +Auth for audit + +For the audit webhook backend, the API server can be configured with a kubeconfig to talk to your endpoint, which makes mTLS client cert auth a clean approach. If you already have a public wildcard cert, that helps with server identity, but client auth is what prevents random in-cluster callers from spamming your audit ingest. + +Recommendation: + +Admission webhook: rely on in-cluster service + TLS + CA bundle (standard) + +Audit webhook: mTLS (client certs) and strict allowlisting/rate limits + +Practical recommendation on architecture + +Keep admission and audit as separate handlers, ideally separate deployments. + +Admission: optimized for latency + correctness + +Audit: optimized for throughput + buffering + durability + +Share libraries (TLS, metrics, logging), but do not share the same scaling knobs or failure modes. 
+ +If you want a simple β€œminimal config surface” that still scales, expose two top-level blocks: + +admissionWebhooks: (tls, selectors, failurePolicy, timeouts, concurrency) + +auditIngest: (tls, authn, queue/durability, backpressure, concurrency) + +That’s the line where you stay sane when traffic grows. + +If you want, paste your current helm values / flags structure and I’ll suggest a clean config schema (what should be values vs generated defaults) without blowing up the number of knobs. \ No newline at end of file diff --git a/docs/design/https-server-alignment-and-service-plan.md b/docs/design/https-server-alignment-and-service-plan.md new file mode 100644 index 00000000..460e5c87 --- /dev/null +++ b/docs/design/https-server-alignment-and-service-plan.md @@ -0,0 +1,268 @@ +# HTTPS Server Alignment And Service Plan + +## Goal + +Improve consistency across the three HTTPS surfaces: + +1. admission-server +2. audit ingress server +3. metrics-server + +With Service topology now simplified after removing the leader-only Service. + +## Current Operating Mode + +Run with **a single pod** for the current phase. + +## Single-Replica Checklist + +- [ ] `replicaCount: 1` is the chart default for this phase. +- [ ] HA-specific behavior is disabled/ignored by default. +- [ ] Leader-only Service has been removed from active topology. +- [ ] HA reintroduction is explicitly deferred to the planned rewrite. +- [ ] Service exposure is consolidated to a single Service named only `{{ include "gitops-reverser.fullname" . }}`. + +## Current Constraints + +- Leader-only routing is no longer part of active Service topology. +- Kubernetes Service selectors are per-Service, not per-port. + +## Decision On "One Service, Three Ports" + +### Can we do it technically? + +Yes, Kubernetes supports one Service exposing multiple ports. + +### Should we do it here? + +Yes, this is now the active direction for the single-pod phase. 
+ +### Recommended topology (single pod) + +Use **one Service with three ports**: + +1. admission HTTPS +2. audit HTTPS +3. metrics HTTPS + +This minimizes moving parts for the interim single-pod phase. + +### Service naming decision + +Use a single Service with the base release fullname only: + +- target name: `{{ include "gitops-reverser.fullname" . }}` +- avoid suffixes such as `-webhook`, `-audit`, `-metrics` for the primary Service +- keep distinct named ports for routing/monitoring clarity + +## Single Service Necessity Analysis + +### Is a single Service still needed? + +Yes for this phase, and there is no strong technical reason to keep separate Services right now. + +### Why this still makes sense now + +- Current services all select the same controller Pod labels, so they do not provide workload isolation. +- Single-replica mode removes the previous leader-vs-all selector split that justified separate routing. +- Operationally, one stable Service name simplifies client configuration and day-2 debugging. +- The design already requires one endpoint surface with different ports, which matches Service named ports well. + +### What currently depends on split service names (implementation impact) + +- `charts/gitops-reverser/templates/validating-webhook.yaml` references the `-webhook` service name. +- `charts/gitops-reverser/templates/certificates.yaml` SANs include `-webhook` and `-audit` DNS names. +- `charts/gitops-reverser/templates/servicemonitor.yaml` and e2e checks currently expect a dedicated metrics service identity. +- `test/e2e/e2e_test.go` asserts `gitops-reverser-webhook-service` and `gitops-reverser-audit-webhook-service`. + +### Conclusion + +- Keep the plan to converge to **one Service**. +- Use one canonical Service name only (release fullname). +- Keep multiple ports; do not keep multiple Services unless HA/service-isolation requirements return. 
+ +## No-Compatibility Decision + +For this refactor, use a direct switch without migration compatibility measures. + +Rationale: + +- The old settings layout is already causing conceptual drift (`webhook.server`, `auditIngress`, `controllerManager.metrics`). +- A compatibility layer would preserve that drift and increase implementation/testing complexity. +- The project is intentionally converging on one topology and one config model for this phase. +- A hard cut keeps behavior deterministic and easier to reason about during rapid iteration. + +## Alignment Plan + +## 1. Unify server config model + +- Introduce a shared internal server config shape for: + - bind address/port + - cert path/name/key + - read/write/idle timeout + - TLS enabled/insecure mode guard +- Define baseline defaults in source code, not in Helm values. +- Map flags into this model for all three servers. +- Keep one parser/defaulting path for all listeners (no per-listener parsing forks). + +## 2. Unify TLS/cert watcher bootstrap + +- Add one helper that: + - validates cert config + - creates optional certwatcher + - wires `GetCertificate` + - applies shared TLS defaults (minimum version + HTTP/2 policy) +- Use same helper for metrics, admission, and audit. +- Keep TLS-off behavior in the same helper path (no duplicate conditional logic per server). + +## 3. Unify server lifecycle wiring + +- Keep all servers manager-managed. +- Reuse one runnable pattern for startup/shutdown + timeout. +- Standardize startup/shutdown logs and error paths. +- Build servers through one reusable constructor/builder function that accepts a typed server config. + +## 4. Align Helm values and args + +- Replace legacy split keys with one canonical settings structure. +- Ensure timeout and cert naming is consistent across all three listeners. + +## 5. Simplify deployment model now + +- Default chart/config to single replica. +- Keep leader-only Service removed from this phase. 
+- Keep optional leader-election code path only if low-cost; otherwise disable in defaults. + +## 6. Service simplification (single service) + +- Merge admission, audit, and metrics onto one Service with three target ports. +- Name the Service as release fullname only (no role suffix). +- Update validating webhook client config, cert SANs, ServiceMonitor selector/port, and docs accordingly. + +## 7. Tests and rollout checks + +- Unit: + - shared server config parsing + - shared TLS helper behavior + - service template rendering for single Service with three ports +- E2E: + - admission and audit reachable on same Service + - metrics reachable on same Service +- Validation sequence: + - `make build` + - `make lint` + - `make test-e2e` + +## Target Settings Design (Markdown-Only) + +This section defines the intended end-state configuration model without implementing it yet. + +### End-State Overview + +The chart should converge on: + +- One Pod replica by default for this phase. +- One Service exposing three HTTPS ports (admission, audit, metrics). +- One shared server settings shape reused by all three listeners. +- Per-surface overrides only where behavior is genuinely different. +- Per-server TLS can be enabled/disabled independently. +- Defaults are centralized in source code; Helm values provide explicit overrides only. + +### Proposed Helm Values Shape + +```yaml +replicaCount: 1 + +network: + service: + enabled: true + name: "" # defaults to {{ include "gitops-reverser.fullname" . 
}}
+    type: ClusterIP
+    ports:
+      admission: 9443
+      audit: 8444
+      metrics: 8443
+
+servers:
+  admission:
+    enabled: true
+    bindAddress: :9443
+    enableHTTP2: false # optional override
+    timeouts: {} # optional override
+    tls:
+      enabled: true # may be set false for local/dev scenarios
+      secretName: "" # optional if cert-manager manages mount/secret
+
+  audit:
+    enabled: true
+    bindAddress: :9444
+    maxRequestBodyBytes: 10485760
+    enableHTTP2: false
+    timeouts: {}
+    tls:
+      enabled: true
+      secretName: ""
+
+  metrics:
+    enabled: true
+    bindAddress: :8080
+    enableHTTP2: false
+    timeouts: {}
+    tls:
+      enabled: true
+      secretName: ""
+```
+If `servers.<surface>.tls.enabled` (or timeout/http2 overrides) is omitted, source-code defaults apply.
+
+### Settings Responsibilities
+
+| Area | Purpose | Notes |
+|---|---|---|
+| `servers.admission` | Admission-specific listener settings | Keeps webhook behavior settings separate under `webhook.validating` |
+| `servers.audit` | Audit ingress listener settings | Retains audit payload controls like `maxRequestBodyBytes` |
+| `servers.metrics` | Metrics listener settings | Supports secure metrics endpoint consistently, but can be intentionally downgraded per environment |
+| `network.service` | Cluster Service exposure | Owns service name and externally reachable ports (not container bind ports) |
+| Source code defaults | Runtime baseline behavior | Holds canonical defaults for timeouts, TLS baseline, and HTTP/2 policy |
+
+### Key Mapping (Current -> Target, No Compatibility Layer)
+
+| Current key | Target key |
+|---|---|
+| `webhook.server.port` | `servers.admission.bindAddress` |
+| `webhook.server.certPath/certName/certKey` | `servers.admission.tls.*` (or source-code defaults) |
+| `auditIngress.port` | `servers.audit.bindAddress` |
+| `auditIngress.tls.*` | `servers.audit.tls.*` |
+| `auditIngress.timeouts.*` | `servers.audit.timeouts.*` |
+| `controllerManager.metrics.bindAddress` | `servers.metrics.bindAddress` |
+| 
`controllerManager.enableHTTP2` | `servers.<surface>.enableHTTP2` or source-code default |
+
+### CLI Args/Runtime Mapping Direction
+
+Desired runtime model:
+
+- Parse Helm values into one internal server settings struct per surface.
+- Apply shared defaulting/validation once.
+- Generate listener-specific runtime config from the same code path.
+- Construct `http.Server` instances via shared functions (for example `buildHTTPServer`, `buildTLSConfig`, `buildServerRunnable`) instead of per-listener copies.
+
+Resulting behavior goals:
+
+- Same TLS validation rules for all listeners.
+- Same timeout parsing and error messages for all listeners.
+- Same startup/shutdown lifecycle pattern for all listeners.
+- If TLS is disabled for a listener, skip cert watcher/bootstrap for that listener and run plain HTTP on its bind address.
+- No triple repetition of server setup code for admission/audit/metrics.
+
+### TLS Disable Guardrails
+
+- Keep TLS enabled by default for all listeners.
+- Treat TLS-disabled mode as non-production convenience for local/dev/test.
+- Emit a startup warning whenever any listener runs with TLS disabled.
+- Admission/audit TLS disable should be opt-in only and clearly visible in rendered values.
+
+### Rollout Notes For Settings Refactor
+
+- Use a clean-cut switch to the new settings model.
+- Do not ship compatibility aliases or legacy key fallbacks.
+- Update chart docs/examples and templates in the same change set.
+- Fail fast on invalid/unknown legacy settings to avoid ambiguous runtime behavior.
diff --git a/docs/design/quickstart-flow-e2e-strategy.md b/docs/design/quickstart-flow-e2e-strategy.md new file mode 100644 index 00000000..0e94b7b2 --- /dev/null +++ b/docs/design/quickstart-flow-e2e-strategy.md @@ -0,0 +1,184 @@ +# Quickstart Flow E2E Strategy + +## Why this document exists + +We currently validate core behavior with: + +- unit/integration tests (`make test`) +- full e2e tests (`make test-e2e`) with Kind + Gitea + Prometheus + +But we do **not** have a dedicated test focused on "new user install paths": + +- install from the raw/basic Helm chart +- install from generated `dist/install.yaml` (the path shown in quickstart) + +This document defines how to test those flows in CI and whether Gitea should be part of that validation. + +## Goals + +- Ensure first-time installation paths do not regress. +- Catch packaging/rendering/rollout failures before release. +- Keep runtime and maintenance cost reasonable. + +## Non-goals + +- Replacing existing full behavior e2e coverage. +- Re-testing every reconciliation scenario in this new flow. + +## Current gaps + +- Existing e2e deploys using `make install` + `make deploy` (kustomize path), not Helm chart install. +- `dist/install.yaml` is generated in release pipeline, but not validated as an install/rollout path in e2e. +- Quickstart user journey is not directly tested end-to-end. + +## Should we add Gitea here? + +Short answer: **yes, but as a second phase**. + +### Option A: Install smoke only (no Gitea) + +What it tests: + +- Kind cluster bootstrap +- cert-manager dependency +- Helm install from `charts/gitops-reverser` +- `kubectl apply -f dist/install.yaml` +- controller rollout readiness +- CRDs and webhook objects present + +Pros: + +- Fast, stable, low maintenance. +- Directly validates packaging and install UX. +- Best signal per minute for quickstart regressions. + +Cons: + +- Does not prove end-to-end "create resource -> commit appears in git" in this specific path. 
+
+### Option B: Full quickstart flow with Gitea
+
+What it adds:
+
+- Create credentials secret
+- Apply minimal `GitProvider` + `GitTarget` + `WatchRule`
+- Create a ConfigMap
+- Verify resulting commit/file in Git repo (Gitea)
+
+Pros:
+
+- Closest possible validation of "new user success" narrative.
+- Strong confidence that install path + runtime behavior work together.
+
+Cons:
+
+- Higher runtime and flakiness surface.
+- More setup/teardown complexity.
+- Duplicates part of existing heavy e2e coverage.
+
+### Option C: Commit to a dedicated GitHub repository
+
+What it adds:
+
+- Use a purpose-built GitHub repository for e2e output validation.
+- Create short-lived branch per run (for example `e2e/<run-id>`).
+- Configure `GitProvider` credentials for GitHub.
+- Apply minimal quickstart CRs and assert commit/file appears in that branch.
+
+Pros:
+
+- Highest fidelity to real user setup from quickstart perspective.
+- Validates network/auth/provider behavior against actual GitHub.
+- Catches provider-specific issues that local Gitea cannot.
+
+Cons:
+
+- More operational overhead (token/key rotation, branch cleanup, rate limits).
+- Higher flakiness due to external service dependency and internet variability.
+- Secret handling is stricter in CI (especially for PRs from forks).
+
+Security/ops considerations:
+
+- Use a dedicated low-privilege bot account and repo.
+- Scope credentials to one repo and minimal permissions.
+- Never run secret-bearing jobs for untrusted fork PRs.
+- Auto-clean old e2e branches with retention policy.
+
+## Recommendation
+
+Adopt a **three-layer strategy**:
+
+1. **Layer 1 (required in CI): install smoke tests without Gitea**
+2. **Layer 2 (targeted quickstart journey with Gitea): one focused scenario**
+3. **Layer 3 (external reality check): periodic quickstart run against dedicated GitHub repo**
+
+This balances reliability and confidence:
+
+- Layer 1 catches most breakages early (chart, manifest, certs, webhook, rollout). 
+- Layer 2 ensures we do not disappoint new users on the full "it commits to git" story. +- Layer 3 validates real hosted-provider behavior without making every PR depend on external systems. + +## Proposed test matrix + +### Layer 1: `install-smoke` + +Run on every PR: + +- Scenario 1: Helm chart install (raw/basic values) +- Scenario 2: Generated `dist/install.yaml` install + +Assertions: + +- Namespace/resources created +- Deployment available and pod ready +- CRDs installed +- Validating webhook configuration exists + +### Layer 2: `quickstart-e2e` + +Run on main and/or nightly at first (can be promoted to PR later): + +- Start fresh Kind cluster +- Install via `dist/install.yaml` (quickstart parity) +- Bring up lightweight local Git endpoint (Gitea as today) +- Apply minimal quickstart CRs +- Create test ConfigMap +- Assert git repo contains expected YAML/commit + +### Layer 3: `quickstart-e2e-github` + +Run on schedule (nightly) and on protected branches only: + +- Start fresh Kind cluster +- Install via `dist/install.yaml` +- Configure GitHub credentials from CI secrets +- Apply minimal quickstart CRs against dedicated e2e repo +- Create test ConfigMap +- Assert commit/file appears in dedicated branch +- Optionally delete branch at end (or rely on periodic cleanup job) + +## CI integration proposal + +- Add dedicated Make targets: + - `test-e2e-install-helm` + - `test-e2e-install-manifest` + - `test-e2e-quickstart` (includes Gitea) + - `test-e2e-quickstart-github` (external provider validation) +- Add a new workflow job for Layer 1 and keep it mandatory. +- Add Layer 2 as non-blocking initially; promote to required once stable. +- Add Layer 3 as scheduled/protected-branch only (non-blocking for PRs). + +## Success criteria + +- PRs fail if Helm/basic install or `install.yaml` install cannot roll out cleanly. +- Quickstart flow test validates an actual commit path at least on main/nightly. 
+- External GitHub quickstart path is exercised regularly and alerts on failures. +- Runtime overhead remains acceptable and failures are actionable. + +## Rollout plan + +1. Implement Layer 1 install smoke tests first. +2. Land CI wiring and make it required. +3. Implement Layer 2 quickstart-with-Gitea scenario. +4. Observe flakiness for 1-2 weeks; then decide if Layer 2 should be required on PRs. +5. Add Layer 3 scheduled GitHub-repo validation with strict secret handling. diff --git a/internal/leader/leader.go b/internal/leader/leader.go deleted file mode 100644 index d106306d..00000000 --- a/internal/leader/leader.go +++ /dev/null @@ -1,137 +0,0 @@ -/* -SPDX-License-Identifier: Apache-2.0 - -Copyright 2025 ConfigButler - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* -Package leader provides leader election functionality for the GitOps Reverser controller. -It manages pod labeling to identify the active leader instance in a multi-replica deployment. -*/ -package leader - -// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;update;patch - -import ( - "context" - "os" - - "github.com/go-logr/logr" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -const ( - leaderLabelKey = "role" - leaderLabelValue = "leader" -) - -// PodLabeler is a Runnable that adds a label to the pod when it becomes the leader -// and removes it when it stops being the leader. 
-// It implements the LeaderElectionRunnable interface so it only runs on the leader. -type PodLabeler struct { - Client client.Client - Log logr.Logger - PodName string - Namespace string -} - -// NeedLeaderElection implements the LeaderElectionRunnable interface. -// This ensures the PodLabeler only runs on the elected leader. -func (p *PodLabeler) NeedLeaderElection() bool { - return true -} - -// Start adds the leader label to the pod and blocks until the context is canceled. -// This method is only called on the elected leader pod when NeedLeaderElection returns true. -func (p *PodLabeler) Start(ctx context.Context) error { - log := p.Log.WithValues("pod", p.PodName, "namespace", p.Namespace) - log.Info("🎯 PodLabeler.Start() called - This pod is the leader, adding leader label.") - - if err := p.addLabel(ctx, log); err != nil { - log.Error(err, "❌ Failed to add leader label") - return err - } - - log.Info("βœ… Leader label added successfully") - - // The context is canceled when the manager stops. - <-ctx.Done() - - log.Info("Leader is shutting down, removing leader label.") - // Use a new context for the cleanup operation. - if err := p.removeLabel(context.Background(), log); err != nil { - log.Error(err, "failed to remove leader label on shutdown") - // Don't return error on shutdown, just log it. 
- } - return nil -} - -func (p *PodLabeler) addLabel(ctx context.Context, log logr.Logger) error { - pod, err := p.getPod(ctx) - if err != nil { - return err - } - - if pod.Labels == nil { - pod.Labels = make(map[string]string) - } - - if val, ok := pod.Labels[leaderLabelKey]; ok && val == leaderLabelValue { - log.Info("Pod already has leader label") - return nil - } - - pod.Labels[leaderLabelKey] = leaderLabelValue - return p.Client.Update(ctx, pod) -} - -func (p *PodLabeler) removeLabel(ctx context.Context, log logr.Logger) error { - pod, err := p.getPod(ctx) - if err != nil { - if errors.IsNotFound(err) { - log.Info("Pod not found, cannot remove leader label.") - return nil - } - return err - } - - if _, ok := pod.Labels[leaderLabelKey]; !ok { - log.Info("Pod does not have leader label, nothing to remove.") - return nil - } - - delete(pod.Labels, leaderLabelKey) - return p.Client.Update(ctx, pod) -} - -func (p *PodLabeler) getPod(ctx context.Context) (*corev1.Pod, error) { - pod := &corev1.Pod{} - key := types.NamespacedName{Name: p.PodName, Namespace: p.Namespace} - err := p.Client.Get(ctx, key, pod) - return pod, err -} - -// GetPodName returns the pod name from the POD_NAME environment variable. -func GetPodName() string { - return os.Getenv("POD_NAME") -} - -// GetPodNamespace returns the pod namespace from the POD_NAMESPACE environment variable. -func GetPodNamespace() string { - return os.Getenv("POD_NAMESPACE") -} diff --git a/internal/leader/leader_test.go b/internal/leader/leader_test.go deleted file mode 100644 index 40a2d7e8..00000000 --- a/internal/leader/leader_test.go +++ /dev/null @@ -1,688 +0,0 @@ -/* -SPDX-License-Identifier: Apache-2.0 - -Copyright 2025 ConfigButler - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package leader - -import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - "sigs.k8s.io/controller-runtime/pkg/log/zap" -) - -func TestPodLabeler_Start_AddLabel(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{}, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). 
- Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - // Create a context that will be canceled after a short time - ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) - defer cancel() - - // Execute - err = labeler.Start(ctx) - require.NoError(t, err) - - // Verify the label was added - updatedPod := &corev1.Pod{} - err = client.Get(context.Background(), types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - - // The label should have been added and then removed during shutdown - // Since we can't easily test the intermediate state, we verify the cleanup happened - assert.NotContains(t, updatedPod.Labels, leaderLabelKey) -} - -func TestPodLabeler_Start_PodNotFound(t *testing.T) { - // Setup - no pod in the fake client - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - client := fake.NewClientBuilder().WithScheme(scheme).Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "non-existent-pod", - Namespace: "test-namespace", - } - - ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) - defer cancel() - - // Execute - err = labeler.Start(ctx) - require.Error(t, err) - assert.True(t, errors.IsNotFound(err)) -} - -func TestPodLabeler_addLabel_NewLabel(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{}, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). 
- Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - // Execute - ctx := context.Background() - err = labeler.addLabel(ctx, logger) - require.NoError(t, err) - - // Verify - updatedPod := &corev1.Pod{} - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - - assert.Equal(t, leaderLabelValue, updatedPod.Labels[leaderLabelKey]) -} - -func TestPodLabeler_addLabel_ExistingLabel(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{ - leaderLabelKey: leaderLabelValue, // Already has the leader label - }, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). - Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - // Execute - ctx := context.Background() - err = labeler.addLabel(ctx, logger) - require.NoError(t, err) - - // Verify the label is still there (no error should occur) - updatedPod := &corev1.Pod{} - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - - assert.Equal(t, leaderLabelValue, updatedPod.Labels[leaderLabelKey]) -} - -func TestPodLabeler_addLabel_NilLabels(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: nil, // Nil labels map - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). 
- Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - // Execute - ctx := context.Background() - err = labeler.addLabel(ctx, logger) - require.NoError(t, err) - - // Verify - updatedPod := &corev1.Pod{} - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - - assert.NotNil(t, updatedPod.Labels) - assert.Equal(t, leaderLabelValue, updatedPod.Labels[leaderLabelKey]) -} - -func TestPodLabeler_removeLabel_ExistingLabel(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{ - leaderLabelKey: leaderLabelValue, - "other-label": "other-value", - }, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). 
- Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - // Execute - ctx := context.Background() - err = labeler.removeLabel(ctx, logger) - require.NoError(t, err) - - // Verify - updatedPod := &corev1.Pod{} - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - - assert.NotContains(t, updatedPod.Labels, leaderLabelKey) - assert.Equal(t, "other-value", updatedPod.Labels["other-label"]) // Other labels preserved -} - -func TestPodLabeler_removeLabel_NoLabel(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{ - "other-label": "other-value", - }, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). 
- Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - // Execute - ctx := context.Background() - err = labeler.removeLabel(ctx, logger) - require.NoError(t, err) - - // Verify - should be no-op - updatedPod := &corev1.Pod{} - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - - assert.NotContains(t, updatedPod.Labels, leaderLabelKey) - assert.Equal(t, "other-value", updatedPod.Labels["other-label"]) -} - -func TestPodLabeler_removeLabel_PodNotFound(t *testing.T) { - // Setup - no pod in the fake client - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - client := fake.NewClientBuilder().WithScheme(scheme).Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "non-existent-pod", - Namespace: "test-namespace", - } - - // Execute - ctx := context.Background() - err = labeler.removeLabel(ctx, logger) - require.NoError(t, err) // Should not error when pod is not found during cleanup -} - -func TestPodLabeler_getPod_Success(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - expectedPod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{ - "test-label": "test-value", - }, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(expectedPod). 
- Build() - - labeler := &PodLabeler{ - Client: client, - PodName: "test-pod", - Namespace: "test-namespace", - } - - // Execute - ctx := context.Background() - pod, err := labeler.getPod(ctx) - require.NoError(t, err) - assert.NotNil(t, pod) - assert.Equal(t, "test-pod", pod.Name) - assert.Equal(t, "test-namespace", pod.Namespace) - assert.Equal(t, "test-value", pod.Labels["test-label"]) -} - -func TestPodLabeler_getPod_NotFound(t *testing.T) { - // Setup - no pod in the fake client - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - client := fake.NewClientBuilder().WithScheme(scheme).Build() - - labeler := &PodLabeler{ - Client: client, - PodName: "non-existent-pod", - Namespace: "test-namespace", - } - - // Execute - ctx := context.Background() - pod, err := labeler.getPod(ctx) - require.Error(t, err) - assert.True(t, errors.IsNotFound(err)) - assert.NotNil(t, pod) // getPod always returns a Pod object, even when not found -} - -func TestGetPodName(t *testing.T) { - // Test with environment variable set - t.Setenv("POD_NAME", "test-pod-name") - - podName := GetPodName() - assert.Equal(t, "test-pod-name", podName) -} - -func TestGetPodName_Empty(t *testing.T) { - // Test with environment variable unset - t.Setenv("POD_NAME", "") - - podName := GetPodName() - assert.Empty(t, podName) -} - -func TestGetPodNamespace(t *testing.T) { - // Test with environment variable set - t.Setenv("POD_NAMESPACE", "test-namespace") - - podNamespace := GetPodNamespace() - assert.Equal(t, "test-namespace", podNamespace) -} - -func TestGetPodNamespace_Empty(t *testing.T) { - // Test with environment variable unset - t.Setenv("POD_NAMESPACE", "") - - podNamespace := GetPodNamespace() - assert.Empty(t, podNamespace) -} - -func TestLeaderLabelConstants(t *testing.T) { - // Verify the constants are set correctly - assert.Equal(t, "role", leaderLabelKey) - assert.Equal(t, "leader", leaderLabelValue) -} - -func 
TestPodLabeler_ConcurrentOperations(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{}, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). - Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - ctx := context.Background() - - // Execute concurrent add operations - done := make(chan error, 2) - - go func() { - done <- labeler.addLabel(ctx, logger) - }() - - go func() { - done <- labeler.addLabel(ctx, logger) - }() - - // Wait for both operations to complete - err1 := <-done - err2 := <-done - - // Both should succeed (or at least one should succeed) - assert.True(t, err1 == nil || err2 == nil, "At least one add operation should succeed") - - // Verify final state - updatedPod := &corev1.Pod{} - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - - assert.Equal(t, leaderLabelValue, updatedPod.Labels[leaderLabelKey]) -} - -func TestPodLabeler_AddRemoveCycle(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{}, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). 
- Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - ctx := context.Background() - - // Add label - err = labeler.addLabel(ctx, logger) - require.NoError(t, err) - - // Verify label was added - updatedPod := &corev1.Pod{} - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - assert.Equal(t, leaderLabelValue, updatedPod.Labels[leaderLabelKey]) - - // Remove label - err = labeler.removeLabel(ctx, logger) - require.NoError(t, err) - - // Verify label was removed - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - assert.NotContains(t, updatedPod.Labels, leaderLabelKey) - - // Add label again - err = labeler.addLabel(ctx, logger) - require.NoError(t, err) - - // Verify label was added again - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - assert.Equal(t, leaderLabelValue, updatedPod.Labels[leaderLabelKey]) -} - -func TestPodLabeler_WithExistingLabels(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{ - "app": "my-app", - "version": "v1.0.0", - "environment": "production", - }, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). 
- Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - ctx := context.Background() - - // Add leader label - err = labeler.addLabel(ctx, logger) - require.NoError(t, err) - - // Verify all labels are preserved - updatedPod := &corev1.Pod{} - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - - expectedLabels := map[string]string{ - "app": "my-app", - "version": "v1.0.0", - "environment": "production", - leaderLabelKey: leaderLabelValue, - } - - assert.Equal(t, expectedLabels, updatedPod.Labels) - - // Remove leader label - err = labeler.removeLabel(ctx, logger) - require.NoError(t, err) - - // Verify only leader label was removed - err = client.Get(ctx, types.NamespacedName{ - Name: "test-pod", - Namespace: "test-namespace", - }, updatedPod) - require.NoError(t, err) - - expectedLabelsAfterRemoval := map[string]string{ - "app": "my-app", - "version": "v1.0.0", - "environment": "production", - } - - assert.Equal(t, expectedLabelsAfterRemoval, updatedPod.Labels) - assert.NotContains(t, updatedPod.Labels, leaderLabelKey) -} - -func TestPodLabeler_ContextCancellation(t *testing.T) { - // Setup - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - require.NoError(t, err) - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{}, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(pod). 
- Build() - - logger := zap.New(zap.UseDevMode(true)) - labeler := &PodLabeler{ - Client: client, - Log: logger, - PodName: "test-pod", - Namespace: "test-namespace", - } - - // Create a context that gets canceled immediately - ctx, cancel := context.WithCancel(context.Background()) - cancel() // Cancel immediately - - // Execute - Start should handle the canceled context gracefully - err = labeler.Start(ctx) - require.NoError(t, err) // Should not error, just exit cleanly -} diff --git a/internal/watch/gvr.go b/internal/watch/gvr.go index 1f981e70..46283cb6 100644 --- a/internal/watch/gvr.go +++ b/internal/watch/gvr.go @@ -178,6 +178,10 @@ func addGVR( out *[]GVR, seen map[string]struct{}, ) { + if shouldIgnoreResource(group, resource) { + return + } + key := group + "|" + version + "|" + resource + "|" + string(scope) if _, ok := seen[key]; ok { return diff --git a/internal/watch/informers.go b/internal/watch/informers.go index 261ac5dc..d7adba2f 100644 --- a/internal/watch/informers.go +++ b/internal/watch/informers.go @@ -64,6 +64,11 @@ func (m *Manager) handleEvent(obj interface{}, g GVR, op configv1alpha1.Operatio if u == nil { return } + if shouldIgnoreResource(g.Group, g.Resource) { + m.Log.V(1).Info("Skipping resource due to safety filter", + "group", g.Group, "version", g.Version, "resource", g.Resource) + return + } ctx := context.Background() diff --git a/internal/watch/manager.go b/internal/watch/manager.go index 42a6ba32..204e88cc 100644 --- a/internal/watch/manager.go +++ b/internal/watch/manager.go @@ -581,6 +581,12 @@ func (m *Manager) processListedObject( u *unstructured.Unstructured, g GVR, ) { + if shouldIgnoreResource(g.Group, g.Resource) { + m.Log.V(1).Info("Skipping seeded resource due to safety filter", + "group", g.Group, "version", g.Version, "resource", g.Resource) + return + } + id := types.NewResourceIdentifier(g.Group, g.Version, g.Resource, u.GetNamespace(), u.GetName()) var nsLabels map[string]string @@ -693,7 +699,7 @@ func (m 
*Manager) addGVRsFromResourceRule( } for _, resource := range rr.Resources { normalized := normalizeResource(resource) - if normalized == "*" { + if normalized == "*" || shouldIgnoreResource(group, normalized) { continue // Skip wildcards } gvr := schema.GroupVersionResource{ @@ -731,7 +737,7 @@ func (m *Manager) addGVRsFromClusterResourceRule( } for _, resource := range rr.Resources { normalized := normalizeResource(resource) - if normalized == "*" { + if normalized == "*" || shouldIgnoreResource(group, normalized) { continue // Skip wildcards } gvr := schema.GroupVersionResource{ @@ -752,6 +758,10 @@ func (m *Manager) listResourcesForGVR( gvr schema.GroupVersionResource, gitTarget *configv1alpha1.GitTarget, ) ([]types.ResourceIdentifier, error) { + if shouldIgnoreResource(gvr.Group, gvr.Resource) { + return nil, nil + } + var resources []types.ResourceIdentifier // List resources (cluster-wide for now, namespace filtering would go here) diff --git a/internal/watch/resource_filter.go b/internal/watch/resource_filter.go new file mode 100644 index 00000000..90b0ebfa --- /dev/null +++ b/internal/watch/resource_filter.go @@ -0,0 +1,25 @@ +/* +SPDX-License-Identifier: Apache-2.0 + +Copyright 2025 ConfigButler + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package watch + +import "strings" + +func shouldIgnoreResource(group, resource string) bool { + return group == "" && strings.EqualFold(resource, "secrets") +} diff --git a/internal/watch/resource_filter_test.go b/internal/watch/resource_filter_test.go new file mode 100644 index 00000000..76763f9f --- /dev/null +++ b/internal/watch/resource_filter_test.go @@ -0,0 +1,47 @@ +/* +SPDX-License-Identifier: Apache-2.0 + +Copyright 2025 ConfigButler + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package watch + +import "testing" + +func TestShouldIgnoreResource(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + group string + resource string + want bool + }{ + {name: "core secrets", group: "", resource: "secrets", want: true}, + {name: "core secrets case insensitive", group: "", resource: "Secrets", want: true}, + {name: "core configmaps", group: "", resource: "configmaps", want: false}, + {name: "non-core secrets", group: "example.com", resource: "secrets", want: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := shouldIgnoreResource(tt.group, tt.resource) + if got != tt.want { + t.Fatalf("shouldIgnoreResource(%q, %q) = %v, want %v", tt.group, tt.resource, got, tt.want) + } + }) + } +} diff --git a/internal/webhook/audit_handler.go b/internal/webhook/audit_handler.go index 436f8647..fd12aadb 100644 --- a/internal/webhook/audit_handler.go +++ b/internal/webhook/audit_handler.go @@ -27,6 +27,7 @@ import ( 
"os" "path/filepath" "strconv" + "strings" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" @@ -45,6 +46,12 @@ import ( const ( // DefaultAuditDumpDir is the default directory for audit event dumps. DefaultAuditDumpDir = "/tmp/audit-events" + // DefaultAuditMaxRequestBodyBytes limits incoming audit payload size. + DefaultAuditMaxRequestBodyBytes = int64(10 * 1024 * 1024) + // MaxClusterIDMetricLabelLength constrains label cardinality impact. + MaxClusterIDMetricLabelLength = 63 + // UnknownClusterIDMetricValue is used when cluster ID cannot be labeled safely. + UnknownClusterIDMetricValue = "unknown" ) // AuditHandlerConfig contains configuration for the audit handler. @@ -52,6 +59,8 @@ type AuditHandlerConfig struct { // DumpDir is the directory where audit events are written for debugging. // If empty, defaults to DefaultAuditDumpDir. DumpDir string + // MaxRequestBodyBytes is the maximum accepted HTTP request body size. + MaxRequestBodyBytes int64 } // AuditHandler handles incoming audit events and collects metrics. @@ -64,6 +73,10 @@ type AuditHandler struct { // NewAuditHandler creates a new audit handler with the given configuration. // If config.DumpDir is empty, file dumping is disabled. func NewAuditHandler(config AuditHandlerConfig) (*AuditHandler, error) { + if config.MaxRequestBodyBytes <= 0 { + config.MaxRequestBodyBytes = DefaultAuditMaxRequestBodyBytes + } + scheme := runtime.NewScheme() if err := audit.AddToScheme(scheme); err != nil { return nil, fmt.Errorf("failed to initialize scheme: %w", err) @@ -85,50 +98,67 @@ func NewAuditHandler(config AuditHandlerConfig) (*AuditHandler, error) { // ServeHTTP implements http.Handler for audit event processing. 
func (h *AuditHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { ctx := r.Context() - log := logf.FromContext(ctx) + log := logf.Log.WithName("audit-handler") if r.Method != http.MethodPost { http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } + clusterID, err := extractClusterID(r.URL.Path) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + reqLog := log.WithValues( + "clusterID", clusterID, + "remoteAddr", r.RemoteAddr, + "path", r.URL.Path, + ) + eventListV1, err := h.decodeEventList(r) if err != nil { - log.Error(err, "Failed to decode audit event list") + reqLog.Error(err, "Failed to decode audit event list") http.Error(w, err.Error(), http.StatusBadRequest) return } if len(eventListV1.Items) == 0 { - log.Info("Received empty audit event list") + reqLog.Info("Received empty audit event list", "eventCount", 0, "processingOutcome", "empty") w.WriteHeader(http.StatusOK) _, err = w.Write([]byte("Empty event list processed")) if err != nil { - log.Error(err, "Failed to write response") + reqLog.Error(err, "Failed to write response") } return } - if err := h.processEvents(ctx, eventListV1.Items); err != nil { - log.Error(err, "Failed to process audit events") + if err := h.processEvents(ctx, clusterID, eventListV1.Items); err != nil { + reqLog.Error(err, "Failed to process audit events") http.Error(w, err.Error(), http.StatusInternalServerError) return } + reqLog.Info("Processed audit request", "eventCount", len(eventListV1.Items), "processingOutcome", "success") w.WriteHeader(http.StatusOK) _, err = w.Write([]byte("Audit event processed")) if err != nil { - log.Error(err, "Failed to write response") + reqLog.Error(err, "Failed to write response") } } // decodeEventList reads and decodes the audit event list from the request. 
func (h *AuditHandler) decodeEventList(r *http.Request) (*auditv1.EventList, error) { - body, err := io.ReadAll(r.Body) + limited := io.LimitReader(r.Body, h.config.MaxRequestBodyBytes+1) + body, err := io.ReadAll(limited) if err != nil { return nil, fmt.Errorf("failed to read body: %w", err) } defer r.Body.Close() + if int64(len(body)) > h.config.MaxRequestBodyBytes { + return nil, fmt.Errorf("request body too large: max %d bytes", h.config.MaxRequestBodyBytes) + } var eventListV1 auditv1.EventList _, _, err = h.deserializer.Decode(body, nil, &eventListV1) @@ -140,8 +170,9 @@ func (h *AuditHandler) decodeEventList(r *http.Request) (*auditv1.EventList, err } // processEvents processes a list of audit events. -func (h *AuditHandler) processEvents(ctx context.Context, events []auditv1.Event) error { - log := logf.FromContext(ctx) +func (h *AuditHandler) processEvents(ctx context.Context, clusterID string, events []auditv1.Event) error { + log := logf.Log.WithName("audit-handler") + clusterIDMetric := sanitizeClusterIDForMetric(clusterID) for _, auditEventV1 := range events { var auditEvent audit.Event @@ -161,6 +192,8 @@ func (h *AuditHandler) processEvents(ctx context.Context, events []auditv1.Event if auditEvent.ImpersonatedUser != nil { log.Info( "Audit event impersonated", + "clusterID", + clusterID, "authUser", auditEvent.User.Username, "impersonatedUser", @@ -170,6 +203,7 @@ func (h *AuditHandler) processEvents(ctx context.Context, events []auditv1.Event } metrics.AuditEventsReceivedTotal.Add(ctx, 1, metric.WithAttributes( + attribute.String("cluster_id", clusterIDMetric), attribute.String("gvr", gvr), attribute.String("action", action), attribute.String("user", user), @@ -179,6 +213,7 @@ func (h *AuditHandler) processEvents(ctx context.Context, events []auditv1.Event if process { // For now we hardly do a thing log.Info("Processed audit event", + "clusterID", clusterID, "gvr", gvr, "action", action, "auditID", auditEvent.AuditID, @@ -193,6 +228,61 @@ func (h 
*AuditHandler) processEvents(ctx context.Context, events []auditv1.Event return nil } +func extractClusterID(path string) (string, error) { + const auditPrefix = "/audit-webhook/" + if path == "/audit-webhook" { + return "", errors.New("missing cluster ID in path; expected /audit-webhook/{clusterID}") + } + if !strings.HasPrefix(path, auditPrefix) { + return "", errors.New("invalid path; expected /audit-webhook/{clusterID}") + } + + clusterID := strings.TrimPrefix(path, auditPrefix) + clusterID = strings.TrimSuffix(clusterID, "/") + if clusterID == "" { + return "", errors.New("missing cluster ID in path; expected /audit-webhook/{clusterID}") + } + if strings.Contains(clusterID, "/") { + return "", errors.New("invalid path; expected single segment cluster ID in /audit-webhook/{clusterID}") + } + + return clusterID, nil +} + +func sanitizeClusterIDForMetric(clusterID string) string { + clusterID = strings.TrimSpace(clusterID) + if clusterID == "" { + return UnknownClusterIDMetricValue + } + + var builder strings.Builder + builder.Grow(len(clusterID)) + for _, r := range clusterID { + if isAllowedClusterIDRune(r) { + builder.WriteRune(r) + continue + } + builder.WriteByte('_') + } + + sanitized := builder.String() + if len(sanitized) > MaxClusterIDMetricLabelLength { + sanitized = sanitized[:MaxClusterIDMetricLabelLength] + } + if sanitized == "" { + return UnknownClusterIDMetricValue + } + + return sanitized +} + +func isAllowedClusterIDRune(r rune) bool { + return (r >= 'a' && r <= 'z') || + (r >= 'A' && r <= 'Z') || + (r >= '0' && r <= '9') || + r == '-' || r == '_' || r == '.' +} + // extractGVR constructs the Group/Version/Resource string from the audit event // using k8s.io/apimachinery/pkg/runtime/schema utilities. 
func (h *AuditHandler) extractGVR(event *audit.Event) string { diff --git a/internal/webhook/audit_handler_test.go b/internal/webhook/audit_handler_test.go index c51900bc..40a2c20a 100644 --- a/internal/webhook/audit_handler_test.go +++ b/internal/webhook/audit_handler_test.go @@ -25,6 +25,7 @@ import ( "net/http" "net/http/httptest" "os" + "strings" "testing" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -51,39 +52,66 @@ func TestAuditHandler_ServeHTTP(t *testing.T) { tests := []struct { name string method string + path string body string expectedStatus int }{ { name: "valid audit event - create configmap", method: http.MethodPost, + path: "/audit-webhook/cluster-a", body: `{"kind":"EventList","apiVersion":"audit.k8s.io/v1","items":[{"kind":"Event","level":"RequestResponse","auditID":"test-id","stage":"ResponseComplete","requestURI":"/api/v1/namespaces/default/configmaps","verb":"create","user":{"username":"test-user"},"objectRef":{"resource":"configmaps","namespace":"default","name":"test-config","apiVersion":"v1"},"responseStatus":{"code":200}}]}`, expectedStatus: http.StatusOK, }, { name: "valid audit event - update deployment", method: http.MethodPost, + path: "/audit-webhook/cluster-a", body: `{"kind":"EventList","apiVersion":"audit.k8s.io/v1","items":[{"kind":"Event","level":"RequestResponse","auditID":"test-id","stage":"ResponseComplete","requestURI":"/apis/apps/v1/namespaces/default/deployments/test-deploy","verb":"update","user":{"username":"test-user"},"objectRef":{"resource":"deployments","namespace":"default","name":"test-deploy","apiVersion":"apps/v1"},"responseStatus":{"code":200}}]}`, expectedStatus: http.StatusOK, }, { name: "multiple events in batch", method: http.MethodPost, + path: "/audit-webhook/cluster-a", body: 
`{"kind":"EventList","apiVersion":"audit.k8s.io/v1","items":[{"kind":"Event","auditID":"batch-event-1","verb":"create","user":{"username":"test-user"},"objectRef":{"resource":"configmaps","apiVersion":"v1"}},{"kind":"Event","auditID":"batch-event-2","verb":"update","user":{"username":"test-user"},"objectRef":{"resource":"deployments","apiVersion":"apps/v1"}}]}`, expectedStatus: http.StatusOK, }, + { + name: "newly seen cluster ID is accepted", + method: http.MethodPost, + path: "/audit-webhook/new-cluster-42", + body: `{"kind":"EventList","apiVersion":"audit.k8s.io/v1","items":[{"kind":"Event","auditID":"new-cluster-test","verb":"create","user":{"username":"test-user"},"objectRef":{"resource":"configmaps","apiVersion":"v1"}}]}`, + expectedStatus: http.StatusOK, + }, { name: "invalid method", method: http.MethodGet, + path: "/audit-webhook/cluster-a", body: `{"kind":"EventList","apiVersion":"audit.k8s.io/v1","items":[{"kind":"Event","auditID":"invalid-method-test","verb":"create","user":{"username":"test-user"},"objectRef":{"resource":"configmaps","apiVersion":"v1"}}]}`, expectedStatus: http.StatusMethodNotAllowed, }, { name: "invalid JSON", method: http.MethodPost, + path: "/audit-webhook/cluster-a", body: "invalid json", expectedStatus: http.StatusBadRequest, }, + { + name: "missing cluster ID path", + method: http.MethodPost, + path: "/audit-webhook", + body: `{"kind":"EventList","apiVersion":"audit.k8s.io/v1","items":[{"kind":"Event","auditID":"missing-cluster","verb":"create","user":{"username":"test-user"},"objectRef":{"resource":"configmaps","apiVersion":"v1"}}]}`, + expectedStatus: http.StatusBadRequest, + }, + { + name: "extra path segments are rejected", + method: http.MethodPost, + path: "/audit-webhook/cluster-a/extra", + body: `{"kind":"EventList","apiVersion":"audit.k8s.io/v1","items":[{"kind":"Event","auditID":"missing-cluster","verb":"create","user":{"username":"test-user"},"objectRef":{"resource":"configmaps","apiVersion":"v1"}}]}`, + 
expectedStatus: http.StatusBadRequest, + }, } for _, tt := range tests { @@ -94,7 +122,7 @@ func TestAuditHandler_ServeHTTP(t *testing.T) { require.NoError(t, err) // Create request - req := httptest.NewRequest(tt.method, "/audit-webhook", bytes.NewReader([]byte(tt.body))) + req := httptest.NewRequest(tt.method, tt.path, bytes.NewReader([]byte(tt.body))) w := httptest.NewRecorder() // Call handler @@ -162,7 +190,7 @@ func TestAuditHandler_InvalidJSON(t *testing.T) { }) require.NoError(t, err) - req := httptest.NewRequest(http.MethodPost, "/audit-webhook", bytes.NewReader([]byte("invalid json"))) + req := httptest.NewRequest(http.MethodPost, "/audit-webhook/cluster-a", bytes.NewReader([]byte("invalid json"))) w := httptest.NewRecorder() handler.ServeHTTP(w, req) @@ -201,7 +229,7 @@ func TestAuditHandler_FileDump(t *testing.T) { body, err := json.Marshal(eventList) require.NoError(t, err) - req := httptest.NewRequest(http.MethodPost, "/audit-webhook", bytes.NewReader(body)) + req := httptest.NewRequest(http.MethodPost, "/audit-webhook/cluster-a", bytes.NewReader(body)) w := httptest.NewRecorder() // Call handler @@ -266,7 +294,7 @@ func TestAuditHandler_FileDump(t *testing.T) { eventJSON, err := json.Marshal(eventList) require.NoError(t, err) - req := httptest.NewRequest(http.MethodPost, "/audit-webhook", bytes.NewReader(eventJSON)) + req := httptest.NewRequest(http.MethodPost, "/audit-webhook/cluster-a", bytes.NewReader(eventJSON)) w := httptest.NewRecorder() // Call handler @@ -373,3 +401,76 @@ func TestAuditHandler_ReadYAMLToJSON(t *testing.T) { // Log the JSON for verification t.Logf("Converted JSON: %s", jsonString) } + +func TestAuditHandler_RejectsOversizedBody(t *testing.T) { + handler, err := NewAuditHandler(AuditHandlerConfig{ + DumpDir: "/tmp/audit-events", + MaxRequestBodyBytes: 32, + }) + require.NoError(t, err) + + oversizedBody := `{"kind":"EventList","apiVersion":"audit.k8s.io/v1","items":[]}` + req := httptest.NewRequest(http.MethodPost, 
"/audit-webhook/cluster-a", bytes.NewReader([]byte(oversizedBody))) + w := httptest.NewRecorder() + + handler.ServeHTTP(w, req) + + assert.Equal(t, http.StatusBadRequest, w.Code) + assert.Contains(t, w.Body.String(), "request body too large") +} + +func TestExtractClusterID(t *testing.T) { + tests := []struct { + name string + path string + expectedID string + expectError bool + }{ + { + name: "valid cluster ID", + path: "/audit-webhook/cluster-a", + expectedID: "cluster-a", + }, + { + name: "valid cluster ID with trailing slash", + path: "/audit-webhook/cluster-a/", + expectedID: "cluster-a", + }, + { + name: "missing cluster ID", + path: "/audit-webhook", + expectError: true, + }, + { + name: "extra segment", + path: "/audit-webhook/cluster-a/extra", + expectError: true, + }, + { + name: "invalid prefix", + path: "/wrong/cluster-a", + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + clusterID, err := extractClusterID(tt.path) + if tt.expectError { + require.Error(t, err) + return + } + require.NoError(t, err) + assert.Equal(t, tt.expectedID, clusterID) + }) + } +} + +func TestSanitizeClusterIDForMetric(t *testing.T) { + assert.Equal(t, "cluster-a", sanitizeClusterIDForMetric("cluster-a")) + assert.Equal(t, "cluster_a", sanitizeClusterIDForMetric("cluster/a")) + assert.Equal(t, "unknown", sanitizeClusterIDForMetric(" ")) + + longID := strings.Repeat("a", MaxClusterIDMetricLabelLength+5) + assert.Len(t, sanitizeClusterIDForMetric(longID), MaxClusterIDMetricLabelLength) +} diff --git a/test/e2e/E2E_DEBUGGING.md b/test/e2e/E2E_DEBUGGING.md index b3c18615..ba3fa69a 100644 --- a/test/e2e/E2E_DEBUGGING.md +++ b/test/e2e/E2E_DEBUGGING.md @@ -61,7 +61,7 @@ go_goroutines{job="gitops-reverser-metrics"} ``` Host Machine (port 13000, 19090) - ↕ (exposed via --network=host) + ↕ (VS Code forwarded ports from devcontainer) DevContainer ↕ (kubectl port-forward) Kind Cluster @@ -117,4 +117,4 @@ make setup-port-forwards # Start 
port-forwards (Gitea:13000, Prometheus:19090 make cleanup-port-forwards # Stop all port-forwards make e2e-setup # Setup Gitea + Prometheus + port-forwards make test-e2e # Run e2e tests (includes port-forwards) -make e2e-cleanup # Clean up all infrastructure \ No newline at end of file +make e2e-cleanup # Clean up all infrastructure diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 4112476c..bdd973ad 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -42,7 +42,7 @@ func getProjectImage() string { if img := os.Getenv("PROJECT_IMAGE"); img != "" { return img } - return "example.com/gitops-reverser:v0.0.1" + return "gitops-reverser:e2e-local" } // TestE2E runs the end-to-end (e2e) test suite for the project. These tests execute in an isolated, @@ -63,15 +63,11 @@ var _ = BeforeSuite(func() { return } - // Local testing: ALWAYS rebuild to ensure latest code changes are included - By("building the manager(Operator) image for local testing (forcing rebuild)") - cmd := exec.Command("make", "docker-build", fmt.Sprintf("IMG=%s", projectImage)) + // IDE/direct go test path: ensure cluster exists and local image is built+loaded via Makefile. 
+ By("PROJECT_IMAGE is not set; preparing cluster/image through Makefile for local run") + cmd := exec.Command("make", "setup-cluster", "e2e-build-load-image", fmt.Sprintf("PROJECT_IMAGE=%s", projectImage)) _, err := utils.Run(cmd) - ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to build the manager(Operator) image") - - By("loading the manager(Operator) image on Kind (forcing reload)") - err = utils.LoadImageToKindClusterWithName(projectImage) - ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the manager(Operator) image into Kind") + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to build/load manager image via Makefile") }) var _ = AfterSuite(func() { diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 726578e5..efca3949 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -184,7 +184,7 @@ var _ = Describe("Manager", Ordered, func() { podOutput, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred(), "Failed to retrieve controller-manager pod information") podNames := utils.GetNonEmptyLines(podOutput) - g.Expect(podNames).To(HaveLen(2), "expected 2 controller pods running for HA") + g.Expect(podNames).To(HaveLen(1), "expected exactly 1 controller pod running") controllerPodName = podNames[0] // Use first pod for logging g.Expect(controllerPodName).To(ContainSubstring("controller-manager")) @@ -202,67 +202,74 @@ var _ = Describe("Manager", Ordered, func() { Eventually(verifyControllerUp).Should(Succeed()) }) - It("should identify leader pod with role=leader label", func() { - By("verifying that exactly one pod has the role=leader label") - verifyLeaderLabel := func(g Gomega) { - cmd := exec.Command("kubectl", "get", - "pods", "-l", "control-plane=controller-manager,role=leader", - "-o", "go-template={{ range .items }}"+ - "{{ if not .metadata.deletionTimestamp }}"+ - "{{ .metadata.name }}"+ - "{{ \"\\n\" }}{{ end }}{{ end }}", - "-n", namespace, - ) - - podOutput, err := utils.Run(cmd) - 
g.Expect(err).NotTo(HaveOccurred(), "Failed to retrieve leader pod information") - leaderPods := utils.GetNonEmptyLines(podOutput) - g.Expect(leaderPods).To(HaveLen(1), "expected exactly 1 leader pod") - - leaderPodName := leaderPods[0] - g.Expect(leaderPodName).To(ContainSubstring("controller-manager")) - - // Update controllerPodName to use the leader pod for subsequent tests - controllerPodName = leaderPodName - - By(fmt.Sprintf("Leader pod identified: %s", leaderPodName)) - } - Eventually(verifyLeaderLabel, 30*time.Second).Should(Succeed()) - }) - - It("should route webhook traffic only to leader pod", func() { - By("verifying webhook service selects only the leader pod") + It("should route webhook traffic to the running controller pod", func() { + By("verifying controller service selects the running controller pod") verifyWebhookService := func(g Gomega) { - // Get webhook service endpoints + // Get controller service endpoints cmd := exec.Command("kubectl", "get", "endpoints", - "gitops-reverser-webhook-service", "-n", namespace, + controllerServiceName, "-n", namespace, "-o", "jsonpath={.subsets[*].addresses[*].targetRef.name}") output, err := utils.Run(cmd) - g.Expect(err).NotTo(HaveOccurred(), "Failed to get webhook service endpoints") + g.Expect(err).NotTo(HaveOccurred(), "Failed to get controller service endpoints") // Filter out kubectl deprecation warnings from output lines := utils.GetNonEmptyLines(output) - var podNames []string + podSet := map[string]struct{}{} for _, line := range lines { // Skip warning lines if !strings.HasPrefix(line, "Warning:") && !strings.Contains(line, "deprecated") && strings.Contains(line, "controller-manager") { - podNames = append(podNames, line) + podSet[line] = struct{}{} } } + var podNames []string + for podName := range podSet { + podNames = append(podNames, podName) + } - // Should only have one endpoint (the leader pod) - g.Expect(podNames).To(HaveLen(1), "webhook service should route to exactly 1 pod (leader)") - 
- // Verify it's the leader pod - g.Expect(podNames[0]).To(Equal(controllerPodName), "webhook should route to leader pod") + // Should only have one endpoint in single-pod mode. + g.Expect(podNames).To(HaveLen(1), "controller service should route to exactly 1 pod") + g.Expect(podNames[0]).To(Equal(controllerPodName), "controller service should route to controller pod") - By(fmt.Sprintf("βœ… Webhook service correctly routes to leader pod: %s", controllerPodName)) + By(fmt.Sprintf("βœ… Controller service correctly routes to pod: %s", controllerPodName)) } Eventually(verifyWebhookService, 30*time.Second).Should(Succeed()) }) + It("should expose admission and audit ports on one controller service", func() { + By("verifying controller service exists") + cmd := exec.Command("kubectl", "get", "svc", controllerServiceName, "-n", namespace) + _, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Controller service should exist") + + By("verifying controller service routes to the controller pod") + Eventually(func(g Gomega) { + endpointsCmd := exec.Command("kubectl", "get", "endpoints", + controllerServiceName, "-n", namespace, + "-o", "jsonpath={.subsets[*].addresses[*].targetRef.name}") + output, endpointsErr := utils.Run(endpointsCmd) + g.Expect(endpointsErr).NotTo(HaveOccurred(), "Failed to get controller service endpoints") + + lines := utils.GetNonEmptyLines(output) + podSet := map[string]struct{}{} + for _, line := range lines { + if !strings.HasPrefix(line, "Warning:") && + !strings.Contains(line, "deprecated") && + strings.Contains(line, "controller-manager") { + podSet[line] = struct{}{} + } + } + var podNames []string + for podName := range podSet { + podNames = append(podNames, podName) + } + + g.Expect(podNames).To(HaveLen(1), "controller service should route to exactly 1 pod") + g.Expect(podNames[0]).To(Equal(controllerPodName), "controller service should route to controller pod") + }, 30*time.Second).Should(Succeed()) + }) + It("should have webhook 
registration configured", func() { By("verifying webhook registration for event handler") verifyWebhook := func(g Gomega) { @@ -278,14 +285,14 @@ var _ = Describe("Manager", Ordered, func() { }) It("should ensure the metrics endpoint is serving metrics", func() { - By("validating that the metrics service is available") - cmd := exec.Command("kubectl", "get", "service", metricsServiceName, "-n", namespace) + By("validating that the controller service is available for metrics") + cmd := exec.Command("kubectl", "get", "service", controllerServiceName, "-n", namespace) _, err := utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Metrics service should exist") + Expect(err).NotTo(HaveOccurred(), "Controller service should exist") By("waiting for the metrics endpoint to be ready") verifyMetricsEndpointReady := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "endpoints", metricsServiceName, "-n", namespace) + cmd := exec.Command("kubectl", "get", "endpoints", controllerServiceName, "-n", namespace) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) g.Expect(output).To(ContainSubstring("8443"), "Metrics endpoint is not ready") @@ -312,10 +319,10 @@ var _ = Describe("Manager", Ordered, func() { func(v float64) bool { return v > 0 }, "process metrics should exist") - By("verifying metrics from both controller pods") + By("verifying metrics from the controller pod") podCount, err := queryPrometheus("count(up{job='gitops-reverser-metrics'})") Expect(err).NotTo(HaveOccurred()) - Expect(podCount).To(Equal(2.0), "Should scrape from 2 controller pods") + Expect(podCount).To(Equal(1.0), "Should scrape from 1 controller pod") fmt.Printf("βœ… Metrics collection verified from %.0f pods\n", podCount) fmt.Printf("πŸ“Š Inspect metrics: %s\n", getPrometheusURL()) @@ -339,24 +346,13 @@ var _ = Describe("Manager", Ordered, func() { func(v float64) bool { return v > baselineEvents }, "webhook events should increment") - By("verifying only leader pod received webhook 
events") - leaderEvents, err := queryPrometheus( - "sum(gitopsreverser_events_received_total{role='leader'}) or vector(0)", - ) - Expect(err).NotTo(HaveOccurred()) - Expect(leaderEvents).To(BeNumerically(">", baselineEvents), - "Leader should have processed webhook events") - fmt.Printf("βœ… Leader processed %.0f events\n", leaderEvents-baselineEvents) - - By("confirming follower pod has no new webhook events") - followerEvents, err := queryPrometheus( - "sum(gitopsreverser_events_received_total{role!='leader'}) or vector(0)", - ) + By("verifying webhook events were received") + currentEvents, err := queryPrometheus("sum(gitopsreverser_events_received_total) or vector(0)") Expect(err).NotTo(HaveOccurred()) - Expect(followerEvents).To(Equal(0.0), - "Follower should not process webhook events") + Expect(currentEvents).To(BeNumerically(">", baselineEvents), "Controller should process webhook events") + fmt.Printf("βœ… Controller processed %.0f events\n", currentEvents-baselineEvents) - fmt.Printf("βœ… Webhook routing validated - only leader receives events\n") + fmt.Printf("βœ… Webhook routing validated\n") fmt.Printf("πŸ“Š Inspect metrics: %s\n", getPrometheusURL()) By("cleaning up webhook test resources") @@ -369,6 +365,11 @@ var _ = Describe("Manager", Ordered, func() { baselineAuditEvents, err := queryPrometheus("sum(gitopsreverser_audit_events_received_total) or vector(0)") Expect(err).NotTo(HaveOccurred()) fmt.Printf("πŸ“Š Baseline audit events: %.0f\n", baselineAuditEvents) + baselineClusterAuditEvents, err := queryPrometheus( + "sum(gitopsreverser_audit_events_received_total{cluster_id='kind-e2e'}) or vector(0)", + ) + Expect(err).NotTo(HaveOccurred()) + fmt.Printf("πŸ“Š Baseline kind-e2e audit events: %.0f\n", baselineClusterAuditEvents) By("creating a ConfigMap to trigger audit events") cmd := exec.Command("kubectl", "create", "configmap", "audit-test-cm", @@ -376,11 +377,22 @@ var _ = Describe("Manager", Ordered, func() { "--from-literal=test=audit") _, err 
= utils.Run(cmd) Expect(err).NotTo(HaveOccurred(), "ConfigMap creation should succeed") + cmd = exec.Command("kubectl", "patch", "configmap", "audit-test-cm", + "--namespace", namespace, + "--type=merge", + "--patch", `{"data":{"test":"audit-updated"}}`) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "ConfigMap update should succeed") By("waiting for audit event metric to increment") - waitForMetric("sum(gitopsreverser_audit_events_received_total) or vector(0)", + waitForMetricWithTimeout("sum(gitopsreverser_audit_events_received_total) or vector(0)", func(v float64) bool { return v > baselineAuditEvents }, - "audit events should increment") + "audit events should increment", 2*time.Minute) + waitForMetricWithTimeout( + "sum(gitopsreverser_audit_events_received_total{cluster_id='kind-e2e'}) or vector(0)", + func(v float64) bool { return v > baselineClusterAuditEvents }, + "audit events should increment for cluster_id=kind-e2e", 2*time.Minute, + ) By("verifying audit events were received") currentAuditEvents, err := queryPrometheus("sum(gitopsreverser_audit_events_received_total) or vector(0)") @@ -509,6 +521,64 @@ var _ = Describe("Manager", Ordered, func() { cleanupGitTarget(destName, namespace) }) + It("should never commit Secret manifests even if WatchRule includes secrets", func() { + gitProviderName := "gitprovider-normal" + watchRuleName := "watchrule-secret-ignore-test" + secretName := "test-secret-ignore" + + By("creating WatchRule that includes secrets") + destName := watchRuleName + "-dest" + createGitTarget(destName, namespace, gitProviderName, "e2e/secret-ignore-test", "main") + + data := struct { + Name string + Namespace string + DestinationName string + }{ + Name: watchRuleName, + Namespace: namespace, + DestinationName: destName, + } + + err := applyFromTemplate("test/e2e/templates/watchrule.tmpl", data, namespace) + Expect(err).NotTo(HaveOccurred(), "Failed to apply WatchRule") + verifyResourceStatus("watchrule", watchRuleName, 
namespace, "True", "Ready", "") + + By("creating Secret in watched namespace") + _, _ = utils.Run(exec.Command("kubectl", "delete", "secret", secretName, + "-n", namespace, "--ignore-not-found=true")) + + cmd := exec.Command("kubectl", "create", "secret", "generic", secretName, + "--from-literal=password=do-not-commit", "-n", namespace) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Secret creation should succeed") + + By("verifying Secret file never appears in Git repository") + verifySecretNotCommitted := func(g Gomega) { + pullCmd := exec.Command("git", "pull") + pullCmd.Dir = checkoutDir + pullOutput, pullErr := pullCmd.CombinedOutput() + if pullErr != nil { + g.Expect(pullErr).NotTo(HaveOccurred(), + fmt.Sprintf("Should successfully pull latest changes. Output: %s", string(pullOutput))) + } + + expectedFile := filepath.Join(checkoutDir, + "e2e/secret-ignore-test", + fmt.Sprintf("v1/secrets/%s/%s.yaml", namespace, secretName)) + _, statErr := os.Stat(expectedFile) + g.Expect(statErr).To(HaveOccurred(), fmt.Sprintf("Secret file must NOT exist at %s", expectedFile)) + g.Expect(os.IsNotExist(statErr)).To(BeTrue(), "Error should be 'file does not exist'") + } + Consistently(verifySecretNotCommitted, "20s", "2s").Should(Succeed()) + + By("cleaning up test resources") + _, _ = utils.Run(exec.Command("kubectl", "delete", "secret", secretName, + "-n", namespace, "--ignore-not-found=true")) + cleanupWatchRule(watchRuleName, namespace) + cleanupGitTarget(destName, namespace) + }) + It("should create Git commit when ConfigMap is added via WatchRule", func() { gitProviderName := "gitprovider-normal" watchRuleName := "watchrule-configmap-test" @@ -554,7 +624,7 @@ var _ = Describe("Manager", Ordered, func() { By("waiting for controller reconciliation of ConfigMap event") verifyReconciliationLogs := func(g Gomega) { - // Get controller logs from all pods (leader will have the reconciliation logs) + // Get controller logs from all pods (single-pod mode still 
uses label selector). cmd := exec.Command("kubectl", "logs", "-l", "control-plane=controller-manager", "-n", namespace, "--tail=500", "--prefix=true") output, err := utils.Run(cmd) @@ -562,7 +632,7 @@ var _ = Describe("Manager", Ordered, func() { // Check for git commit operation in logs g.Expect(output).To(ContainSubstring("git commit"), - "Should see git commit operation in logs from leader pod") + "Should see git commit operation in controller logs") } Eventually(verifyReconciliationLogs).Should(Succeed()) diff --git a/test/e2e/helpers.go b/test/e2e/helpers.go index b859140e..c58798eb 100644 --- a/test/e2e/helpers.go +++ b/test/e2e/helpers.go @@ -42,9 +42,10 @@ import ( // namespace where the project is deployed in. const namespace = "sut" +const metricWaitDefaultTimeout = 30 * time.Second -// metricsServiceName is the name of the metrics service of the project. -const metricsServiceName = "gitops-reverser-controller-manager-metrics-service" +// controllerServiceName is the single Service name used by the controller. +const controllerServiceName = "gitops-reverser-service" // promAPI is the Prometheus API client instance var promAPI v1.API //nolint:gochecknoglobals // Shared across test functions @@ -98,13 +99,23 @@ func queryPrometheus(query string) (float64, error) { // waitForMetric waits for a Prometheus metric query to satisfy a condition func waitForMetric(query string, condition func(float64) bool, description string) { + waitForMetricWithTimeout(query, condition, description, metricWaitDefaultTimeout) +} + +// waitForMetricWithTimeout waits for a Prometheus metric query with a custom timeout. 
+func waitForMetricWithTimeout( + query string, + condition func(float64) bool, + description string, + timeout time.Duration, +) { By(fmt.Sprintf("waiting for metric: %s", description)) Eventually(func(g Gomega) { value, err := queryPrometheus(query) g.Expect(err).NotTo(HaveOccurred(), "Failed to query Prometheus") g.Expect(condition(value)).To(BeTrue(), fmt.Sprintf("%s (query: %s, value: %.2f)", description, query, value)) - }, 30*time.Second, 2*time.Second).Should(Succeed()) //nolint:mnd // reasonable timeout and polling interval + }, timeout, 2*time.Second).Should(Succeed()) //nolint:mnd // reasonable polling interval } // getPrometheusURL returns the URL for accessing Prometheus UI @@ -163,18 +174,18 @@ func waitForCertificateSecrets() { Eventually(func(g Gomega) { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) //nolint:mnd // reasonable timeout defer cancel() - cmd := exec.CommandContext(ctx, "kubectl", "get", "secret", "webhook-server-cert", "-n", namespace) + cmd := exec.CommandContext(ctx, "kubectl", "get", "secret", "admission-server-cert", "-n", namespace) _, err := utils.Run(cmd) - g.Expect(err).NotTo(HaveOccurred(), "webhook-server-cert secret should exist") + g.Expect(err).NotTo(HaveOccurred(), "admission-server-cert secret should exist") }, 60*time.Second, 2*time.Second).Should(Succeed()) //nolint:mnd // reasonable timeout for cert-manager - By("waiting for metrics certificate secret to be created by cert-manager") + By("waiting for dedicated audit certificate secret to be created by cert-manager") Eventually(func(g Gomega) { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) //nolint:mnd // reasonable timeout defer cancel() - cmd := exec.CommandContext(ctx, "kubectl", "get", "secret", "metrics-server-cert", "-n", namespace) + cmd := exec.CommandContext(ctx, "kubectl", "get", "secret", "audit-server-cert", "-n", namespace) _, err := utils.Run(cmd) - g.Expect(err).NotTo(HaveOccurred(), "metrics-server-cert 
secret should exist")
+		g.Expect(err).NotTo(HaveOccurred(), "audit-server-cert secret should exist")
 	}, 60*time.Second, 2*time.Second).Should(Succeed()) //nolint:mnd // reasonable timeout for cert-manager
 
 	By("βœ… All certificate secrets are ready")
diff --git a/test/e2e/kind/README.md b/test/e2e/kind/README.md
index f8a728db..d63c922d 100644
--- a/test/e2e/kind/README.md
+++ b/test/e2e/kind/README.md
@@ -4,7 +4,7 @@ This directory contains configuration files to set up a Kind cluster with Kubern
 
 ## Overview
 
-The gitops-reverser operator exposes an experimental audit webhook endpoint at `/audit-webhook` that receives audit events from the Kubernetes API server. This setup configures Kind to send audit events to this endpoint for testing and metrics collection.
+The gitops-reverser operator exposes an experimental audit webhook endpoint at `/audit-webhook/{clusterID}` that receives audit events from the Kubernetes API server. This setup configures Kind to send audit events to this endpoint for testing and metrics collection.
 
 ## Files
 
@@ -28,7 +28,7 @@ The [`cluster.yaml`](cluster.yaml:1) mounts the audit policy and webhook configu
 The [`webhook-config.yaml`](audit/webhook-config.yaml:1) configures the kube-apiserver to send audit events to:
 
 ```
-https://gitops-reverser-webhook-service.gitops-reverser-system.svc.cluster.local:443/audit-webhook
+https://10.96.200.200:9444/audit-webhook/kind-e2e
 ```
 
 **Important**: Uses `insecure-skip-tls-verify: true` because the webhook service uses a self-signed certificate from cert-manager.
 
@@ -114,9 +114,9 @@ The audit webhook tracks metrics with labels:
    docker exec gitops-reverser-test-e2e-control-plane cat /var/log/kubernetes/kube-apiserver-audit.log
    ```
 
-2. **Verify webhook service exists**:
+2. **Verify audit webhook service exists**:
    ```bash
-   kubectl get svc -n gitops-reverser-system gitops-reverser-webhook-service
+   kubectl get svc -n gitops-reverser-system gitops-reverser-service
    ```
 
3. 
**Check if kube-apiserver can reach the webhook**: @@ -143,13 +143,13 @@ The K3s setup in [`docs/audit-setup/cluster/`](../../../docs/audit-setup/cluster The Kind setup uses: -- Service DNS: `gitops-reverser-webhook-service.gitops-reverser-system.svc.cluster.local` +- Fixed ClusterIP: `10.96.200.200` +- Path-based cluster identity: `/audit-webhook/kind-e2e` - Config location: `/etc/kubernetes/` (Kind standard) -- Automatic service discovery via DNS ## References - [Kubernetes Audit Documentation](https://kubernetes.io/docs/tasks/debug/debug-cluster/audit/) - [Kind Extra Mounts](https://kind.sigs.k8s.io/docs/user/configuration/#extra-mounts) - [Kubeadm Config Patches](https://kind.sigs.k8s.io/docs/user/configuration/#kubeadm-config-patches) -- [Audit Handler Implementation](../../../internal/webhook/audit_handler.go:1) \ No newline at end of file +- [Audit Handler Implementation](../../../internal/webhook/audit_handler.go:1) diff --git a/test/e2e/kind/audit/webhook-config.yaml b/test/e2e/kind/audit/webhook-config.yaml index f8cdf15e..4d3b4f2d 100644 --- a/test/e2e/kind/audit/webhook-config.yaml +++ b/test/e2e/kind/audit/webhook-config.yaml @@ -8,8 +8,8 @@ clusters: cluster: # IMPORTANT: Use fixed ClusterIP instead of DNS name # kube-apiserver starts before CoreDNS, so DNS resolution fails at startup - # The ClusterIP is set in test/e2e/manifests/webhook-service-fixed-ip.yaml - server: https://10.96.200.200:443/audit-webhook + # The ClusterIP is set in config/default/audit_webhook_service_fixed_ip_patch.yaml + server: https://10.96.200.200:9444/audit-webhook/kind-e2e # Skip TLS verification for testing (webhook uses self-signed cert from cert-manager) insecure-skip-tls-verify: true contexts: diff --git a/test/e2e/kind/cluster-template.yaml b/test/e2e/kind/cluster-template.yaml index 13154536..2b9619bc 100644 --- a/test/e2e/kind/cluster-template.yaml +++ b/test/e2e/kind/cluster-template.yaml @@ -3,9 +3,13 @@ # with the actual host path by the start-cluster.sh script 
kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 +networking: + # Bind the API server on all host interfaces so devcontainer clients can + # reach it via host.docker.internal. + apiServerAddress: "0.0.0.0" nodes: - role: control-plane - # Mount the entire audit directory using HOST path for Docker-in-Docker + # Mount the entire audit directory using $HOST_PROJECT_PATH. extraMounts: - hostPath: ${HOST_PROJECT_PATH}/test/e2e/kind/audit containerPath: /etc/kubernetes/audit diff --git a/test/e2e/kind/start-cluster.sh b/test/e2e/kind/start-cluster.sh index eba395a1..42963085 100755 --- a/test/e2e/kind/start-cluster.sh +++ b/test/e2e/kind/start-cluster.sh @@ -24,10 +24,8 @@ echo "βœ… Generated configuration:" cat "$CONFIG_FILE" echo "" -# Check if cluster already exists if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then - echo "βœ… Cluster '$CLUSTER_NAME' already exists. Skipping creation." - kind export kubeconfig --name "$CLUSTER_NAME" + echo "♻️ Reusing existing Kind cluster '$CLUSTER_NAME' (no delete/recreate)" else echo "πŸš€ Creating Kind cluster '$CLUSTER_NAME' with audit webhook support..." kind create cluster --name "$CLUSTER_NAME" --config "$CONFIG_FILE" --wait 5m @@ -37,4 +35,18 @@ fi echo "πŸ“‹ Configuring kubeconfig for cluster '$CLUSTER_NAME'..." kind export kubeconfig --name "$CLUSTER_NAME" +current_cluster_name="$(kubectl config view --minify -o jsonpath='{.clusters[0].name}')" +current_server="$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}')" + +if [[ "$current_server" =~ ^https://(127\.0\.0\.1|localhost|0\.0\.0\.0):([0-9]+)$ ]]; then + apiserver_port="${BASH_REMATCH[2]}" + echo "πŸ” Rewriting kubeconfig server endpoint to host.docker.internal:${apiserver_port}..." 
+ kubectl config set-cluster "$current_cluster_name" \ + --server="https://host.docker.internal:${apiserver_port}" \ + --tls-server-name=localhost >/dev/null + echo "βœ… kubeconfig endpoint updated for devcontainer networking" +else + echo "ℹ️ kubeconfig server is '$current_server' (no rewrite needed)" +fi + echo "βœ… Cluster setup complete!" diff --git a/test/e2e/prometheus/deployment.yaml b/test/e2e/prometheus/deployment.yaml index afada271..22ced0de 100644 --- a/test/e2e/prometheus/deployment.yaml +++ b/test/e2e/prometheus/deployment.yaml @@ -17,39 +17,13 @@ data: scrape_timeout: 4s scrape_configs: - # Scrape gitops-reverser metrics from 'sut' namespace + # Scrape gitops-reverser metrics from the single controller Service in 'sut' - job_name: 'gitops-reverser-metrics' - scheme: https - tls_config: - insecure_skip_verify: true # Self-signed certs in e2e + scheme: http bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - sut # Target test namespace - - relabel_configs: - # Keep only the metrics service - - source_labels: [__meta_kubernetes_service_name] - action: keep - regex: gitops-reverser-controller-manager-metrics-service - - # Keep only HTTPS port - - source_labels: [__meta_kubernetes_endpoint_port_name] - action: keep - regex: https - - # Add pod name label for per-pod metrics - - source_labels: [__meta_kubernetes_pod_name] - target_label: pod - action: replace - - # Add role label from pod labels (leader/follower) - - source_labels: [__meta_kubernetes_pod_label_role] - target_label: role - action: replace + static_configs: + - targets: + - gitops-reverser-service.sut.svc.cluster.local:8443 --- apiVersion: apps/v1 kind: Deployment @@ -127,4 +101,4 @@ spec: ports: - name: http port: 19090 - targetPort: http \ No newline at end of file + targetPort: http diff --git a/test/e2e/scripts/install-smoke.sh b/test/e2e/scripts/install-smoke.sh new file mode 100755 index 
00000000..c6a8c87c
--- /dev/null
+++ b/test/e2e/scripts/install-smoke.sh
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+MODE="${1:-}"
+NAMESPACE="gitops-reverser"
+HELM_CHART_SOURCE="${HELM_CHART_SOURCE:-charts/gitops-reverser}"
+WAIT_TIMEOUT="${WAIT_TIMEOUT:-60s}"
+PROJECT_IMAGE="${PROJECT_IMAGE:-}"
+
+if [[ -z "${MODE}" ]]; then
+  echo "usage: $0 <helm|manifest>"
+  exit 1
+fi
+
+get_controller_pod_selector() {
+  local selector
+  selector="$(kubectl -n "${NAMESPACE}" get deployment gitops-reverser \
+    -o go-template='{{range $k, $v := .spec.selector.matchLabels}}{{$k}}={{$v}},{{end}}' 2>/dev/null || true)"
+  selector="${selector%,}"
+
+  if [[ -z "${selector}" ]]; then
+    # Fallback selector used by chart/manifests if deployment query is not available yet.
+    selector="app.kubernetes.io/name=gitops-reverser"
+  fi
+
+  printf '%s' "${selector}"
+}
+
+install_helm() {
+  local helm_image_args=()
+
+  if [[ -n "${PROJECT_IMAGE}" ]]; then
+    # Helm chart image is repository + tag. For smoke tests, parse PROJECT_IMAGE and override both. 
+    local image_no_digest image_repo image_tag
+    image_no_digest="${PROJECT_IMAGE%%@*}"
+    if [[ "${image_no_digest##*/}" == *:* ]]; then
+      image_repo="${image_no_digest%:*}"
+      image_tag="${image_no_digest##*:}"
+    else
+      image_repo="${image_no_digest}"
+      image_tag="latest"
+    fi
+    helm_image_args+=(--set "image.repository=${image_repo}" --set "image.tag=${image_tag}")
+    echo "Overriding chart image from PROJECT_IMAGE=${PROJECT_IMAGE}"
+  fi
+
+  echo "Installing from Helm chart (mode=helm, source=${HELM_CHART_SOURCE})"
+  helm upgrade --install "name-is-cool-but-not-relevant" "${HELM_CHART_SOURCE}" \
+    --namespace "${NAMESPACE}" \
+    --create-namespace \
+    --set fullnameOverride=gitops-reverser \
+    ${helm_image_args[@]+"${helm_image_args[@]}"}
+}
+
+install_manifest() {
+  echo "Installing from generated dist/install.yaml (mode=manifest)"
+  kubectl apply -f dist/install.yaml
+
+  if [[ -n "${PROJECT_IMAGE}" ]]; then
+    echo "Overriding manifest deployment image from PROJECT_IMAGE=${PROJECT_IMAGE}"
+    kubectl -n "${NAMESPACE}" set image deployment/gitops-reverser manager="${PROJECT_IMAGE}"
+  fi
+}
+
+print_debug_info() {
+  local pod_selector
+  pod_selector="$(get_controller_pod_selector)"
+
+  echo
+  echo "Install smoke test diagnostics (${MODE})"
+  echo "Namespace: ${NAMESPACE}"
+  echo "Pod selector: ${pod_selector}"
+  echo "Deployment status:"
+  kubectl -n "${NAMESPACE}" get deployment gitops-reverser -o wide || true
+  echo
+  echo "Deployment describe:"
+  kubectl -n "${NAMESPACE}" describe deployment gitops-reverser || true
+  echo
+  echo "Pods:"
+  kubectl -n "${NAMESPACE}" get pods -o wide || true
+  echo
+  echo "Controller-manager pod describe:"
+  kubectl -n "${NAMESPACE}" describe pod -l "${pod_selector}" || true
+  echo
+  echo "Controller-manager logs (last 200 lines):"
+  kubectl -n "${NAMESPACE}" logs -l "${pod_selector}" --tail=200 --all-containers=true || true
+  echo
+  echo "Recent namespace events:"
+  kubectl -n "${NAMESPACE}" get events --sort-by=.metadata.creationTimestamp | tail -n 50 || 
true +} + +run_or_debug() { + local description="$1" + shift + echo "${description}" + if ! "$@"; then + echo "FAILED: ${description}" >&2 + print_debug_info + return 1 + fi +} + +verify_installation() { + local pod_selector + pod_selector="$(get_controller_pod_selector)" + + run_or_debug \ + "Waiting for deployment rollout (timeout=${WAIT_TIMEOUT})" \ + kubectl -n "${NAMESPACE}" rollout status deployment/gitops-reverser --timeout="${WAIT_TIMEOUT}" + + run_or_debug \ + "Checking deployment availability (timeout=${WAIT_TIMEOUT})" \ + kubectl -n "${NAMESPACE}" wait --for=condition=available deployment/gitops-reverser --timeout="${WAIT_TIMEOUT}" + + run_or_debug \ + "Checking pod readiness (selector=${pod_selector}, timeout=${WAIT_TIMEOUT})" \ + kubectl -n "${NAMESPACE}" wait --for=condition=ready pod -l "${pod_selector}" --timeout="${WAIT_TIMEOUT}" + + echo "Checking CRDs" + kubectl get crd \ + gitproviders.configbutler.ai \ + gittargets.configbutler.ai \ + watchrules.configbutler.ai \ + clusterwatchrules.configbutler.ai >/dev/null + + echo "Checking validating webhook configuration" + kubectl get validatingwebhookconfiguration gitops-reverser-validating-webhook-configuration >/dev/null +} + +case "${MODE}" in + helm) + install_helm + ;; + manifest) + install_manifest + ;; + *) + echo "unsupported mode: ${MODE} (expected helm or manifest)" + exit 1 + ;; +esac + +verify_installation +echo "Install smoke test passed (${MODE})" diff --git a/test/e2e/scripts/setup-prometheus.sh b/test/e2e/scripts/setup-prometheus.sh index 2aafb328..46565591 100755 --- a/test/e2e/scripts/setup-prometheus.sh +++ b/test/e2e/scripts/setup-prometheus.sh @@ -15,4 +15,8 @@ kubectl apply -f test/e2e/prometheus/rbac.yaml echo "Deploying Prometheus..." kubectl apply -f test/e2e/prometheus/deployment.yaml +echo "Restarting Prometheus deployment to pick up ConfigMap changes..." 
+kubectl rollout restart deployment/prometheus -n prometheus-e2e +kubectl rollout status deployment/prometheus -n prometheus-e2e --timeout=120s + echo "βœ… Prometheus manifests deployed"