diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index a02b8944..b3606c28 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -5,7 +5,7 @@ FROM golang:1.26.0-bookworm AS ci # Avoid warnings by switching to noninteractive ENV DEBIAN_FRONTEND=noninteractive -# Install essential packages (no Docker) +# Install essential packages + Docker CLI (for DooD via socket mount in CI) RUN apt-get update \ && apt-get -y install --no-install-recommends \ git \ @@ -18,6 +18,14 @@ RUN apt-get update \ acl \ rsync \ gettext-base \ + gnupg \ + && install -m 0755 -d /etc/apt/keyrings \ + && curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc \ + && chmod a+r /etc/apt/keyrings/docker.asc \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian $(. /etc/os-release && echo "$VERSION_CODENAME") stable" \ + > /etc/apt/sources.list.d/docker.list \ + && apt-get update \ + && apt-get -y install --no-install-recommends docker-ce-cli \ && apt-get autoremove -y \ && apt-get clean -y \ && rm -rf /var/lib/apt/lists/* @@ -27,7 +35,8 @@ ENV KUBECTL_VERSION=v1.35.0 \ KUSTOMIZE_VERSION=5.8.1 \ KUBEBUILDER_VERSION=4.11.1 \ GOLANGCI_LINT_VERSION=v2.10.1 \ - HELM_VERSION=v4.0.0 + HELM_VERSION=v4.0.0 \ + KIND_VERSION=v0.31.0 # Install kubectl RUN curl -LO "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" \ @@ -53,6 +62,10 @@ RUN curl -fsSL https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz \ RUN curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh \ | sh -s -- -b /usr/local/bin ${GOLANGCI_LINT_VERSION} +# Install Kind (needed in CI for e2e cluster creation, and in dev for local clusters) +RUN curl -Lo /usr/local/bin/kind "https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-amd64" \ + && chmod +x /usr/local/bin/kind + # Set working directory WORKDIR /workspaces @@ 
-101,11 +114,23 @@ USER root # Switch to noninteractive for package installation ENV DEBIAN_FRONTEND=noninteractive -# Install Kind (for local Kubernetes clusters) -# Docker is provided by devcontainer.json docker-in-docker feature -ENV KIND_VERSION=v0.31.0 -RUN curl -Lo /usr/local/bin/kind https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-amd64 -RUN chmod +x /usr/local/bin/kind +# Kind is already installed in the ci stage above. + +# Install Tilt CLI (local dev loop orchestration; dev stage only) +# https://github.com/tilt-dev/tilt/releases +ENV TILT_VERSION=v0.36.3 +RUN arch="$(dpkg --print-architecture)" && \ + case "${arch}" in \ + amd64) tilt_arch="x86_64" ;; \ + arm64) tilt_arch="arm64" ;; \ + *) echo "unsupported architecture for Tilt: ${arch}" && exit 1 ;; \ + esac && \ + tilt_version_no_v="${TILT_VERSION#v}" && \ + curl -fsSL -o /tmp/tilt.tar.gz \ + "https://github.com/tilt-dev/tilt/releases/download/${TILT_VERSION}/tilt.${tilt_version_no_v}.linux.${tilt_arch}.tar.gz" && \ + tar -xzf /tmp/tilt.tar.gz -C /tmp tilt && \ + install -m 0755 /tmp/tilt /usr/local/bin/tilt && \ + rm -f /tmp/tilt /tmp/tilt.tar.gz # Install Delve debugger for Go debugging in VSCode # ACLs from CI stage should be preserved, so no need to re-apply diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 7ada7341..4b5cde7a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -22,7 +22,8 @@ ], "forwardPorts": [ 13000, - 19090 + 19090, + 10350 ], "portsAttributes": { "13000": { @@ -32,6 +33,10 @@ "19090": { "label": "Prometheus", "onAutoForward": "notify" + }, + "10350": { + "label": "tilt", + "onAutoForward": "notify" } }, "customizations": { diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index be735903..e7916f42 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,6 +77,8 @@ jobs: helm version golangci-lint version controller-gen --version + kind 
version + docker --version echo '✅ All CI container tools verified' " @@ -122,7 +124,7 @@ jobs: helm version golangci-lint version kind version - docker --version || echo 'Docker CLI not available (expected in container)' + docker --version echo '✅ All dev container tools verified' " @@ -252,36 +254,12 @@ jobs: needs: [build-ci-container, docker-build] env: PROJECT_IMAGE: ${{ needs.docker-build.outputs.image }} - KIND_CLUSTER: gitops-reverser-test-e2e CI_CONTAINER: ${{ needs.build-ci-container.outputs.image }} - + steps: - name: Checkout code uses: actions/checkout@v6 - - name: Generate Kind cluster config from template - env: - HOST_PROJECT_PATH: ${{ github.workspace }} - run: | - echo "🔧 Generating cluster config with HOST_PROJECT_PATH=${HOST_PROJECT_PATH}" - envsubst < test/e2e/kind/cluster-template.yaml > test/e2e/kind/cluster.yaml - echo "✅ Generated configuration:" - cat test/e2e/kind/cluster.yaml - - - name: Set up Kind cluster with audit webhook support - uses: helm/kind-action@v1.13.0 - with: - cluster_name: ${{ env.KIND_CLUSTER }} - version: v0.31.0 - config: test/e2e/kind/cluster.yaml - wait: 5m - - - name: Verify cluster and audit setup - run: | - kubectl cluster-info - kubectl get nodes - echo "✅ Kind cluster is ready" - - name: Login to Docker registry run: | echo "${{ secrets.GITHUB_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ github.actor }} --password-stdin @@ -291,10 +269,10 @@ jobs: docker run --rm \ --network host \ -v "${GITHUB_WORKSPACE}:${{ env.CI_WORKDIR }}" \ - -v "$HOME/.kube:/root/.kube" \ + -v /var/run/docker.sock:/var/run/docker.sock \ -w "${{ env.CI_WORKDIR }}" \ -e PROJECT_IMAGE=${{ env.PROJECT_IMAGE }} \ - -e KIND_CLUSTER=${{ env.KIND_CLUSTER }} \ + -e HOST_PROJECT_PATH=${{ github.workspace }} \ ${{ env.CI_CONTAINER }} \ bash -c " git config --global --add safe.directory ${{ env.CI_WORKDIR }} @@ -310,7 +288,6 @@ jobs: scenario: [helm, manifest] env: PROJECT_IMAGE: ${{ needs.docker-build.outputs.image }} - KIND_CLUSTER: 
gitops-reverser-test-e2e-smoke-${{ matrix.scenario }} CI_CONTAINER: ${{ needs.build-ci-container.outputs.image }} steps: @@ -323,29 +300,6 @@ jobs: name: release-bundle path: . - - name: Generate Kind cluster config from template - env: - HOST_PROJECT_PATH: ${{ github.workspace }} - run: | - echo "🔧 Generating cluster config with HOST_PROJECT_PATH=${HOST_PROJECT_PATH}" - envsubst < test/e2e/kind/cluster-template.yaml > test/e2e/kind/cluster.yaml - echo "✅ Generated configuration:" - cat test/e2e/kind/cluster.yaml - - - name: Set up Kind cluster with audit webhook support - uses: helm/kind-action@v1.13.0 - with: - cluster_name: ${{ env.KIND_CLUSTER }} - version: v0.31.0 - config: test/e2e/kind/cluster.yaml - wait: 5m - - - name: Verify cluster setup - run: | - kubectl cluster-info - kubectl get nodes - echo "✅ Kind cluster is ready" - - name: Login to Docker registry run: | echo "${{ secrets.GITHUB_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ github.actor }} --password-stdin @@ -356,11 +310,11 @@ jobs: docker run --rm \ --network host \ -v "${GITHUB_WORKSPACE}:${{ env.CI_WORKDIR }}" \ - -v "$HOME/.kube:/root/.kube" \ + -v /var/run/docker.sock:/var/run/docker.sock \ -w "${{ env.CI_WORKDIR }}" \ -e PROJECT_IMAGE=${{ env.PROJECT_IMAGE }} \ - -e KIND_CLUSTER=${{ env.KIND_CLUSTER }} \ -e HELM_CHART_SOURCE="./gitops-reverser.tgz" \ + -e HOST_PROJECT_PATH=${{ github.workspace }} \ ${{ env.CI_CONTAINER }} \ bash -c " git config --global --add safe.directory ${{ env.CI_WORKDIR }} diff --git a/.gitignore b/.gitignore index e09469fb..0d919021 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,4 @@ dist/install.yaml *.o.yaml *.ignore.* +.stamps/* \ No newline at end of file diff --git a/.stamps/cluster/pf.running b/.stamps/cluster/pf.running new file mode 100644 index 00000000..e69de29b diff --git a/AGENTS.md b/AGENTS.md index f7010eb1..f87a142e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,6 +17,13 @@ make test # Must pass all unit tests with >90% coverage make 
test-e2e # Must pass end-to-end tests ``` +And before you finally wrap up, also run: + +```bash +make test-e2e-quickstart-manifest +make test-e2e-quickstart-helm +``` + ## PRE-IMPLEMENTATION BEHAVIOR 1. **Check Docker availability for e2e tests**: Before running `make test-e2e`, verify Docker is running with `docker info` or ask user to start Docker daemon if needed diff --git a/Makefile b/Makefile index 2246bca8..620dc75a 100644 --- a/Makefile +++ b/Makefile @@ -44,13 +44,14 @@ help: ## Display this help. .PHONY: manifests manifests: ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. @rm -f config/crd/bases/*.yaml - $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases + $(CONTROLLER_GEN) rbac:roleName=gitops-reverser crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases .PHONY: helm-sync helm-sync: ## Sync CRDs and roles from config/crd/bases to Helm chart crds directory (for packaging) @rm -f charts/gitops-reverser/crds/*.yaml @cp config/crd/bases/*.yaml charts/gitops-reverser/crds/ - @cp config/rbac/role.yaml charts/gitops-reverser/config + @rm -f charts/gitops-reverser/config/*.yaml + @cp config/rbac/cluster-role.yaml charts/gitops-reverser/config/ .PHONY: generate generate: ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. @@ -68,60 +69,56 @@ vet: ## Run go vet against code. test: manifests generate fmt vet setup-envtest ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(shell pwd)/bin -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out -KIND_CLUSTER ?= gitops-reverser-test-e2e +# Kind cluster names per E2E test type (kept separate to avoid cross-test contamination). 
+KIND_CLUSTER_E2E ?= gitops-reverser-test-e2e +KIND_CLUSTER_QUICKSTART_HELM ?= gitops-reverser-test-e2e-quickstart-helm +KIND_CLUSTER_QUICKSTART_MANIFEST ?= gitops-reverser-test-e2e-quickstart-manifest +# KIND_CLUSTER is used by cleanup-cluster; defaults to the main e2e cluster. +KIND_CLUSTER ?= $(KIND_CLUSTER_E2E) E2E_LOCAL_IMAGE ?= gitops-reverser:e2e-local +# In CI, PROJECT_IMAGE is the pre-built image from the docker-build job; locally we build E2E_LOCAL_IMAGE. +E2E_IMAGE := $(if $(PROJECT_IMAGE),$(PROJECT_IMAGE),$(E2E_LOCAL_IMAGE)) +E2E_AGE_KEY_FILE ?= /tmp/e2e-age-key.txt CERT_MANAGER_WAIT_TIMEOUT ?= 600s CERT_MANAGER_VERSION ?= v1.19.1 CERT_MANAGER_MANIFEST_URL ?= https://github.com/cert-manager/cert-manager/releases/download/$(CERT_MANAGER_VERSION)/cert-manager.yaml -.PHONY: setup-cluster -setup-cluster: ## Set up a Kind cluster for e2e tests if it does not exist - @if ! command -v $(KIND) >/dev/null 2>&1; then \ - echo "Kind is not installed - skipping Makefile cluster creation (expected in CI runs since we use helm/kind-action)"; \ - else \ - KIND_CLUSTER=$(KIND_CLUSTER) bash test/e2e/kind/start-cluster.sh; \ - fi +# CTX: kubeconfig context for the cluster being operated on. +# Defaults to the main e2e cluster; override with CTX=kind- to reuse stamp targets for other clusters. +CTX ?= kind-$(KIND_CLUSTER_E2E) +# Derive the Kind cluster name by stripping the "kind-" prefix from CTX. 
+CLUSTER_FROM_CTX = $(patsubst kind-%,%,$(CTX)) +CS := .stamps/cluster/$(CTX) +IS := .stamps/image +GO_SOURCES := $(shell find cmd internal -type f -name '*.go') go.mod go.sum + +KUBECONTEXT_QS_HELM := kind-$(KIND_CLUSTER_QUICKSTART_HELM) +KUBECONTEXT_QS_MANIFEST := kind-$(KIND_CLUSTER_QUICKSTART_MANIFEST) .PHONY: cleanup-cluster -cleanup-cluster: ## Tear down the Kind cluster used for e2e tests +cleanup-cluster: ## Tear down the Kind cluster used for e2e tests and remove its stamps @if $(KIND) get clusters 2>/dev/null | grep -q "^$(KIND_CLUSTER)$$"; then \ echo "🧹 Deleting Kind cluster '$(KIND_CLUSTER)'"; \ $(KIND) delete cluster --name $(KIND_CLUSTER); \ else \ echo "ℹ️ Kind cluster '$(KIND_CLUSTER)' does not exist; skipping cleanup"; \ fi - -.PHONY: e2e-build-load-image -e2e-build-load-image: ## Build local image and load it into the Kind cluster used by local e2e flows - @if [ -n "$(PROJECT_IMAGE)" ]; then \ - echo "🐳 Building local image $(PROJECT_IMAGE)"; \ - $(CONTAINER_TOOL) build -t $(PROJECT_IMAGE) .; \ - echo "📦 Loading image $(PROJECT_IMAGE) into Kind cluster $(KIND_CLUSTER)"; \ - $(KIND) load docker-image $(PROJECT_IMAGE) --name $(KIND_CLUSTER); \ - else \ - echo "🐳 Building local image $(E2E_LOCAL_IMAGE)"; \ - $(CONTAINER_TOOL) build -t $(E2E_LOCAL_IMAGE) .; \ - echo "📦 Loading image $(E2E_LOCAL_IMAGE) into Kind cluster $(KIND_CLUSTER)"; \ - $(KIND) load docker-image $(E2E_LOCAL_IMAGE) --name $(KIND_CLUSTER); \ - fi + rm -rf .stamps/cluster/kind-$(KIND_CLUSTER) + +.PHONY: cleanup-e2e-clusters +cleanup-e2e-clusters: ## Tear down all E2E Kind clusters and their stamps + @for cluster in "$(KIND_CLUSTER_E2E)" "$(KIND_CLUSTER_QUICKSTART_HELM)" "$(KIND_CLUSTER_QUICKSTART_MANIFEST)"; do \ + if $(KIND) get clusters 2>/dev/null | grep -q "^$$cluster$$"; then \ + echo "🧹 Deleting Kind cluster '$$cluster'"; \ + $(KIND) delete cluster --name "$$cluster"; \ + else \ + echo "ℹ️ Kind cluster '$$cluster' does not exist; skipping cleanup"; \ + fi; \ + rm -rf 
.stamps/cluster/kind-$$cluster; \ + done .PHONY: test-e2e -test-e2e: setup-cluster cleanup-webhook setup-e2e check-cert-manager manifests setup-port-forwards ## Run end-to-end tests in Kind cluster, note that vet, fmt and generate are not run! - @echo "ℹ️ test-e2e reuses the existing Kind cluster (no cluster cleanup in this target)"; \ - if [ -n "$(PROJECT_IMAGE)" ]; then \ - echo "ℹ️ Entry point selected pre-built image (CI-friendly): $(PROJECT_IMAGE)"; \ - echo "ℹ️ Skipping local image build/load for pre-built image path"; \ - KIND_CLUSTER=$(KIND_CLUSTER) PROJECT_IMAGE="$(PROJECT_IMAGE)" go test ./test/e2e/ -v -ginkgo.v; \ - else \ - echo "ℹ️ Entry point selected local fallback image: $(E2E_LOCAL_IMAGE)"; \ - echo "ℹ️ Building/loading local image into existing cluster"; \ - $(MAKE) e2e-build-load-image KIND_CLUSTER=$(KIND_CLUSTER); \ - KIND_CLUSTER=$(KIND_CLUSTER) PROJECT_IMAGE="$(E2E_LOCAL_IMAGE)" go test ./test/e2e/ -v -ginkgo.v; \ - fi - -.PHONY: cleanup-webhook -cleanup-webhook: ## Preventive cleanup of ValidatingWebhookConfiguration potenially left by previous test runs - $(KUBECTL) delete ValidatingWebhookConfiguration gitops-reverser-validating-webhook-configuration --ignore-not-found=true +test-e2e: $(CS)/e2e.passed ## Run end-to-end tests (stamp-based; only reruns what changed) .PHONY: lint lint: ## Run golangci-lint linter @@ -230,27 +227,156 @@ setup-envtest: ## Setup envtest binaries for unit tests exit 1; \ } -##@ E2E Test Infrastructure +##@ E2E Stamp Targets (cluster-parameterized; pass CTX=kind- to target a different cluster) -.PHONY: setup-gitea-e2e -setup-gitea-e2e: ## Set up Gitea for e2e testing - @echo "🚀 Setup Gitea for e2e testing..." 
- @$(HELM) repo add gitea-charts https://dl.gitea.com/charts/ 2>/dev/null || true - @$(HELM) repo update gitea-charts - @$(KUBECTL) create namespace $(GITEA_NAMESPACE) --dry-run=client -o yaml | $(KUBECTL) apply -f - - @$(HELM) upgrade --install gitea gitea-charts/gitea \ - --namespace $(GITEA_NAMESPACE) \ - --version $(GITEA_CHART_VERSION) \ - --values test/e2e/gitea-values.yaml +# DEPLOY_INPUTS: all config files except CRDs (tracked separately by crds.applied). +# Only relevant for the main e2e cluster (controller.deployed is not used by quickstart targets). +# When PROJECT_IMAGE is set (CI), skip the local image build stamp. +ifeq ($(PROJECT_IMAGE),) +DEPLOY_INPUTS := $(CS)/crds.applied $(CS)/cert-manager.installed $(CS)/prometheus.installed \ + $(CS)/sut.namespace.ready \ + $(IS)/controller.id \ + $(shell find config -type f -not -path 'config/crd/*') +else +DEPLOY_INPUTS := $(CS)/crds.applied $(CS)/cert-manager.installed $(CS)/prometheus.installed \ + $(CS)/sut.namespace.ready \ + $(shell find config -type f -not -path 'config/crd/*') +endif + +$(CS)/ready: test/e2e/kind/start-cluster.sh test/e2e/kind/cluster-template.yaml + mkdir -p $(CS) + KIND_CLUSTER=$(CLUSTER_FROM_CTX) bash test/e2e/kind/start-cluster.sh + kubectl --context $(CTX) get ns >/dev/null + touch $@ + +$(CS)/sut.namespace.ready: $(CS)/ready config/namespace.yaml + mkdir -p $(CS) + kubectl --context $(CTX) apply -f config/namespace.yaml + kubectl --context $(CTX) label --overwrite ns sut pod-security.kubernetes.io/enforce=restricted + touch $@ + +$(CS)/cert-manager.installed: $(CS)/ready + mkdir -p $(CS) + kubectl --context $(CTX) apply -f $(CERT_MANAGER_MANIFEST_URL) | grep -v unchanged + kubectl --context $(CTX) -n cert-manager rollout status deploy/cert-manager --timeout=$(CERT_MANAGER_WAIT_TIMEOUT) + kubectl --context $(CTX) -n cert-manager rollout status deploy/cert-manager-webhook --timeout=$(CERT_MANAGER_WAIT_TIMEOUT) + kubectl --context $(CTX) -n cert-manager rollout status 
deploy/cert-manager-cainjector --timeout=$(CERT_MANAGER_WAIT_TIMEOUT) + touch $@ + +$(CS)/gitea.installed: $(CS)/ready test/e2e/gitea-values.yaml + mkdir -p $(CS) + $(HELM) repo add gitea-charts https://dl.gitea.com/charts/ 2>/dev/null || true + $(HELM) repo update gitea-charts + kubectl --context $(CTX) create namespace $(GITEA_NAMESPACE) --dry-run=client -o yaml \ + | kubectl --context $(CTX) apply -f - + $(HELM) --kube-context $(CTX) upgrade --install gitea gitea-charts/gitea \ + --namespace $(GITEA_NAMESPACE) \ + --version $(GITEA_CHART_VERSION) \ + --values test/e2e/gitea-values.yaml + kubectl --context $(CTX) -n $(GITEA_NAMESPACE) rollout status deploy/gitea --timeout=300s + touch $@ + +$(CS)/prometheus.installed: $(CS)/ready test/e2e/scripts/ensure-prometheus-operator.sh + mkdir -p $(CS) + KUBECONTEXT=$(CTX) bash test/e2e/scripts/ensure-prometheus-operator.sh + touch $@ + +$(CS)/age-key.applied: $(CS)/sut.namespace.ready test/e2e/tools/gen-age-key/main.go + mkdir -p $(CS) + go run ./test/e2e/tools/gen-age-key \ + --key-file $(CS)/age-key.txt \ + --secret-file $(CS)/age-key-secret.yaml \ + --namespace sut \ + --secret-name sops-age-key + kubectl --context $(CTX) apply -f $(CS)/age-key-secret.yaml + touch $@ + +$(IS)/controller.id: $(GO_SOURCES) Dockerfile + mkdir -p $(IS) + $(CONTAINER_TOOL) build \ + --build-arg TARGETOS=$(shell go env GOOS) \ + --build-arg TARGETARCH=$(shell go env GOARCH) \ + -t $(E2E_LOCAL_IMAGE) . 
+ $(CONTAINER_TOOL) inspect --format='{{.Id}}' $(E2E_LOCAL_IMAGE) > $@ + +$(CS)/crds.applied: $(CS)/ready $(shell find config/crd -type f) + mkdir -p $(CS) + kubectl --context $(CTX) apply -k config/crd + kubectl --context $(CTX) wait --for=condition=Established crd --all --timeout=120s + touch $@ + +$(CS)/image.loaded: $(IS)/controller.id $(CS)/ready + $(KIND) load docker-image $(E2E_LOCAL_IMAGE) --name $(CLUSTER_FROM_CTX) + touch $@ + +$(CS)/controller.deployed: $(DEPLOY_INPUTS) + mkdir -p $(CS) + @[ -z "$(PROJECT_IMAGE)" ] || $(CONTAINER_TOOL) pull $(E2E_IMAGE) + $(KIND) load docker-image $(E2E_IMAGE) --name $(CLUSTER_FROM_CTX) + kubectl --context $(CTX) delete validatingwebhookconfiguration \ + gitops-reverser-validating-webhook-configuration --ignore-not-found=true + cd config && $(KUSTOMIZE) edit set image gitops-reverser=$(E2E_IMAGE) + $(KUSTOMIZE) build config | kubectl --context $(CTX) apply -f - + @if [ -z "$(PROJECT_IMAGE)" ]; then \ + IMAGE_ID="$$(cat $(IS)/controller.id)"; \ + kubectl --context $(CTX) -n sut patch deploy/gitops-reverser --type=merge \ + -p "{\"spec\":{\"template\":{\"metadata\":{\"annotations\":{\"configbutler.ai/e2e-image-id\":\"$$IMAGE_ID\"}}}}}"; \ + fi + kubectl --context $(CTX) -n sut rollout status deploy/gitops-reverser --timeout=180s + touch $@ + +.PHONY: $(CS)/portforward.running +$(CS)/portforward.running: $(CS)/controller.deployed $(CS)/gitea.installed $(CS)/prometheus.installed + mkdir -p $(CS) + if curl -fsS http://localhost:13000/api/healthz >/dev/null 2>&1 && \ + curl -fsS http://localhost:19090/-/healthy >/dev/null 2>&1; then \ + echo "port-forwards already healthy, skipping restart"; \ + touch $@; exit 0; \ + fi + kubectl config use-context $(CTX) + bash test/e2e/scripts/setup-port-forwards.sh + curl -fsS http://localhost:13000/api/healthz >/dev/null || { echo "Gitea health check failed"; exit 1; } + curl -fsS http://localhost:19090/-/healthy >/dev/null || { echo "Prometheus health check failed"; exit 1; } + touch $@ + 
+$(CS)/e2e.passed: $(CS)/portforward.running $(CS)/age-key.applied $(shell find test/e2e -name '*.go') + mkdir -p $(CS) + kubectl --context $(CTX) delete crd icecreamorders.shop.example.com --ignore-not-found=true + KIND_CLUSTER=$(CLUSTER_FROM_CTX) PROJECT_IMAGE=$(E2E_IMAGE) \ + E2E_AGE_KEY_FILE=$(CS)/age-key.txt \ + go test ./test/e2e/ -v -ginkgo.v + touch $@ + +##@ E2E Quickstart Tests -.PHONY: setup-cert-manager -setup-cert-manager: - @echo "🚀 Setting up cert-manager..." - @$(KUBECTL) apply -f $(CERT_MANAGER_MANIFEST_URL) | grep -v "unchanged" +.PHONY: test-e2e-quickstart-helm +test-e2e-quickstart-helm: ## Run quickstart smoke test (Helm install) - always starts with a clean cluster + $(MAKE) cleanup-cluster KIND_CLUSTER=$(KIND_CLUSTER_QUICKSTART_HELM) + $(MAKE) CTX=$(KUBECONTEXT_QS_HELM) \ + .stamps/cluster/$(KUBECONTEXT_QS_HELM)/cert-manager.installed \ + .stamps/cluster/$(KUBECONTEXT_QS_HELM)/gitea.installed + @if [ -z "$(PROJECT_IMAGE)" ]; then \ + $(MAKE) CTX=$(KUBECONTEXT_QS_HELM) \ + .stamps/cluster/$(KUBECONTEXT_QS_HELM)/image.loaded; \ + fi + kubectl config use-context $(KUBECONTEXT_QS_HELM) + PROJECT_IMAGE=$(E2E_IMAGE) \ + bash test/e2e/scripts/run-quickstart.sh helm -.PHONY: setup-port-forwards -setup-port-forwards: ## Start all port-forwards in background - @bash test/e2e/scripts/setup-port-forwards.sh +.PHONY: test-e2e-quickstart-manifest +test-e2e-quickstart-manifest: ## Run quickstart smoke test (manifest install) - always starts with a clean cluster + $(MAKE) cleanup-cluster KIND_CLUSTER=$(KIND_CLUSTER_QUICKSTART_MANIFEST) + $(MAKE) CTX=$(KUBECONTEXT_QS_MANIFEST) \ + .stamps/cluster/$(KUBECONTEXT_QS_MANIFEST)/cert-manager.installed \ + .stamps/cluster/$(KUBECONTEXT_QS_MANIFEST)/gitea.installed + @if [ -z "$(PROJECT_IMAGE)" ]; then \ + $(MAKE) build-installer; \ + $(MAKE) CTX=$(KUBECONTEXT_QS_MANIFEST) \ + .stamps/cluster/$(KUBECONTEXT_QS_MANIFEST)/image.loaded; \ + fi + kubectl config use-context $(KUBECONTEXT_QS_MANIFEST) + 
PROJECT_IMAGE=$(E2E_IMAGE) \ + bash test/e2e/scripts/run-quickstart.sh manifest .PHONY: cleanup-port-forwards cleanup-port-forwards: ## Stop all port-forwards @@ -265,82 +391,3 @@ cleanup-gitea-e2e: cleanup-port-forwards ## Clean up Gitea e2e environment @$(HELM) uninstall gitea --namespace $(GITEA_NAMESPACE) 2>/dev/null || true @$(KUBECTL) delete namespace $(GITEA_NAMESPACE) 2>/dev/null || true @echo "✅ Gitea cleanup completed" - -.PHONY: setup-prometheus-e2e -setup-prometheus-e2e: ## Set up Prometheus for e2e metrics testing - @echo "🚀 Setup Prometheus for e2e testing..." - @bash test/e2e/scripts/setup-prometheus.sh - -.PHONY: cleanup-prometheus-e2e -cleanup-prometheus-e2e: ## Clean up Prometheus e2e environment - @echo "🧹 Cleaning up Prometheus e2e environment..." - @$(KUBECTL) delete -f test/e2e/prometheus/deployment.yaml --ignore-not-found=true - @$(KUBECTL) delete -f test/e2e/prometheus/rbac.yaml --ignore-not-found=true - @$(KUBECTL) delete namespace prometheus-e2e --ignore-not-found=true - @echo "✅ Prometheus cleanup completed" - -.PHONY: setup-e2e -setup-e2e: setup-cert-manager setup-gitea-e2e setup-prometheus-e2e ## Setup all e2e test infrastructure - @echo "✅ E2E infrastructure initialized" - -.PHONY: wait-cert-manager -wait-cert-manager: ## Wait for cert-manager pods to become ready - @echo "⏳ Waiting for cert-manager deployments to become available (timeout=$(CERT_MANAGER_WAIT_TIMEOUT))..." - @set -e; \ - for dep in cert-manager cert-manager-webhook cert-manager-cainjector; do \ - echo " - waiting for deployment/$$dep"; \ - if ! 
$(KUBECTL) -n cert-manager rollout status deploy/$$dep --timeout=$(CERT_MANAGER_WAIT_TIMEOUT); then \ - echo "❌ Timed out waiting for cert-manager readiness (deployment=$$dep)"; \ - echo "📋 cert-manager deployments and pods:"; \ - $(KUBECTL) -n cert-manager get deploy,pod -o wide || true; \ - echo "📋 recent cert-manager events:"; \ - $(KUBECTL) -n cert-manager get events --sort-by=.metadata.creationTimestamp | tail -n 80 || true; \ - echo "📋 recent cert-manager logs:"; \ - $(KUBECTL) -n cert-manager logs -l app.kubernetes.io/instance=cert-manager --all-containers=true --tail=120 || true; \ - exit 1; \ - fi; \ - done - @echo "✅ cert-manager is ready" - -.PHONY: check-cert-manager -check-cert-manager: wait-cert-manager ## Explicit readiness check for cert-manager - @echo "✅ cert-manager check passed" - -## Smoke test: install from local Helm chart and validate first quickstart flow -.PHONY: test-e2e-quickstart -test-e2e-quickstart: ## Install + quickstart smoke with E2E_INSTALL_MODE=helm|manifest - @MODE="$(E2E_INSTALL_MODE)"; \ - if [ "$$MODE" != "helm" ] && [ "$$MODE" != "manifest" ]; then \ - echo "❌ Invalid E2E_INSTALL_MODE='$$MODE' (expected: helm|manifest)"; \ - exit 1; \ - fi; \ - PROJECT_IMAGE_VALUE="$(PROJECT_IMAGE)"; \ - if [ -n "$$PROJECT_IMAGE_VALUE" ]; then \ - echo "ℹ️ Entry point selected pre-built image (probably running in CI): $$PROJECT_IMAGE_VALUE"; \ - echo "ℹ️ Skipping cluster cleanup for pre-built image path"; \ - KIND_CLUSTER=$(KIND_CLUSTER) $(MAKE) setup-cluster setup-e2e check-cert-manager; \ - else \ - PROJECT_IMAGE_VALUE="$(E2E_LOCAL_IMAGE)"; \ - echo "🧹 Local fallback path: cleaning cluster to test a clean install"; \ - KIND_CLUSTER=$(KIND_CLUSTER) $(MAKE) cleanup-cluster; \ - echo "ℹ️ Entry point selected local fallback image: $$PROJECT_IMAGE_VALUE"; \ - KIND_CLUSTER=$(KIND_CLUSTER) PROJECT_IMAGE="$$PROJECT_IMAGE_VALUE" $(MAKE) setup-cluster setup-e2e check-cert-manager e2e-build-load-image; \ - fi; \ - echo "ℹ️ Running install quickstart 
smoke mode: $$MODE"; \ - PROJECT_IMAGE="$$PROJECT_IMAGE_VALUE" bash test/e2e/scripts/run-quickstart.sh "$$MODE"; \ - -## Smoke test: install from local Helm chart and validate first quickstart flow -.PHONY: test-e2e-quickstart-helm -test-e2e-quickstart-helm: - @$(MAKE) test-e2e-quickstart E2E_INSTALL_MODE=helm PROJECT_IMAGE="$(PROJECT_IMAGE)" KIND_CLUSTER="$(KIND_CLUSTER)" - -## Smoke test: install from generated dist/install.yaml and validate first quickstart flow -.PHONY: test-e2e-quickstart-manifest -test-e2e-quickstart-manifest: - @if [ -n "$(PROJECT_IMAGE)" ]; then \ - echo "ℹ️ test-e2e-quickstart-manifest using existing artifact (PROJECT_IMAGE set, CI/pre-built path)"; \ - else \ - echo "ℹ️ test-e2e-quickstart-manifest local path: regenerating dist/install.yaml via build-installer"; \ - $(MAKE) build-installer; \ - fi - @$(MAKE) test-e2e-quickstart E2E_INSTALL_MODE=manifest PROJECT_IMAGE="$(PROJECT_IMAGE)" KIND_CLUSTER="$(KIND_CLUSTER)" diff --git a/Tiltfile b/Tiltfile new file mode 100644 index 00000000..5aedefd0 --- /dev/null +++ b/Tiltfile @@ -0,0 +1,51 @@ +# Tilt inner loop for gitops-reverser on Kind (DOOD-friendly). +# This keeps the current Makefile flow and automates rebuild/load/restart. + +kind_cluster = os.getenv('KIND_CLUSTER', 'gitops-reverser-test-e2e') +expected_context = 'kind-' + kind_cluster + +allow_k8s_contexts(expected_context) + +current_context = str(k8s_context()) +if current_context != expected_context: + fail( + 'Tilt expects kube context "%s", but current context is "%s". Set KIND_CLUSTER or switch context before running tilt.' 
 + % (expected_context, current_context), + ) + +update_settings(max_parallel_updates=1) + +docker_build( + 'gitops-reverser:e2e-local', + '.', + dockerfile='Dockerfile', +) + +local_resource( + 'cluster-prereqs', + cmd='make CTX=%s .stamps/cluster/%s/cert-manager.installed .stamps/cluster/%s/gitea.installed .stamps/cluster/%s/prometheus.installed' % (expected_context, expected_context, expected_context, expected_context), + trigger_mode=TRIGGER_MODE_MANUAL, + auto_init=True, +) + +k8s_yaml(kustomize('config')) + +k8s_resource( + 'gitops-reverser', + resource_deps=['cluster-prereqs'], + port_forwards=[8443, 9443, 9444], +) + +local_resource( + 'port-forwards', + cmd='bash test/e2e/scripts/setup-port-forwards.sh', + trigger_mode=TRIGGER_MODE_MANUAL, + resource_deps=['cluster-prereqs'], +) + +local_resource( + 'make test-e2e', + cmd='make CTX=%s test-e2e' % expected_context, + trigger_mode=TRIGGER_MODE_MANUAL, + resource_deps=['gitops-reverser'], +) diff --git a/charts/gitops-reverser/templates/_helpers.tpl b/charts/gitops-reverser/templates/_helpers.tpl index 9a647acc..b0b20256 100644 --- a/charts/gitops-reverser/templates/_helpers.tpl +++ b/charts/gitops-reverser/templates/_helpers.tpl @@ -58,7 +58,9 @@ Create the name of the service account to use {{- define "gitops-reverser.serviceAccountName" -}} {{- if .Values.serviceAccount.create }} {{- default (include "gitops-reverser.fullname" .) 
.Values.serviceAccount.name }} +{{- else if .Values.serviceAccount.name }} +{{- .Values.serviceAccount.name }} {{- else }} -{{- default "default" .Values.serviceAccount.name }} +{{- fail "serviceAccount.name must be set when serviceAccount.create=false" }} +{{- end }} {{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/gitops-reverser/templates/rbac.yaml b/charts/gitops-reverser/templates/rbac.yaml index a37fba09..4b6bd0d5 100644 --- a/charts/gitops-reverser/templates/rbac.yaml +++ b/charts/gitops-reverser/templates/rbac.yaml @@ -1,7 +1,7 @@ {{- if .Values.rbac.create -}} -{{- $roleYaml := .Files.Get "config/role.yaml" -}} +{{- $roleYaml := .Files.Get "config/cluster-role.yaml" -}} {{- if eq $roleYaml "" -}} -{{- fail "charts/gitops-reverser/config/role.yaml is missing. Run `make helm-sync` to regenerate chart RBAC artifacts before templating/installing." -}} +{{- fail "charts/gitops-reverser/config/cluster-role.yaml is missing. Run `make helm-sync` to regenerate chart RBAC artifacts before templating/installing." -}} {{- end -}} {{- $roleFile := $roleYaml | fromYaml }} @@ -25,53 +25,6 @@ roleRef: kind: ClusterRole name: {{ include "gitops-reverser.fullname" . }}-manager-role subjects: -- kind: ServiceAccount - name: {{ include "gitops-reverser.serviceAccountName" . }} - namespace: {{ .Release.Namespace }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ include "gitops-reverser.fullname" . }}-metrics-reader - labels: - {{- include "gitops-reverser.labels" . | nindent 4 }} -rules: -- nonResourceURLs: - - "/metrics" - verbs: - - get ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ include "gitops-reverser.fullname" . }}-proxy-role - labels: - {{- include "gitops-reverser.labels" . 
| nindent 4 }} -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: {{ include "gitops-reverser.fullname" . }}-proxy-rolebinding - labels: - {{- include "gitops-reverser.labels" . | nindent 4 }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "gitops-reverser.fullname" . }}-proxy-role -subjects: - kind: ServiceAccount name: {{ include "gitops-reverser.serviceAccountName" . }} namespace: {{ .Release.Namespace }} diff --git a/charts/gitops-reverser/templates/serviceaccount.yaml b/charts/gitops-reverser/templates/serviceaccount.yaml index d2cc5198..cf31405c 100644 --- a/charts/gitops-reverser/templates/serviceaccount.yaml +++ b/charts/gitops-reverser/templates/serviceaccount.yaml @@ -10,5 +10,4 @@ metadata: annotations: {{- toYaml . 
| nindent 4 }} {{- end }} -automountServiceAccountToken: true {{- end }} \ No newline at end of file diff --git a/charts/gitops-reverser/values.yaml b/charts/gitops-reverser/values.yaml index 620e3164..f87b2749 100644 --- a/charts/gitops-reverser/values.yaml +++ b/charts/gitops-reverser/values.yaml @@ -146,10 +146,10 @@ rbac: resources: limits: cpu: 500m - memory: 128Mi + memory: 256Mi requests: cpu: 10m - memory: 64Mi + memory: 128Mi # Node selection nodeSelector: {} diff --git a/cmd/main.go b/cmd/main.go index 3f147bf8..0828ffa0 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -341,9 +341,9 @@ func parseFlagsWithArgs(fs *flag.FlagSet, args []string) (appConfig, error) { fs.DurationVar(&cfg.auditIdleTimeout, "audit-idle-timeout", defaultAuditIdleTimeout, "Idle timeout for the dedicated audit ingress HTTPS server.") cfg.zapOpts = zap.Options{ - Development: true, - // Enable more detailed logging for debugging - Level: zapcore.InfoLevel, // Change to DebugLevel for even more verbose output + // Production mode defaults to JSON encoding, which is easier for log processors to parse. + Development: false, + Level: zapcore.InfoLevel, } cfg.zapOpts.BindFlags(fs) diff --git a/cmd/main_audit_server_test.go b/cmd/main_audit_server_test.go index a644eab9..c81b36d6 100644 --- a/cmd/main_audit_server_test.go +++ b/cmd/main_audit_server_test.go @@ -42,6 +42,7 @@ func TestParseFlagsWithArgs_Defaults(t *testing.T) { assert.Equal(t, 15*time.Second, cfg.auditReadTimeout) assert.Equal(t, 30*time.Second, cfg.auditWriteTimeout) assert.Equal(t, 60*time.Second, cfg.auditIdleTimeout) + assert.False(t, cfg.zapOpts.Development) } func TestParseFlagsWithArgs_AuditUnsecure(t *testing.T) { diff --git a/config/README.md b/config/README.md index 961026ae..8c1b1f6a 100644 --- a/config/README.md +++ b/config/README.md @@ -15,3 +15,5 @@ The Helm chart is the recommended installation and lifecycle management path for ## Notes - These manifests are opinionated toward the local/e2e setup. 
- Keep them simple and explicit; avoid reintroducing heavy kustomize indirection here. +- `config/monitoring/` contains Prometheus Operator e2e monitoring resources + (Prometheus + ServiceMonitor + scrape RBAC). diff --git a/config/deployment.yaml b/config/deployment.yaml index eddd54f5..e52e56c4 100644 --- a/config/deployment.yaml +++ b/config/deployment.yaml @@ -4,22 +4,22 @@ metadata: labels: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: gitops-reverser - control-plane: controller-manager - name: gitops-reverser-controller-manager + control-plane: gitops-reverser + name: gitops-reverser namespace: sut spec: replicas: 1 selector: matchLabels: app.kubernetes.io/name: gitops-reverser - control-plane: controller-manager + control-plane: gitops-reverser template: metadata: annotations: kubectl.kubernetes.io/default-container: manager labels: app.kubernetes.io/name: gitops-reverser - control-plane: controller-manager + control-plane: gitops-reverser spec: containers: - args: @@ -66,10 +66,10 @@ spec: resources: limits: cpu: 500m - memory: 128Mi + memory: 256Mi requests: cpu: 10m - memory: 64Mi + memory: 128Mi securityContext: allowPrivilegeEscalation: false capabilities: @@ -89,7 +89,7 @@ spec: runAsNonRoot: true seccompProfile: type: RuntimeDefault - serviceAccountName: gitops-reverser-controller-manager + serviceAccountName: gitops-reverser terminationGracePeriodSeconds: 10 volumes: - emptyDir: {} diff --git a/config/kustomization.yaml b/config/kustomization.yaml index 570f21c2..d544f7fa 100644 --- a/config/kustomization.yaml +++ b/config/kustomization.yaml @@ -8,6 +8,7 @@ resources: - deployment.yaml - certs - webhook.yaml +- monitoring images: - name: gitops-reverser newName: gitops-reverser diff --git a/config/monitoring/kustomization.yaml b/config/monitoring/kustomization.yaml new file mode 100644 index 00000000..d63c162e --- /dev/null +++ b/config/monitoring/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: 
Kustomization +namespace: prometheus-operator +resources: + - prometheus-rbac.yaml + - prometheus.yaml + - servicemonitor.yaml diff --git a/config/monitoring/prometheus-rbac.yaml b/config/monitoring/prometheus-rbac.yaml new file mode 100644 index 00000000..75e98ac7 --- /dev/null +++ b/config/monitoring/prometheus-rbac.yaml @@ -0,0 +1,46 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: gitops-reverser-prometheus-scraper +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + - namespaces + verbs: + - get + - list + - watch + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - get + - list + - watch + - nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: gitops-reverser-prometheus-scraper +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: gitops-reverser-prometheus-scraper +subjects: + - kind: ServiceAccount + name: prometheus + namespace: prometheus-operator diff --git a/config/monitoring/prometheus.yaml b/config/monitoring/prometheus.yaml new file mode 100644 index 00000000..c3753897 --- /dev/null +++ b/config/monitoring/prometheus.yaml @@ -0,0 +1,27 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: gitops-reverser + labels: + monitoring.configbutler.ai/instance: gitops-reverser +spec: + replicas: 1 + serviceAccountName: prometheus + retention: 2h + scrapeInterval: 5s + evaluationInterval: 5s + serviceMonitorSelector: + matchLabels: + monitoring.configbutler.ai/instance: gitops-reverser + serviceMonitorNamespaceSelector: {} + podMonitorSelector: {} + podMonitorNamespaceSelector: {} + ruleSelector: {} + ruleNamespaceSelector: {} + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi diff --git 
a/config/monitoring/servicemonitor.yaml b/config/monitoring/servicemonitor.yaml new file mode 100644 index 00000000..6f095275 --- /dev/null +++ b/config/monitoring/servicemonitor.yaml @@ -0,0 +1,20 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: gitops-reverser + labels: + monitoring.configbutler.ai/instance: gitops-reverser +spec: + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - sut + selector: + matchLabels: + app.kubernetes.io/name: gitops-reverser + endpoints: + - port: metrics + path: /metrics + scheme: http + interval: 5s + scrapeTimeout: 4s diff --git a/config/namespace.yaml b/config/namespace.yaml index 14e972db..a7973808 100644 --- a/config/namespace.yaml +++ b/config/namespace.yaml @@ -4,5 +4,5 @@ metadata: labels: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: gitops-reverser - control-plane: controller-manager + control-plane: gitops-reverser name: sut diff --git a/config/rbac/gitops-reverser-manager-role.yaml b/config/rbac/cluster-role.yaml similarity index 82% rename from config/rbac/gitops-reverser-manager-role.yaml rename to config/rbac/cluster-role.yaml index 04b78f14..799c2512 100644 --- a/config/rbac/gitops-reverser-manager-role.yaml +++ b/config/rbac/cluster-role.yaml @@ -1,12 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: gitops-reverser-manager-role + name: gitops-reverser rules: - apiGroups: - - "" + - '*' resources: - - namespaces + - '*' verbs: - get - list @@ -17,17 +17,8 @@ rules: - secrets verbs: - create - - get - - list - - watch -- apiGroups: - - '*' - resources: - - '*' - verbs: - - get - - list - - watch + - patch + - update - apiGroups: - configbutler.ai resources: diff --git a/config/rbac/gitops-reverser-demo-jane-access.yaml b/config/rbac/cluster-rolebinding-jane.yaml similarity index 100% rename from config/rbac/gitops-reverser-demo-jane-access.yaml rename to config/rbac/cluster-rolebinding-jane.yaml diff --git 
a/config/rbac/gitops-reverser-manager-rolebinding.yaml b/config/rbac/cluster-rolebinding.yaml similarity index 70% rename from config/rbac/gitops-reverser-manager-rolebinding.yaml rename to config/rbac/cluster-rolebinding.yaml index 5eb0f83e..ed39923a 100644 --- a/config/rbac/gitops-reverser-manager-rolebinding.yaml +++ b/config/rbac/cluster-rolebinding.yaml @@ -4,12 +4,12 @@ metadata: labels: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: gitops-reverser - name: gitops-reverser-manager-rolebinding + name: gitops-reverser roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: gitops-reverser-manager-role + name: gitops-reverser subjects: - kind: ServiceAccount - name: gitops-reverser-controller-manager + name: gitops-reverser namespace: sut diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index f88dc735..7b51eb49 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -1,10 +1,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - # Service account used by the controller manager deployment. - - gitops-reverser-controller-manager.yaml - # Main runtime permissions for the controller. - - gitops-reverser-manager-role.yaml - - gitops-reverser-manager-rolebinding.yaml - # E2E-only helper: allows impersonated user jane@acme.com writes in tests. - - gitops-reverser-demo-jane-access.yaml + - sa.yaml + - cluster-role.yaml + - cluster-rolebinding.yaml + - cluster-rolebinding-jane.yaml # E2E-only helper: allows impersonated user jane@acme.com writes in tests. 
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index f1fec8b7..6e1b5804 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -2,7 +2,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: manager-role + name: gitops-reverser rules: - apiGroups: - "" diff --git a/config/rbac/gitops-reverser-controller-manager.yaml b/config/rbac/sa.yaml similarity index 78% rename from config/rbac/gitops-reverser-controller-manager.yaml rename to config/rbac/sa.yaml index d726578e..4ea67633 100644 --- a/config/rbac/gitops-reverser-controller-manager.yaml +++ b/config/rbac/sa.yaml @@ -4,5 +4,5 @@ metadata: labels: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: gitops-reverser - name: gitops-reverser-controller-manager + name: gitops-reverser namespace: sut diff --git a/config/service.yaml b/config/service.yaml index 8636109e..7c78d243 100644 --- a/config/service.yaml +++ b/config/service.yaml @@ -23,4 +23,4 @@ spec: targetPort: 8443 selector: app.kubernetes.io/name: gitops-reverser - control-plane: controller-manager + control-plane: gitops-reverser diff --git a/docs/a-tree.md b/docs/a-tree.md new file mode 100644 index 00000000..15e95bba --- /dev/null +++ b/docs/a-tree.md @@ -0,0 +1,416 @@ +Testing with pre-requisites + +I'm intrigued by how fast you can test and build stuff with AI: but it's also very hard to get the prerequisites right. + +How about we really learn something about Makefiles, Kubernetes and combining the two? + +A Makefile is a smart dependency tree: one can define for each task which other tasks need to be done first. There is a special '.PHONY' command to always run a task when it's requested. In normal circumstances you would include the actual file names that a task depends on. As long as these files are not touched the step is not run again. This is very logical for compilers and linkers, which is why this tool has been so popular for years in the C era.
+ +In the Kubernetes world this is a bit harder: but we could save a little 'state' file per make step. The special cluster cleanup make step would also remove the other 'state' files. + +This will allow us to define better when and how we should be waiting on things: especially the port forwarder task/script is an interesting example: it really does need a running pod. It's hard to guarantee that a pod is running, but we can guarantee that something at least has been defined as a deployment (by defining the spec). + +golang source files -> create image -> deploy image -> create port forward + -> run compiler + -> run linter + +golang test files -> run compiler -> run tests + +kustomize config manifests -> replace image version -> apply kubectl -f + +Now the question is where this breaks: I believe that make only allows us to work backwards. A change in a golang source file shouldn't/can't trigger a state file change? Or should it? And can we then include that as a dependency as well? + +image-state (source files changed, digest?) + +Which actions would qualify to move to make: + +create-cluster +delete-cluster + +setup-gitea +setup-prometheus (run both the operator and prometheus instance) +setup-cert-manager +forward-ports (interesting case: can we also check if they are alive somehow?) +create-age-secret +create-image +deploy-config +deploy-helm + +lint +test +test-e2e + + + +---- + +Testing with prerequisites: using Make for Kubernetes operator dev loops + +AI makes it tempting to “just run tests” all the time. But the real speed comes from something more boring: not rerunning steps that don’t need to run. + +That’s why Make is still useful. It can turn your build-and-test workflow into a small dependency graph where steps only rerun when their inputs changed. + +For Kubernetes operators, this gets tricky because part of your workflow happens in a live cluster.
The good news: you can still make it work if you treat “cluster state” as something you prove locally using small stamp files. + +What Make actually does + +A Makefile is a set of rules. Each rule looks like this: + +target: prerequisites + recipe + +target is usually a file you want to create. + +prerequisites are files (or other targets) that must be up-to-date first. + +recipe is the shell commands to run. + +The key behavior: + +If the target file doesn’t exist, Make runs the recipe. + +If any prerequisite file is newer than the target, Make runs the recipe. + +Otherwise, Make skips the recipe. + +That’s it. No magic, just file timestamps. + +A tiny example (real file target) +build/app: main.c + cc -o build/app main.c + +If main.c didn’t change since last time, build/app is “up-to-date” and Make won’t relink. + +.PHONY targets (always run) + +Some targets are not files. They’re just convenient names like test or lint. For those you use .PHONY: + +.PHONY: test +test: + go test ./... + +This tells Make: “don’t look for a file named test; always run the commands when someone asks for make test.” + +The Kubernetes problem + +In operator development, a lot of steps don’t naturally create a local file: + +“Create a kind cluster” + +“Apply CRDs” + +“Deploy controller” + +“Wait for rollout” + +“Port-forward to the service” + +Make can’t “see” cluster state. If there’s no file target, it can’t decide what’s up-to-date. + +The bridge pattern: stamp files + +The trick is to create a small local file that means: + +“This step completed successfully for these inputs.” + +Example: + +.stamps/cluster/kind-myop/ready + +.stamps/cluster/kind-myop/crds.applied + +.stamps/cluster/kind-myop/operator.applied + +.stamps/cluster/kind-myop/e2e.passed + +Now each Kubernetes step becomes a normal Make target that produces a normal file. Make can reason about it. 
+ +Why stamps work + +If you only touch the stamp after a real verification (like kubectl wait), then: + +stamp present → step already succeeded + +stamp older than inputs → rerun step + +Example Makefile: operator dev loop with stamps + +Below is a minimal pattern that works well for kind + kustomize-based operators. + +1) Define your “phony entrypoint” +.PHONY: e2e +e2e: .stamps/cluster/kind-gitops-reverser/e2e.passed + +make e2e doesn’t run tests directly. It asks Make to produce the file e2e.passed. Make then works backwards and builds everything it needs. + +That’s how “sub-steps” work in Make: they’re not nested actions, they’re dependencies. + +2) Build an image and write a digest file + +This is the “digest trick”. + +Problem: tags like :dev are mutable. You can’t reliably tell if the image changed. + +Solution: write an immutable identifier (digest) to a file. + +SOURCES := $(shell find cmd internal pkg -type f -name '*.go') go.mod go.sum +IMG_REPO ?= ghcr.io/you/gitops-reverser +IMG_TAG ?= dev +IMG := $(IMG_REPO):$(IMG_TAG) + +.stamps/image/controller.digest: $(SOURCES) Dockerfile + mkdir -p .stamps/image + docker build -t $(IMG) . + docker inspect --format='{{index .RepoDigests 0}}' $(IMG) > $@ + +What this gives you: + +If Go code changes → this target reruns → digest file updates. + +If nothing changed → digest file is up-to-date → Make skips the build. + +And because the digest is immutable, it’s also a perfect input for deployment. + +3) Ensure the cluster exists +KIND_CLUSTER ?= gitops-reverser +KUBECONTEXT ?= kind-$(KIND_CLUSTER) +CLUSTER_STAMPS := .stamps/cluster/$(KUBECONTEXT) + +$(CLUSTER_STAMPS)/ready: hack/kind-up.sh + mkdir -p $(CLUSTER_STAMPS) + KIND_CLUSTER=$(KIND_CLUSTER) ./hack/kind-up.sh + kubectl --context $(KUBECONTEXT) get ns >/dev/null + touch $@ + +Note the pattern: create cluster, then verify (kubectl get ns), then touch the stamp. 
+ +4) Apply CRDs and verify they’re established +$(CLUSTER_STAMPS)/crds.applied: $(CLUSTER_STAMPS)/ready $(shell find config/crd -type f) + kubectl --context $(KUBECONTEXT) apply -k config/crd + kubectl --context $(KUBECONTEXT) wait --for=condition=Established crd --all --timeout=120s + touch $@ + +If you change any CRD manifest file, Make sees it and reruns this step. + +5) Deploy the controller using the digest + +This is where the digest trick pays off: deploy by digest, not tag. + +$(CLUSTER_STAMPS)/operator.applied: \ + $(CLUSTER_STAMPS)/crds.applied \ + .stamps/image/controller.digest \ + $(shell find config/manager config/rbac config/default -type f) + + IMG_DIGEST=$$(cat .stamps/image/controller.digest) ; \ + kustomize build config/default | \ + sed "s#$(IMG_REPO):.*#$$IMG_DIGEST#g" | \ + kubectl --context $(KUBECONTEXT) apply -f - + + kubectl --context $(KUBECONTEXT) -n system rollout status deploy/controller-manager --timeout=180s + touch $@ + +Now deployment reruns when: + +the image digest changes (code changed) + +or manifests changed (RBAC/manager/config) + +or CRDs changed (because it depends on crds.applied) + +6) Run E2E tests and stamp success +$(CLUSTER_STAMPS)/e2e.passed: $(CLUSTER_STAMPS)/operator.applied test/e2e/*.go + KUBECONTEXT=$(KUBECONTEXT) go test ./test/e2e -v + touch $@ + +Now make e2e is incremental: + +Change Go code → image rebuild → redeploy → rerun tests + +Change only tests → rerun tests + +Change only CRDs → reapply CRDs → redeploy → rerun tests + +Change nothing → everything skipped (fast!) + +What about port-forward? + +Port-forward is a special case because it’s not a stable build artifact. It’s a long-running process. 
+ +Two approaches: + +Option A: keep it .PHONY (simple) + +Always start it when asked: + +.PHONY: port-forward +port-forward: $(CLUSTER_STAMPS)/operator.applied + kubectl --context $(KUBECONTEXT) -n system port-forward deploy/controller-manager 8080:8080 +Option B: stamp + PID file (more complex) + +Write a PID file and verify the process is still alive before claiming it’s up-to-date. This is doable, but for most teams option A is enough. + +Where this approach breaks + +Stamps assume the world doesn’t change behind Make’s back. Kubernetes can drift: + +someone deletes the cluster manually + +the deployment is edited manually + +the controller crashes after the stamp was written + +kubecontext changes + +You can mitigate this in two ways: + +Namespace stamps by kubecontext (as shown) + +Only write stamps after real verification (kubectl wait, rollout status) + +If you need extra safety, add a quick “is it still true?” check before skipping (for example: confirm the deployed image digest matches your digest file) + +The main takeaway + +Make is not “tasks”. It’s a dependency graph over artifacts. + +For Kubernetes operator workflows, you can still use Make effectively if you: + +make each cluster step produce a local stamp file + +make stamps depend on the real inputs (manifests, tests, scripts) + +use an image digest file as the stable bridge between “built code” and “deployed controller” + +That gives you fast, predictable inner loops: change one thing, rerun only what truly depends on it. + +If you want, paste your current Make targets (image build + kind + deploy + e2e), and I’ll adapt this pattern to your repo structure (operator-sdk layout vs custom, kind vs k3d, kustomize vs helm) without adding more complexity than necessary. + +O +I'm very curious to how you would do that PID trick with a port forwarding. What construct would you need to pull that off? + +You can do it, but you have to accept one hard truth: Make isn’t a process supervisor. 
So the “PID trick” is basically: start port-forward in the background, record its PID, and on the next make port-forward check if that PID is still alive (and ideally still the right command). If it’s alive, skip; if not, start again. + +There are two workable constructs: + +A stamp file + PID file (most common) + +A stamp file whose recipe validates and only re-runs if validation fails (cleaner, but you still need the PID) + +Below is a solid pattern using a PID file, plus an optional health-check. + +Pattern: target file .stamps/.../portforward.running + .stamps/.../portforward.pid +Makefile snippet +# Config +KIND_CLUSTER ?= gitops-reverser +KUBECONTEXT ?= kind-$(KIND_CLUSTER) +NAMESPACE ?= system + +PF_LOCAL_PORT ?= 8080 +PF_REMOTE_PORT ?= 8080 + +CLUSTER_STAMPS := .stamps/cluster/$(KUBECONTEXT) +PF_PID := $(CLUSTER_STAMPS)/portforward.pid +PF_LOG := $(CLUSTER_STAMPS)/portforward.log + +.PHONY: port-forward port-forward-stop +port-forward: $(CLUSTER_STAMPS)/portforward.running + +# Start port-forward only if not already running +$(CLUSTER_STAMPS)/portforward.running: $(CLUSTER_STAMPS)/operator.applied + @mkdir -p $(CLUSTER_STAMPS) + @set -eu; \ + # If PID file exists and process is alive, consider it running and refresh the stamp. + if [ -f "$(PF_PID)" ] && kill -0 "$$(cat "$(PF_PID)")" 2>/dev/null; then \ + echo "port-forward already running (pid=$$(cat "$(PF_PID)"))"; \ + touch "$@"; \ + exit 0; \ + fi; \ + # Clean up stale pid file + rm -f "$(PF_PID)"; \ + echo "starting port-forward on localhost:$(PF_LOCAL_PORT) -> $(PF_REMOTE_PORT)"; \ + # Start in background, capture PID, write logs + ( kubectl --context "$(KUBECONTEXT)" -n "$(NAMESPACE)" \ + port-forward deploy/controller-manager "$(PF_LOCAL_PORT):$(PF_REMOTE_PORT)" \ + >"$(PF_LOG)" 2>&1 & echo $$! 
>"$(PF_PID)" ); \ + # Give it a moment and verify process is alive + sleep 0.5; \ + kill -0 "$$(cat "$(PF_PID)")" 2>/dev/null || (echo "port-forward failed, see $(PF_LOG)"; exit 1); \ + touch "$@" + +# Stop it explicitly +port-forward-stop: + @set -eu; \ + if [ -f "$(PF_PID)" ]; then \ + PID="$$(cat "$(PF_PID)")"; \ + if kill -0 "$$PID" 2>/dev/null; then \ + echo "stopping port-forward (pid=$$PID)"; \ + kill "$$PID"; \ + fi; \ + rm -f "$(PF_PID)" "$(CLUSTER_STAMPS)/portforward.running"; \ + else \ + echo "no port-forward pid file found"; \ + fi +What this gives you + +make port-forward: + +If the PID file exists and the process is alive → do nothing. + +Otherwise start a new kubectl port-forward and record PID. + +make port-forward-stop kills it and removes files. + +Make it safer: verify it’s actually listening on the port + +A PID being alive doesn’t guarantee the forward is working (pods can restart, tunnels can break). Add a lightweight check: + +Option A: check the local TCP port is open (Linux) + +Add this in the “already running” branch: + +if [ -f "$(PF_PID)" ] && kill -0 "$(cat "$(PF_PID)")" 2>/dev/null; then + if ss -ltn | grep -q ":$(PF_LOCAL_PORT) "; then + echo "port-forward already running and listening" + touch "$@" + exit 0 + fi +fi + +(You could use lsof -iTCP:PORT -sTCP:LISTEN too, but ss is usually present.) + +Option B: HTTP health probe (if your port exposes it) + +If your operator exposes /healthz: + +if curl -fsS "http://127.0.0.1:$(PF_LOCAL_PORT)/healthz" >/dev/null; then + touch "$@" + exit 0 +fi + +This is the best check if you have it. + +Why this works in Make terms + +The target is a file artifact: portforward.running. + +If it exists and is newer than prerequisites, Make would normally skip. + +But we’re also doing an extra runtime validation in the recipe (PID alive / port listening). + +If validation fails, we restart and then touch the target. 
+ +This is a slight “abuse” of Make (because runtime state isn’t purely captured by mtimes), but it’s practical. + +Two gotchas (don’t ignore these) + +Killing PID might not kill kubectl’s children in some shells +Usually it does, but if you see orphaned processes, you can start kubectl in its own process group and kill the group. That’s a bit more bash-y. + +PID reuse +It’s rare but possible: a PID file could point to a new unrelated process after reboot. That’s why adding a port check or command-line check is good. + +If you want command-line verification, you can do: + +ps -p "$PID" -o args= | grep -q "kubectl .*port-forward" + +If you tell me your environment (Linux/macOS, bash vs sh, kind vs k3d) and what you port-forward to (svc/deploy/pod + which ports), I’ll tailor the snippet to be robust and minimal for your setup. \ No newline at end of file diff --git a/docs/current-work-issue.md b/docs/current-work-issue.md new file mode 100644 index 00000000..e3600595 --- /dev/null +++ b/docs/current-work-issue.md @@ -0,0 +1,192 @@ +# E2E “second run” failure at `test/e2e/e2e_test.go:598` — investigation summary (2026-02-25) + +## Symptom you reported +- `make test-e2e` passes on a **fresh** Kind cluster / first run. +- Running `make test-e2e` (or `make test-e2e-encryption`) **again** in the same devcontainer (same cluster still running) fails reproducibly at: + - `test/e2e/e2e_test.go:598` + - inside `It("should commit encrypted Secret manifests when WatchRule includes secrets", ...)` + +## Latest findings (added 2026-02-25) +### Controller restart test (proves “pod lifetime” involvement) +We ran this sequence (same cluster, no cluster recreation): +1. `make test-e2e-encryption` (after a clean setup) → **PASS** +2. Restart controller only: `kubectl -n sut rollout restart deploy/gitops-reverser` → wait for rollout +3. 
`make test-e2e-encryption` again → **PASS** + +This strongly suggests the failure is tied to **controller pod lifetime state** (either in-memory state and/or container-local disk such as `/tmp`). + +### Deleting `/tmp/gitops-reverser-workers` inside the running pod (do not use as a workaround) +We attempted to discriminate “memory vs disk” by deleting the controller’s on-disk git cache while the controller stayed running: +- `rm -rf /tmp/gitops-reverser-workers` inside the controller pod + +Result: the next `make test-e2e-encryption` failed earlier in the same spec at line 598, but now because: +- `.sops.yaml` was **missing** in the test checkout: + - `.../e2e/secret-encryption-test/.sops.yaml: no such file or directory` +- controller logs showed repeated: + - `Failed to write event ... failed to open repository: repository does not exist` + +This is still useful evidence: +- the controller assumes the on-disk repo cache exists and does not reliably recreate it if removed +- deleting the directory mid-run introduces a new failure mode and is **not** a stable “cleanup” strategy + +### Decision: remove focused e2e selectors +We will remove the “focused” e2e Make targets/selectors (e.g. `-ginkgo.focus=...`) because the suite relies on behaviors/state that these focused runs accidentally depend on or interfere with (especially with ordered containers and shared resources). Running subsets has been confusing and can produce non-representative results compared to full `make test-e2e`. + +## What line 598 is actually doing +At `test/e2e/e2e_test.go:598`, the test does: + +1. Create `GitTarget` with path `e2e/secret-encryption-test` and a `WatchRule`. +2. Create + patch Secret `sut/test-secret-encryption` to force an update event. +3. `Eventually(...)`: + - `git pull` in the test checkout dir (e.g. `/tmp/gitops-reverser/e2e-test-416880`). 
+ - Read the committed file: + - `e2e/secret-encryption-test/v1/secrets/sut/test-secret-encryption.sops.yaml` + - Assert: + - it contains `sops:` + - it does **not** contain plaintext + - Read `.sops.yaml` at: + - `e2e/secret-encryption-test/.sops.yaml` + - Derive an age recipient from the local private key file: + - `/tmp/e2e-age-key.txt` + - Assert `.sops.yaml` contains that derived recipient. + - Decrypt the committed secret YAML by exec’ing `sops --decrypt` **inside the controller pod**, passing the age private key. + - If decryption fails, the Eventually fails at line 598. + +So line 598 is really “**repo contains an encrypted secret that is decryptable using the e2e-generated age key**”. + +## Key evidence we captured (the “smoking gun”) +On the failing **second run**, the checked-out repo directory for the new Gitea repo (example shown below) contains a mismatch: + +- `.sops.yaml` contains a **new** recipient (derived from the current `/tmp/e2e-age-key.txt`). +- but the committed secret file contains an **old** `sops.age[].recipient` from the previous run. + +Example from the failing run checkout: +- Checkout dir: `/tmp/gitops-reverser/e2e-test-416880` + +`.sops.yaml` (new recipient): +- `e2e/secret-encryption-test/.sops.yaml` contains: `age1wkkp...` + +Secret file (old recipient + old `lastmodified`): +- `e2e/secret-encryption-test/v1/secrets/sut/test-secret-encryption.sops.yaml` contains: `age13vmf...` +- `sops.lastmodified` stayed at the **previous run’s timestamp** (e.g. `2026-02-25T11:59:30Z`), which strongly indicates it was **not newly generated** this run. 
+ +We also verified the same mismatch exists **inside the controller pod’s local repo**: +- `/tmp/gitops-reverser-workers/sut/gitprovider-normal/main/e2e/secret-encryption-test/.sops.yaml` → new recipient +- `/tmp/gitops-reverser-workers/sut/gitprovider-normal/main/e2e/secret-encryption-test/v1/secrets/sut/test-secret-encryption.sops.yaml` → old recipient + +This mismatch leads to the decryption failure the test sees (typically: “no identity matched any of the recipients”). + +## What this implies (root cause direction) +This is not “the controller didn’t commit the secret”. +It’s “the controller committed **a secret encrypted to the wrong recipient**”. + +And the pattern matches “**stale repo state being reused across runs**”: + +- The controller uses a persistent on-disk working repo path: + - `/tmp/gitops-reverser-workers///` + - In e2e: `/tmp/gitops-reverser-workers/sut/gitprovider-normal/main` +- In e2e, the GitProvider URL changes every run because the test creates a unique repo name: + - e.g. `.../e2e-test-416879.git` → next run `.../e2e-test-416880.git` +- The controller *does* detect this and logs: + - “Updating remote origin URL … old … new …” +- But after switching to a new empty remote repo, the controller ends up pushing content that still contains files (or history) from the previous repo, including the old encrypted secret. + +In short: +- **remote URL changes** + **local repo directory is reused** + **cleanup/reset is incomplete** ⇒ stale encrypted secret gets into the new repo ⇒ `.sops.yaml` and secret recipients diverge ⇒ decrypt fails at line 598. + +## The “wanderings” / hypotheses we tried first (and why) +### 1) “Maybe deduplication drops events on run #2” +Because the failure looked like “secret isn’t updated”, we investigated dedup/state contamination in watcher/reconcile code. + +Changes made: +- Added `Key()` to `types.ResourceIdentifier` for stable fully-qualified identifiers. 
+- Switched several internal dedup/diff maps to use `Key()` instead of `String()`. +- Added cleanup when `GitTarget` is NotFound to remove in-memory state: + - `GitTargetReconciler.cleanupDeletedGitTarget(...)` calls: + - `EventRouter.UnregisterGitTargetEventStream(gitDest)` + - `ReconcilerManager.DeleteReconciler(gitDest)` (best-effort) + +Outcome: +- This helped tighten correctness, but **did not fix** the encryption failure at line 598. + +### 2) “Maybe stamp-based port-forwarding is stale” +We found a stamp target that could skip re-establishing port-forwards across runs. + +Change made: +- `Makefile`: added a phony `portforward-check` dependency so the `portforward.running` recipe executes every time (and it restarts port-forwards if health checks fail). + +Outcome: +- Improves developer ergonomics and reduces unrelated flakiness, but **not the root cause** of the encryption mismatch. + +## The git-cleanup fix we attempted (and why it wasn’t enough) +Given the evidence, we focused on the “empty remote/unborn branch” path. + +### What we changed +- In `internal/git/git.go`, `cleanWorktree()` used to call go-git’s `worktree.Clean(...)`. +- We replaced it with an explicit wipe of all top-level worktree entries except `.git`, using billy’s filesystem + `billyutil.RemoveAll`. +- Added unit test: + - `internal/git/git_operations_test.go`: `TestMakeHeadUnborn_CleansWorktreeIncludingTrackedFiles` + - It creates a repo with tracked + untracked files and asserts `makeHeadUnborn()` removes them. + +### What passed +- `make fmt` ✅ +- `make vet` ✅ +- `make lint` ✅ (after fixing a `funcorder` complaint by reordering methods in `internal/types/identifier.go`) +- `make test` ✅ +- `make test-e2e-encryption` on a fresh cluster ✅ (22/22 passed) + +### What still failed +- Running `make test-e2e-encryption` a second time (without deleting the cluster) **still failed** at `test/e2e/e2e_test.go:598` with the same fundamental symptom: `.sops.yaml` recipient != secret file recipient. 
+ +So: “wipe worktree contents” improved the code, but the reproduction indicates **something else still carries old content across the remote switch** (likely refs/history, or the repo not being recreated when origin URL changes). + +## Additional important observations +- The controller runtime image is distroless and does **not** contain `git`, so debugging inside the pod required inspecting the filesystem directly (not `git status/log`). +- `SmartFetch()` returns `""` on an empty remote and does **not** fetch/prune anything. That means old `refs/remotes/origin/*` may remain locally unless explicitly removed. +- `makeHeadUnborn()` currently removes `refs/heads/` but does **not** remove remote-tracking refs or reset the entire repository identity. +- The bootstrap code writes `.sops.yaml` **only if missing**. So a reliable “clean slate” is essential whenever the age recipient set changes between runs. + +## Resolution (final, 2026-02-25) +This ended up being **two** independent “state leaks across runs” that combine into the line-598 failure. + +### Root cause A: Secret encryption cache reused stale ciphertext across runs +`internal/git/content_writer.go` caches encrypted Secret bytes to avoid re-running SOPS. The cache key did **not** include +anything about the *encryption context* (repo/path/recipients/identities). When events lacked stable marker fields (UID / +resourceVersion), the cache could be hit across separate e2e runs even though the suite regenerated the age key. + +That produces the exact symptom we observed: +- `.sops.yaml` contains the *current* recipient +- the committed Secret file contains an *old* `sops.age[].recipient` (and old `sops.lastmodified`) + +Fix: +- Partition the encryption cache by a stable **scope** derived from `(provider, workDir, recipients, identities)` and + include it in the cache key, so ciphertext never crosses runs/paths/keys. +- Added a unit test asserting that changing scope forces re-encryption. 
+ +### Root cause B: BranchWorker repo identity and GitProvider snapshots +Separately, we tightened worker correctness around “repo identity” and runtime GitProvider changes: + +1. Local git repo paths are now keyed by remote URL hash + - New layout: `/tmp/gitops-reverser-workers////repos/` + - Prevents cross-run contamination when the remote URL changes. + +2. `GitProvider` is re-fetched before each commit/push batch + - `commitAndPush()` calls `getGitProvider()` each time instead of using a long-lived snapshot. + +3. Serialize `syncWithRemote()` with other git operations + - Added the repo mutex to `syncWithRemote()` to prevent concurrent ops on the same on-disk repo. + +### Supporting e2e robustness changes +- `Makefile`: force a controller rollout when building locally by patching a pod-template annotation with the built image ID. +- `test/e2e/e2e_test.go`: Prometheus metrics assertions use aggregated queries (`sum(...)`) to avoid rollout flake. + +### Validation (2026-02-25) +- `make lint` ✅ +- `make test` ✅ +- `make test-e2e` ✅ +- `make -B test-e2e` ✅ (confirms “second run” no longer fails) +- `make test-e2e-quickstart-manifest` ✅ +- `make test-e2e-quickstart-helm` ✅ + +Note: quickstart Helm validation can fail with “No space left on device” if the devcontainer is nearly full; deleting +unused Kind clusters and pruning Docker build cache resolves that. diff --git a/docs/design/multi-cluster-audit-ingestion-implications.md b/docs/design/multi-cluster-audit-ingestion-implications.md new file mode 100644 index 00000000..ed2eabb5 --- /dev/null +++ b/docs/design/multi-cluster-audit-ingestion-implications.md @@ -0,0 +1,316 @@ +# Multi-Cluster Audit Ingestion: Implications and Configuration Mapping + +## 1. 
Purpose
+
+This document describes what changes are implied by supporting audit ingestion from multiple clusters, how this maps to
+current `WatchRule` / `ClusterWatchRule` usage, and what new cluster connectivity model is needed for initial
+reconcile and CRD discovery.
+
+## 2. Current Baseline (Today)
+
+1. Audit ingress already supports a path contract with cluster identity:
+`/audit-webhook/<clusterID>` (`internal/webhook/audit_handler.go:231`, `charts/gitops-reverser/README.md:225`).
+2. `WatchRule` is namespaced and scoped to resources in its own namespace (`api/v1alpha1/watchrule_types.go:57`).
+3. `ClusterWatchRule` is cluster-scoped and can target both cluster-scoped and namespaced resources via `scope`
+(`api/v1alpha1/clusterwatchrule_types.go:59`, `api/v1alpha1/clusterwatchrule_types.go:113`).
+4. There is no first-class CRD yet for remote kube-apiserver connectivity (needed for seed reconcile and dynamic GVR
+planning per source cluster).
+
+## 3. Core Implications of Multi-Cluster Ingestion
+
+### 3.1 Identity and Routing
+
+- `clusterID` becomes a first-class identity dimension in every event and derived key.
+- Rule matching must evaluate `{clusterID, gvr, namespace, name, operation}`.
+- Dedupe keys and replay tools must include `clusterID` to avoid cross-cluster collisions.
+
+### 3.2 Rule Semantics
+
+- Current rule model is cluster-agnostic; multi-cluster requires source-cluster selection.
+- Without a source selector, one rule could unintentionally match events from all clusters.
+
+### 3.3 Security Model
+
+- Audit ingress needs authenticated cluster identity, not just path parsing.
+- Per-cluster policy and quotas are required (fairness and blast-radius control).
+- Remote kube-api credentials become sensitive assets requiring rotation and least privilege.
+
+### 3.4 Reconciliation and Discovery
+
+- Initial snapshot reconcile must run against each source cluster API.
+- New CRD installation discovery is per source cluster, not global.
+- Failures are per cluster and should not block processing for healthy clusters. + +### 3.5 Operational/HA Impact + +- Metrics/alerts require `clusterID` labels with cardinality controls. +- Backpressure, dead-letter, and lag need per-cluster visibility. +- Noisy cluster isolation is required to protect global throughput. + +## 4. Mapping to Existing Rule Model + +### 4.1 Recommendation for V1 (Pragmatic) + +Use `ClusterWatchRule` as the primary multi-cluster policy object with an optional source-cluster selector. +Keep `WatchRule` for single-cluster/local management-cluster use. + +Why this is pragmatic: + +1. `ClusterWatchRule` already supports both `Cluster` and `Namespaced` resource scopes. +2. It avoids redefining namespace semantics for remote clusters in a namespaced CRD. +3. It gives one place to express global policy and source-cluster targeting. + +### 4.2 Proposed Field Additions + +### A. `ClusterWatchRule` (recommended now) + +Add optional source selector: + +```yaml +spec: + source: + clusterIDs: ["prod-eu-1", "prod-us-1"] # empty/omitted = all onboarded clusters +``` + +### B. `WatchRule` (optional future extension) + +If needed later, add a constrained source selector: + +```yaml +spec: + source: + clusterIDs: ["dev-eu-1"] +``` + +For initial multi-cluster rollout, avoid this extension unless there is a hard requirement for namespace-scoped, +team-owned cross-cluster policies. 
+
+### 4.3 Mapping Examples
+
+| Current usage | Multi-cluster equivalent |
+|---|---|
+| `WatchRule` in namespace `team-a` for `configmaps` | Keep as-is for local cluster, or migrate to `ClusterWatchRule` with `scope: Namespaced` and `source.clusterIDs` |
+| `ClusterWatchRule` for CRDs/nodes | Add `source.clusterIDs` to scope clusters explicitly |
+| "Watch all namespaces" policy | `ClusterWatchRule` + `scope: Namespaced` + optional namespace filters + `source.clusterIDs` |
+
+### 4.4 Example Translation (WatchRule -> ClusterWatchRule)
+
+Current namespaced rule:
+
+```yaml
+apiVersion: configbutler.ai/v1alpha1
+kind: WatchRule
+metadata:
+  name: configmaps-team-a
+  namespace: team-a
+spec:
+  targetRef:
+    name: team-a-target
+  rules:
+    - operations: [CREATE, UPDATE, DELETE]
+      apiGroups: [""]
+      apiVersions: ["v1"]
+      resources: [configmaps]
+```
+
+Multi-cluster equivalent:
+
+```yaml
+apiVersion: configbutler.ai/v1alpha1
+kind: ClusterWatchRule
+metadata:
+  name: configmaps-team-a-multi-cluster
+spec:
+  targetRef:
+    name: team-a-target
+    namespace: team-a
+  source:
+    clusterIDs: ["prod-eu-1", "prod-us-1"]
+  rules:
+    - scope: Namespaced
+      operations: [CREATE, UPDATE, DELETE]
+      apiGroups: [""]
+      apiVersions: ["v1"]
+      resources: [configmaps]
+```
+
+### 4.5 GitTarget Provenance Model (Important)
+
+By default, a single `GitTarget` should represent **one source cluster provenance**.
+
+Why:
+
+1. Audit trails remain clear ("this file came from cluster X").
+2. Drift/debug workflows stay understandable.
+3. Operational blast radius is reduced.
+
+Recommendation:
+
+1. Do not mix multiple source clusters into one `GitTarget` by default.
+2. If mixing is ever needed, require explicit opt-in and enforce path partitioning per cluster (for example
+`clusters/<clusterID>/...`).
+
+## 5. Proposed New Connectivity Model
+
+Multi-cluster audit ingestion is not enough by itself. You also need control-plane connectivity to each source cluster
+for:
+
+1. 
Initial reconcile (list current resources) +2. CRD discovery / GVR planning +3. Periodic drift checks and orphan detection + +### 5.1 New CRD Proposal: `SourceCluster` + +Introduce a dedicated cluster onboarding CRD, e.g. `SourceCluster`: + +```yaml +apiVersion: configbutler.ai/v1alpha1 +kind: SourceCluster +metadata: + name: prod-eu-1 +spec: + clusterID: prod-eu-1 + auditIngress: + authMode: mTLS # or BearerToken + credentialSecretRef: + name: sourcecluster-prod-eu-1-audit + kubeAPI: + mode: Direct # Direct | Proxy | Disabled + kubeconfigSecretRef: + name: sourcecluster-prod-eu-1-kubeconfig + qps: 20 + burst: 40 + reconcile: + enabled: true + limits: + maxEventsPerSecond: 500 +status: + conditions: [] +``` + +### 5.2 Why Separate `SourceCluster` from Rules + +- Rules express "what to capture". +- `SourceCluster` expresses "where and how to connect". +- Separation improves security, ownership, and rotation workflows. + +### 5.3 Connectivity Modes for Kube API + +1. `Direct`: central controller connects directly to remote kube-apiserver using kubeconfig/credential secret. +2. `Proxy`: source-cluster agent or gateway exposes constrained API for list/discovery operations. +3. `Disabled`: no kube-api access for this source; audit ingest only (no snapshot/discovery from that cluster). + +For initial reconcile and CRD discovery, `Direct` or `Proxy` is required. + +### 5.4 Deployment Topology Options + +#### Option A: One GitOps Reverser per source cluster (simplest app model) + +Model: + +1. Deploy one controller per cluster (often inside that same cluster). +2. Configure kube-api endpoint at deployment level (default in-cluster, optional override). +3. No cluster-specific selection fields required in rule CRDs for that deployment. + +Implications: + +- Pros: lowest application complexity, clear ownership boundaries, simpler CRD model. +- Cons: fleet-level aggregation requires external orchestration; more deployments to manage. 
+ +Suggested deployment-level config knobs: + +- `kubeAPI.mode=inCluster|override` +- `kubeAPI.server=https://...` (used when `override`) +- `kubeAPI.authSecretRef=...` +- `cluster.identity=` (used in emitted metadata) + +#### Option B: Central hosting cluster with multi-cluster ingestion/control plane + +Model: + +1. One or more controllers run in a hosting cluster. +2. `SourceCluster` defines audit auth + kube-api connectivity for each source cluster. +3. Rules may target one or more `clusterIDs`. + +Implications: + +- Pros: centralized operations, single control plane, easier global governance. +- Cons: higher product complexity (cluster-aware routing, auth, fairness, isolation). + +#### Option C: Hybrid + +1. Local per-cluster deployments for most teams. +2. Central deployment only for selected clusters/use-cases. + +This can share the same CRDs, with `SourceCluster` used only where centralized mode is enabled. + +## 6. Event and Reconcile Flow with `SourceCluster` + +```text +Source Cluster A/B/C + -> POST /audit-webhook/ + -> authenticate via SourceCluster credentials + -> normalize event (clusterID required) + -> rule match (ClusterWatchRule + source selector) + -> enqueue to durable bus + -> writer lease by repo+branch partition + -> git commit/push + +Startup / rule change + -> list active SourceClusters + -> create per-cluster discovery + snapshot jobs + -> publish reconcile events tagged with clusterID + -> same write pipeline + status projection +``` + +## 7. Security and RBAC Implications + +1. `SourceCluster` CRUD should be restricted to platform admins. +2. Secrets for remote kube-api and audit auth should be namespace-local and tightly RBACed. +3. Add validation that `spec.clusterID` is unique and immutable. +4. Enforce explicit allow-listing of accepted `clusterID`s at ingress. + +## 8. Failure Modes and Required Behavior + +1. Source cluster audit down: continue ingest from other clusters. +2. 
Source cluster kube-api unreachable: skip snapshot/discovery for that cluster; keep live ingest if available. +3. Bad credentials for one cluster: mark that `SourceCluster` degraded; do not block others. +4. Per-cluster event spike: apply per-cluster quota and backpressure before global degradation. + +## 9. Suggested Implementation Paths + +### Path A (Per-cluster deployment first) + +1. Add deployment-level kube-api connectivity config (`inCluster` default + optional override server/auth). +2. Keep `WatchRule`/`ClusterWatchRule` semantics unchanged within each deployment. +3. Enforce one-cluster-per-`GitTarget` provenance at deployment level. +4. Add fleet docs/automation for managing many deployments. + +### Path B (Central multi-cluster control plane) + +1. Add `SourceCluster` CRD and controller (status + connectivity checks). +2. Add ingress auth that maps request to registered `SourceCluster`. +3. Add `source.clusterIDs` selector to `ClusterWatchRule`. +4. Add cluster-aware matching in rule compiler and event identity. +5. Add per-cluster snapshot/discovery workers using `SourceCluster.kubeAPI` credentials. +6. Add per-cluster lag/quota metrics and alerts. +7. Optionally evaluate `WatchRule` source selector later. + +### Recommendation + +If you want to reduce rewrite risk, start with **Path A** and keep the application model simple. +If centralized governance is the immediate requirement, implement **Path B** directly. + +## 10. Direct Answer to "ClusterWatchRule only?" + +For the first multi-cluster version: **yes, that is the cleanest approach**. + +- Use `ClusterWatchRule` with optional `source.clusterIDs`. +- Keep `WatchRule` unchanged for local/simple namespaced use. +- Revisit extending `WatchRule` only if multi-tenant namespace-owned cross-cluster policy becomes a strong requirement. + +Direct answer to your deployment thought: + +1. Yes, allowing in-cluster deployment (local kube-api by default) is sensible and should be first-class. +2. 
Yes, allowing deployment-level kube-api override is useful for a one-reverser-per-cluster model. +3. In centralized mode, `SourceCluster` is still required for remote connectivity/auth. diff --git a/docs/design/rewrite-from-scratch-recommendations.md b/docs/design/rewrite-from-scratch-recommendations.md new file mode 100644 index 00000000..4ff5eb90 --- /dev/null +++ b/docs/design/rewrite-from-scratch-recommendations.md @@ -0,0 +1,231 @@ +# GitOps Reverser Analysis and Rewrite Recommendations + +## 1. Executive Summary +The project has strong core ideas and good building blocks, but the orchestration layer is carrying too much complexity for its current reliability level. +If you rewrite from scratch, I would **keep the Git write engine + CRD model direction**, and **redo ingestion, reconciliation wiring, event identity, and lifecycle management**. + +Current baseline I validated: +- `make lint`: pass +- `make test`: pass +- Total unit coverage from `cover.out`: **52.6%** +- `make test-e2e`: not run in this pass + +## 2. High-Level Component Overview (Current) + +### Control Plane Components +1. **API/CRDs**: `GitProvider`, `GitTarget`, `WatchRule`, `ClusterWatchRule`. +2. **Controllers**: compile rules, validate targets/providers, bootstrap workers/streams. +3. **RuleStore**: in-memory compiled rule cache. +4. **Watch Manager**: dynamic discovery + informers + seed listing. +5. **Webhook Event Handler**: stores user attribution in correlation store. +6. **Correlation Store**: in-memory TTL/LRU bridge between webhook and watch paths. +7. **Event Router + Reconcile Manager**: route events and coordinate startup snapshot state. +8. **Git Worker Manager + Branch Workers**: branch-scoped buffering, commit, push. +9. **Audit Handler**: currently metrics/debug oriented, not main ingest path. 
+ +### Current Flow +`Kubernetes API` -> `validating webhook (correlation only)` + `watch informers (event source)` -> `event router` -> `GitTarget event stream` -> `branch worker` -> `git commit/push` + +Main wiring is in `cmd/main.go:130`, `cmd/main.go:185`, `cmd/main.go:206`. + +## 3. What Works Well (Keep) +1. **Branch-scoped serialization for Git writes** is a good concurrency model (`internal/git/worker_manager.go`, `internal/git/branch_worker.go`). +2. **Low-level atomic push logic** is valuable and reusable (`internal/git/git_atomic_push.go`). +3. **Sanitization and secret encryption hooks** are good foundations (`internal/sanitize/sanitize.go`, `internal/git/content_writer.go`). +4. **CRD separation of concerns** is directionally correct (`api/v1alpha1/gittarget_types.go`, `api/v1alpha1/gitprovider_types.go`, `api/v1alpha1/watchrule_types.go`). +5. **Lifecycle gate concept in `GitTarget`** is a solid UX contract (`internal/controller/gittarget_controller.go:480`). + +## 4. Main Risks and Architecture Gaps + +| Severity | Finding | Why it matters | Evidence | +|---|---|---|---| +| High | Resource identity keys omit namespace | Cross-namespace collisions in dedupe/reconcile keys | `internal/types/identifier.go:89`, `internal/watch/manager.go:299`, `internal/reconcile/folder_reconciler.go:173`, `internal/reconcile/git_target_event_stream.go:152` | +| High | Snapshot create/reconcile events can be dropped | Reconciler emits events without object; stream skips non-delete events without payload | `internal/reconcile/folder_reconciler.go:140`, `internal/reconcile/git_target_event_stream.go:161` | +| High | Snapshot cluster-state query is over-broad | Lists cluster-wide and accepts all objects for target | `internal/watch/manager.go:767`, `internal/watch/manager.go:800` | +| High | Informer stop lifecycle is incomplete | cancel func stored as no-op; stale informers risk | `internal/watch/manager.go:1277`, `internal/watch/manager.go:1136` | +| High | Namespace 
removal diffs not handled cleanly | compare logic removes only at GVR level, not per namespace | `internal/watch/manager.go:1110`, `internal/watch/manager.go:1034` | +| High | Event loss under pressure | queue full drops events; routing failures are logged and dropped | `internal/git/branch_worker.go:157`, `internal/watch/event_router.go:239`, `internal/watch/informers.go:129` | +| Medium | Rule expressiveness not fully implemented | wildcard and many non-concrete rules are skipped in watcher planning | `internal/watch/gvr.go:41`, `internal/watch/gvr.go:89`, `internal/watch/manager.go:693` | +| Medium | Ignore-filter not implemented | noisy/system resources are not excluded by policy yet | `internal/watch/resource_filter.go:21` | +| Medium | Security posture is broad | read access is cluster-wide `*/*`; webhook failure policy is ignore | `config/rbac/role.yaml:27`, `config/webhook.yaml:16` | +| Medium | Naming/status drift still present | leftover GitRepoConfig reason strings and TODO readiness checks | `internal/controller/watchrule_controller.go:42`, `internal/controller/watchrule_controller.go:191`, `internal/controller/clusterwatchrule_controller.go:41`, `internal/controller/clusterwatchrule_controller.go:168` | +| Medium | Dual ingest complexity with single-pod limitation | correlation + watcher coupling and no HA | `README.md:43`, `README.md:180`, `docs/future/AUDIT_WEBHOOK_NATS_ARCHITECTURE_PROPOSAL.md:20` | +| Low | Audit path is not primary pipeline yet | currently mainly metrics/logging | `internal/webhook/audit_handler.go:214` | + +## 5. 
Keep vs Redo (If Rewriting From Scratch) + +| Component | Keep | Redo | +|---|---|---| +| Git commit/push core | Yes | Keep logic, isolate behind cleaner interface | +| CRD intent model | Mostly | Keep resource roles, tighten validation and immutable fields | +| Sanitization/encryption | Yes | Keep with plugin hooks and schema-aware policies | +| Worker partitioning by repo+branch | Yes | Keep, but add durable queue consumer semantics | +| Event identity/key model | No | Redo with canonical key including namespace/scope | +| Watch/informer manager | No | Redo as a separate ingestion adapter with explicit lifecycle | +| Reconcile orchestration | No | Redo to produce full payload events, not identifier-only for creates/updates | +| Correlation store bridge | No | Remove if moving to audit-first ingest | +| Audit handler | Partial | Promote to first-class ingest or remove experimental dead path | +| Ops/security defaults | No | Redo RBAC/webhook defaults and HA posture | + +## 6. Recommended Rewrite Architecture + +I recommend this target architecture: + +1. **Rule Compiler Service** +Compiles `WatchRule`/`ClusterWatchRule` into deterministic subscription plans per `GitTarget`. + +2. **Ingestion Adapter(s)** +Preferred: audit webhook ingestion from multiple source clusters. Optional: informer adapter for clusters without audit integration. + +3. **Durable Event Bus (Valkey)** +Valkey Streams as the durable queue with consumer groups and partitioning by `{repoURL, branch}` while carrying `clusterID` on every event. + +4. **HA Coordination (Valkey)** +Distributed lease/lock keys in Valkey to enforce single active writer per `{repoURL, branch}` while allowing many replicas. + +5. **Normalizer** +Single path for sanitize, metadata enrichment, canonical keying, dedupe policy. + +6. **Reconciliation Service** +Seed + orphan detection emits full resource events into the same bus. + +7. 
**Git Writer Service** +Consumes stream partitions, writes/commits/pushes with retry/idempotency. + +8. **Status Projector** +Updates `GitTarget`/`GitProvider` conditions from durable processing results. + +9. **Cluster Registry + AuthN/AuthZ** +Manages onboarded source clusters (`clusterID`, credentials/certs, policy), validates audit ingress identity, and applies per-cluster limits. + +This aligns with the HA direction in `README.md:56` (Valkey-based HA). + +## 7. Component Interaction and Event Pipeline + +### 7.1 Component Map (Target Architecture) + +1. **Kubernetes Sources** +Kubernetes API audit events from multiple source clusters (primary) and optional watch/informer events (fallback adapter). + +2. **Ingress/API Layer** +Stateless controller pods receive events on `/audit-webhook/`, authenticate source clusters, and convert events into canonical internal events. + +3. **Rule Compiler + Resolver** +Resolves `WatchRule`/`ClusterWatchRule` against `GitTarget`/`GitProvider` and produces deterministic routing metadata. + +4. **Normalizer** +Sanitizes payloads, enriches metadata (user, operation, `clusterID`, resource identity), computes idempotency key. + +5. **Valkey Streams Bus** +Durable event transport; events are appended to stream partitions keyed by `{repoURL, branch}`. + +6. **Valkey Lock/Lease Manager** +Ensures one active writer per partition key for ordered, conflict-minimized Git writes. + +7. **Git Writer Workers** +Consume stream entries, batch commits, push with retry/conflict handling, emit processing results. + +8. **Status Projector** +Projects pipeline outcomes back to `GitTarget`/`GitProvider` conditions and operational metrics. + +9. **Reconciliation Producer** +On startup/rule-change, lists desired resources and publishes seed/orphan events into the same stream path. + +10. **Cluster Registry + Policy Layer** +Stores cluster onboarding metadata and enforces per-cluster auth, routing, and backpressure policies. 
+ +### 7.2 Current Event Pipeline (As-Is) + +```text +Kubernetes API + ├─ Validating Webhook (stores correlation key -> user) + └─ Dynamic Informers (main event source) + -> Rule match + -> Sanitize + dedupe + -> Correlation lookup + -> EventRouter + -> GitTargetEventStream (startup buffer/live mode) + -> BranchWorker queue + -> Git commit/push +``` + +### 7.3 Target Event Pipeline (Valkey + HA) + +```text +Source Clusters (N) Kubernetes API + -> Audit Webhook /audit-webhook/ + -> Ingress Adapter (authn + cluster policy) + -> Rule compile/resolve + -> Normalize + idempotency key (includes clusterID) + -> Valkey Stream partition {repoURL, branch} + -> Consumer Group worker candidates + -> Valkey lease holder elected per partition + -> Active Git writer processes ordered events + -> Commit/push + ack + -> Status/metrics projection +``` + +### 7.4 Reconciliation Pipeline (Seed and Drift) + +```text +GitTarget/Rule change or controller startup + -> Reconciliation producer computes desired scope + -> Lists resources and computes orphans + -> Publishes CREATE/UPDATE/DELETE reconcile events to Valkey Stream + -> Same writer path as live events + -> Converged Git state + projected conditions +``` + +### 7.5 Multi-Cluster Ingestion Model + +```text +Cluster A/B/C + -> audit-webhook/ + -> validate cluster identity + policy + -> normalize (clusterID mandatory) + -> shared Valkey bus + -> partition by target repo+branch + -> single writer lease per partition +``` + +Notes: +1. `clusterID` must be first-class in event identity, metrics, dead-letter entries, and replay tooling. +2. Route policy should allow per-cluster filtering (allowed resources/namespaces) before enqueue. +3. Backpressure and quotas should be enforceable per cluster to prevent one noisy cluster from starving others. + +## 8. HA-First Requirements (Valkey-Based) + +1. **Replica-safe ingestion**: any pod can receive events and append to Valkey Stream. +2. 
**Single-writer per branch**: distributed lock key per `{repoURL, branch}` with short TTL + heartbeats. +3. **At-least-once delivery**: consumer groups with pending-entry recovery on crash/restart. +4. **Idempotent processing**: deterministic event key + processed marker in Valkey to handle redelivery safely. +5. **Backpressure handling**: stream length caps and dead-letter stream for poison events. +6. **Failover behavior**: lock expiry enables automatic writer takeover by another replica. +7. **Operational visibility**: metrics for lock ownership, pending backlog, retries, dead-letter counts. +8. **Multi-cluster identity**: all ingest paths require explicit `clusterID` and authenticated source mapping. +9. **Multi-cluster fairness**: enforce per-cluster quotas/rate limits and expose per-cluster lag metrics. + +## 9. Concrete Improvements by Component + +1. **Identity and dedupe**: replace `ResourceIdentifier.String()` as key source with canonical key that includes namespace. +2. **Cluster-aware identity**: include `clusterID` in canonical event identity, dedupe keys, and audit projections. +3. **Reconcile payloads**: emit full objects for create/update snapshot events. +4. **Informer lifecycle**: explicit per-namespace start/stop contexts; no placeholder cancel funcs. +5. **Delivery guarantees**: remove fire-and-forget drops; use Valkey Streams retry/dead-letter flows. +6. **Rule coverage**: support wildcard expansion via discovery cache. +7. **Security defaults**: reduce RBAC blast radius and move webhook failure policy to fail-open only when explicitly chosen. +8. **Testing**: prioritize watch/reconcile integration paths and raise coverage on low packages (`internal/watch`, `internal/ssh`, `cmd`). +9. **Multi-cluster e2e**: add tests for concurrent audit ingestion from multiple cluster IDs with fairness and isolation checks. + +## 10. Suggested Rewrite Plan (Valkey + HA) + +1. 
Build a new internal package boundary around `Event`, `RulePlan`, `BranchWriter` interfaces. +2. Port existing Git writer logic behind the new interfaces first. +3. Implement Valkey Streams producer/consumer with branch partition keys and consumer groups. +4. Implement Valkey lock/lease manager for single active writer per branch. +5. Add cluster onboarding model (`clusterID`, credentials, policy) and ingress auth for `/audit-webhook/`. +6. Implement reconciliation producer (seed/orphan) publishing into Valkey Streams. +7. Implement ingest adapter (audit first, informer optional) publishing multi-cluster events into Valkey Streams. +8. Cut over `GitTarget` controller to status projection from processing pipeline. +9. Add multi-cluster e2e/load tests (fairness, isolation, replay). +10. Remove correlation store and legacy route paths. diff --git a/docs/design/valkey-adoption-guide.md b/docs/design/valkey-adoption-guide.md new file mode 100644 index 00000000..afdc5a03 --- /dev/null +++ b/docs/design/valkey-adoption-guide.md @@ -0,0 +1,257 @@ +# Valkey Adoption Guide for GitOps Reverser + +This document explains how to adopt Valkey in GitOps Reverser, how to install it, whether it should be bundled into the +Helm chart, and why Valkey Streams are a good fit for the event pipeline. + +## 1. Recommendation Summary + +1. Use **Valkey Streams + Consumer Groups** as the durable event bus. +2. Use **Valkey lock keys** (`SET key value NX EX`) for single active writer per `{repoURL, branch}` partition. +3. Keep Valkey as an **optional external dependency by default**. +4. Add an **optional Helm subchart** for convenience (disabled by default), not as the only deployment mode. + +This aligns with the current project direction in `README.md` (planned HA via Valkey) and with the rewrite goals in +`docs/design/rewrite-from-scratch-recommendations.md`. + +## 2. 
Why Valkey for GitOps Reverser + +GitOps Reverser needs: + +- Durable buffering during Git or network outages +- At-least-once delivery semantics +- Ordered processing per destination branch +- Replay/recovery after pod restarts +- Horizontal scaling with explicit ownership + +Valkey can provide all of this with a small operational footprint compared to larger brokers. + +## 3. Valkey Concepts Most Useful for GitOps Reverser + +1. **Streams (`XADD`, `XREADGROUP`)** +- Durable append-only log for Kubernetes events. +- Natural fit for event-driven ingestion and replay. + +2. **Consumer Groups** +- Multiple workers can process one stream without duplicate active consumption. +- Supports scale-out and worker failover. + +3. **Pending Entries List (PEL)** +- Tracks delivered-but-not-acked events. +- Critical for crash recovery and at-least-once guarantees. + +4. **`XAUTOCLAIM` / `XCLAIM`** +- Reclaims stuck messages from failed workers. +- Required for robust HA operation. + +5. **Ephemeral lock/lease keys (`SET NX EX`)** +- Enforce single active Git writer per partition key (`repo+branch`). +- Prevents concurrent push conflicts. + +6. **Lua / Functions** +- Atomic multi-step updates (for example dedupe marker + enqueue + metric counter). + +7. **Hashes / Sets** +- Store idempotency markers and short-lived processing metadata. + +## 4. Why Streams Over Other Valkey Primitives + +| Option | Good For | Limitation for GitOps Reverser | +|---|---|---| +| Pub/Sub | Low-latency fanout | No persistence, no replay, events lost on disconnect | +| Lists (`LPUSH`/`BRPOP`) | Simple queues | No built-in pending/claim model, weaker recovery story | +| Sorted sets as queue | Scheduled tasks | More custom logic for ack/retry/claim | +| Streams | Durable event pipelines | Slightly more operational and implementation complexity | + +For this project, **Streams are the best default** because durability + replay + consumer recovery are all first-class. + +## 5. 
Pros and Cons of Adopting Valkey + +### Pros + +- Durable queueing for API events +- Better HA posture than in-memory per-pod queues +- Clear backpressure and retry model +- Operationally lighter than larger message-broker stacks +- Fits branch-partitioned processing model + +### Cons + +- New dependency to operate and secure +- Requires careful stream retention and memory management +- At-least-once requires idempotent handlers (duplicates are possible) +- HA deployment (replication/failover) adds platform complexity + +## 6. Should Valkey Be Part of the Helm Chart? + +### Short answer + +**Yes, as an optional subchart; no, as a mandatory bundled component.** + +### Why + +- Many users already have a cache/queue layer or prefer managed services. +- OSS adoption is smoother when users can start with existing infrastructure. +- A bundled option improves developer experience and quickstart path. + +### Recommended chart strategy + +1. `valkey.enabled=false` by default +2. If enabled, install a small Valkey instance suitable for dev/small environments +3. Expose `valkey.external.*` settings for production-grade external endpoints +4. Keep one config contract in GitOps Reverser (`VALKEY_ADDR`, auth/TLS, stream names) + +## 7. Installation Paths + +### 7.1 Local Development (Docker) + +```bash +docker run -d --name valkey \ + -p 6379:6379 \ + valkey/valkey:latest + +# quick sanity check +docker exec -it valkey valkey-cli PING +``` + +### 7.2 Kubernetes (Recommended: dedicated release) + +```bash +helm repo add bitnami https://charts.bitnami.com/bitnami +helm repo update + +# Example install (adjust values for your environment) +helm install valkey bitnami/valkey \ + -n valkey-system --create-namespace +``` + +After installation, point GitOps Reverser to the service DNS in that namespace. 
+
+### 7.3 Production HA Baseline
+
+Use an HA-capable Valkey topology (replication + failover), with:
+
+- Auth enabled
+- TLS enabled where possible
+- Persistent volumes enabled
+- Resource requests/limits set
+- Metrics enabled and scraped
+- Backup/restore tested
+
+## 8. Example Helm Values Strategy for GitOps Reverser
+
+Example design (for a future chart update):
+
+```yaml
+# values.yaml (proposed)
+valkey:
+  enabled: false
+  external:
+    address: "valkey-primary.valkey-system.svc.cluster.local:6379"
+    username: ""
+    passwordSecretRef:
+      name: "valkey-auth"
+      key: "password"
+    tls:
+      enabled: false
+      caSecretRef:
+        name: ""
+        key: "ca.crt"
+
+pipeline:
+  streamName: "gitopsreverser.events"
+  consumerGroup: "git-writers"
+  lockPrefix: "gitopsreverser:lock"
+  dedupePrefix: "gitopsreverser:dedupe"
+```
+
+## 9. Example Event Flow with Streams
+
+Producer:
+
+```bash
+valkey-cli XADD gitopsreverser.events '*' \
+  event_id "01HRXYZ..." \
+  repo "https://git.example/org/repo.git" \
+  branch "main" \
+  op "UPDATE" \
+  gvr "apps/v1/deployments" \
+  ns "default" \
+  name "web"
+```
+
+Consumer group bootstrap (run once):
+
+```bash
+valkey-cli XGROUP CREATE gitopsreverser.events git-writers '$' MKSTREAM
+```
+
+Worker read and ack:
+
+```bash
+valkey-cli XREADGROUP GROUP git-writers worker-1 COUNT 50 BLOCK 5000 STREAMS gitopsreverser.events '>'
+# process events
+valkey-cli XACK gitopsreverser.events git-writers <message-id>
+```
+
+Recover stuck events from dead workers:
+
+```bash
+valkey-cli XAUTOCLAIM gitopsreverser.events git-writers worker-2 60000 0-0 COUNT 100
+```
+
+Lease lock for a partition (`repo+branch`):
+
+```bash
+valkey-cli SET gitopsreverser:lock:<repo+branch> worker-1 NX EX 30
+# renew periodically while active
+valkey-cli EXPIRE gitopsreverser:lock:<repo+branch> 30
+```
+
+## 10. Mapping to GitOps Reverser Event Pipeline
+
+Target flow:
+
+1. Audit/Webhook ingress receives Kubernetes event
+2. Rule compiler resolves matching `GitTarget`
+3. 
Event normalizer generates deterministic idempotency key +4. Event appended to Valkey stream partition +5. Worker from consumer group acquires partition lease +6. Worker sanitizes/writes/commits/pushes to Git +7. Worker acks stream entry +8. Status projector updates CR conditions and metrics + +## 11. Risks and Mitigations + +1. **Duplicate delivery** (normal in at-least-once) +- Mitigation: deterministic idempotency keys + dedupe TTL store. + +2. **Unbounded stream growth** +- Mitigation: retention policy (`MAXLEN`/time-based trim), dead-letter stream, monitoring. + +3. **Lock split-brain** +- Mitigation: short TTL + frequent renew + fencing token/version checks. + +4. **Valkey outage** +- Mitigation: HA topology, pod disruption controls, alerts, tested failover runbooks. + +## 12. Phased Adoption Plan + +1. Add Valkey client abstraction and feature flag (`pipeline.mode=inmemory|valkey`). +2. Implement producer path to stream and consumer group worker. +3. Add lock manager per `{repoURL,branch}` and idempotency markers. +4. Add stream metrics (lag, PEL, ack rate, reclaim count). +5. Run shadow mode in e2e (produce to Valkey, keep current writer authoritative). +6. Cut over writes to Valkey consumers. +7. Add optional Helm subchart and external Valkey config values. +8. Document HA runbook and SLO/alerts. + +## 13. Final Decision Guidance + +If your primary near-term goal is **single-cluster simplicity**, keep current mode and make Valkey optional. +If your goal is **multi-pod reliability and replayable event processing**, adopt Valkey Streams now. 
+
+For this project, the practical middle ground is:
+
+- **Adopt Valkey Streams in the architecture**
+- **Keep chart integration optional by default**
+- **Support external managed/self-hosted Valkey as first-class**
diff --git a/docs/e2e-shared-infra-parallel-plan.md b/docs/e2e-shared-infra-parallel-plan.md
new file mode 100644
index 00000000..0cb7879e
--- /dev/null
+++ b/docs/e2e-shared-infra-parallel-plan.md
@@ -0,0 +1,173 @@
+# Shared-Infra Parallel E2E Plan
+
+## Goal
+
+Run the three E2E flows in parallel while sharing one Gitea instance and one Prometheus instance in a single Kind cluster, with stable port-forwarding (`13000`, `19090`) and no cross-test contamination.
+
+## Target Architecture
+
+- One shared infra stack per cluster:
+  - Gitea in `gitea-e2e`
+  - Prometheus in `prometheus-operator`
+  - Fixed port-forwards kept as-is (`localhost:13000`, `localhost:19090`)
+- Multiple parallel test flows, each with run-scoped isolation:
+  - `run-e2e-full-<run-id>`
+  - `run-e2e-quickstart-helm-<run-id>`
+  - `run-e2e-quickstart-manifest-<run-id>`
+- Isolation key carried across resources and metrics:
+  - `configbutler.ai/e2e-run-id=<run-id>`
+
+## Design Principles
+
+- Share only infra components that are safe to share (Gitea, Prometheus, cluster-level operators).
+- Isolate everything stateful per test run (namespaces, repo names, resource names, secrets, temp files, checkout dirs).
+- Avoid destructive global cleanup in per-run flows.
+- Make assertions run-aware via label and metric filtering.
+
+## Implementation Plan
+
+## 1. Introduce Run Context
+
+Define a run context contract used by Make targets, shell scripts, and Go tests:
+
+- `E2E_RUN_ID`
+- `E2E_TEST_NAMESPACE`
+- `E2E_CONTROLLER_NAMESPACE` (if needed for scenario separation)
+- `E2E_GITEA_API_URL` (default `http://localhost:13000/api/v1`)
+
+Actions:
+
+- Add helper code in `test/e2e` to read/generate run context.
+- Replace hardcoded namespace usage (`sut`) in Go helpers and tests with run-context values.
+- Ensure all generated resource names include `E2E_RUN_ID` or are namespace-scoped.
+
+## 2. Refactor Gitea Setup for Concurrency
+
+Current risks include global git config and fixed temporary SSH key paths.
+
+Actions in `test/e2e/scripts/setup-gitea.sh`:
+
+- Accept parameters/env for:
+  - target namespace
+  - secret name prefix
+  - run id
+  - API URL
+- Replace global git URL rewrite with safer alternatives:
+  - repo-local config, or
+  - per-command credentials
+- Make SSH key files run-scoped:
+  - `/tmp/e2e-ssh-key-<run-id>`
+  - `/tmp/e2e-ssh-key-<run-id>.pub`
+- Keep shared Gitea org if desired, but enforce unique repo per run.
+- Create per-run secret names instead of shared fixed secrets when needed.
+
+## 3. Make Quickstart Flow Namespace-Aware and Non-Destructive
+
+Actions in `test/e2e/scripts/run-quickstart.sh`:
+
+- Replace fixed namespaces with run-context namespace.
+- Keep all object names run-scoped.
+- Remove or guard destructive reset operations that delete shared cluster-wide resources.
+- Preserve shared infra and fixed port-forwarding lifecycle.
+
+Outcome:
+
+- Quickstart Helm/Manifest flows can run in parallel against shared infra without deleting each other’s prerequisites.
+
+## 4. Add Metric Isolation with Labels
+
+Use per-run labels for both resource selection and Prometheus assertions.
+
+Actions:
+
+- Ensure controller emits labels that can identify run context (`run_id`, `test_namespace`, or equivalent).
+- Update e2e PromQL queries to filter by run label.
+- If needed, create per-run `ServiceMonitor` objects with label selectors targeting run-tagged resources.
+- Replace assumptions bound to fixed values (for example fixed `cluster_id='kind-e2e'` assertions) with run-aware checks.
+
+## 5. Move Quickstart Assertions into Go E2E Tests
+
+Rationale: reduce shell duplication, improve reuse and diagnostics consistency.
+
+Actions:
+
+- Port quickstart functional checks from shell to Go specs in `test/e2e`.
+- Keep shell scripts for infrastructure/bootstrap only. +- Use table-driven tests for Helm and manifest install variants. +- Reuse existing helper patterns for CRUD, readiness, encryption, and failure-message assertions. + +## 6. Add Parallel Orchestration Target + +Introduce a new Make target to orchestrate all flows in one shared cluster. + +Example target: + +- `make test-e2e-all-parallel` + +Behavior: + +- Ensure cluster + shared infra are started once. +- Launch `full`, `quickstart-helm`, `quickstart-manifest` with distinct run contexts in parallel. +- Aggregate exit codes and print per-run diagnostics. +- Keep existing targets (`test-e2e`, `test-e2e-quickstart-*`) as fallback and for incremental debugging. + +## 7. Cleanup and Diagnostics Model + +Actions: + +- Cleanup by `run_id` only (namespace and run-scoped resources). +- Never kill shared port-forwards during per-run cleanup. +- Persist run-scoped artifacts/logs for failed runs. +- Add helper commands to inspect one run without affecting others. + +## Validation Strategy + +Follow existing project validation sequence and mandatory gates. + +Primary sequence: + +1. `make fmt` +2. `make generate` +3. `make manifests` (if API changes) +4. `make vet` +5. `make lint` +6. `make test` +7. `make test-e2e` + +Mandatory completion gates: + +- `make lint` +- `make test` +- `make test-e2e` +- `make test-e2e-quickstart-manifest` +- `make test-e2e-quickstart-helm` + +Additional prerequisite: + +- Verify Docker daemon availability before E2E execution (`docker info`). + +## Definition of Done + +- Three E2E flows can run concurrently in one cluster against one Gitea and one Prometheus instance. +- No flaky cross-run interactions from shared names, secrets, temp files, or cleanup. +- Prometheus assertions are run-aware via labels/filters. +- Fixed port-forwarding remains stable and unchanged. +- Existing mandatory lint/test/e2e targets pass. + +## Suggested Rollout Order + +1. 
Run context and namespace/resource parameterization. +2. Gitea script concurrency hardening. +3. Quickstart script de-destructive changes. +4. Metric labeling and query filtering. +5. Quickstart assertion migration to Go tests. +6. Parallel orchestration target and hardening. + +## Risks and Mitigations + +- Risk: hidden global assumptions in scripts/tests. + - Mitigation: search and remove fixed names/paths early (`sut`, fixed secret names, `/tmp/e2e-ssh-key`, global git config). +- Risk: metric cardinality or missing run labels. + - Mitigation: define required metric labels up front and enforce in tests. +- Risk: parallelism increases flakiness from timing. + - Mitigation: strengthen eventual assertions and per-run diagnostics. diff --git a/docs/make-e2e-deps.md b/docs/make-e2e-deps.md new file mode 100644 index 00000000..e2e905f7 --- /dev/null +++ b/docs/make-e2e-deps.md @@ -0,0 +1,387 @@ +# E2E test dependencies with Make stamps + +## What the old setup did (replaced) + +`make test-e2e` used to run these prerequisites unconditionally on every invocation: + +``` +setup-cluster → cleanup-webhook → setup-e2e → check-cert-manager → e2e-deploy → setup-port-forwards +``` + +Every single step was `.PHONY`. Make couldn't skip any of them. On a warm devcontainer with a running cluster that hadn't changed at all, you still paid for image rebuild, Helm reinstall, kustomize apply, and full rollout waits. + +--- + +## The fix: stamp files per cluster + +Each cluster-side step is represented as a local file. Make's timestamp logic handles the "did this need to rerun?" question automatically. 
+
+The stamp namespace:
+
+```
+.stamps/
+  image/
+    controller.id          ← image ID written after docker build
+  cluster/
+    kind-<cluster>/
+      ready                ← cluster created and kubectl works
+      cert-manager.installed
+      gitea.installed
+      prometheus.installed
+      image.loaded         ← image loaded into this specific cluster
+      crds.applied
+      controller.deployed
+      portforward.running
+      e2e.passed           ← test result stamp
+```
+
+Each stamp is written **only after a real verification step** (rollout status, kubectl wait, etc.). If any prerequisite file is newer than the stamp, Make reruns that step — and only that step.
+
+When `make cleanup-cluster` runs, it also `rm -rf .stamps/cluster/kind-$(KIND_CLUSTER)` so the next run starts fresh.
+
+---
+
+## CTX-parameterized stamp targets
+
+All stamp targets are generic — they use a single `CTX` variable instead of hardcoded cluster names. This eliminates duplication: one set of rules serves all clusters.
+
+### Variables
+
+```make
+KIND_CLUSTER_E2E ?= gitops-reverser-test-e2e
+KIND_CLUSTER_QUICKSTART_HELM ?= gitops-reverser-test-e2e-quickstart-helm
+KIND_CLUSTER_QUICKSTART_MANIFEST ?= gitops-reverser-test-e2e-quickstart-manifest
+
+# CTX: kubeconfig context for the cluster being operated on.
+# Defaults to the main e2e cluster; override with CTX=kind-<cluster> to reuse stamp targets for other clusters.
+CTX ?= kind-$(KIND_CLUSTER_E2E)
+# Derive the Kind cluster name by stripping the "kind-" prefix from CTX.
+CLUSTER_FROM_CTX = $(patsubst kind-%,%,$(CTX))
+# Cluster stamp directory. (Comment kept on its own line: with GNU Make ':=',
+# whitespace before an inline '#' is retained in the value and would break paths.)
+CS := .stamps/cluster/$(CTX)
+# Image stamp directory.
+IS := .stamps/image
+
+GO_SOURCES := $(shell find cmd internal -type f -name '*.go') go.mod go.sum
+
+KUBECONTEXT_QS_HELM := kind-$(KIND_CLUSTER_QUICKSTART_HELM)
+KUBECONTEXT_QS_MANIFEST := kind-$(KIND_CLUSTER_QUICKSTART_MANIFEST)
+```
+
+> **Note**: `GO_SOURCES` covers `cmd/` and `internal/` — there is no `pkg/` directory in this project.
+ +To target a different cluster with any stamp target, pass `CTX` on the command line: + +```sh +make CTX=kind-my-other-cluster .stamps/cluster/kind-my-other-cluster/cert-manager.installed +``` + +--- + +## Dependency graph + +``` +make test-e2e (phony) + └─ $(CS)/e2e.passed + ├─ test/e2e/**/*.go (any test change reruns tests) + └─ $(CS)/portforward.running + ├─ $(CS)/controller.deployed + │ ├─ $(CS)/crds.applied + │ │ └─ $(CS)/ready + │ │ └─ test/e2e/kind/start-cluster.sh + │ │ test/e2e/kind/cluster-template.yaml + │ ├─ $(CS)/cert-manager.installed + │ │ └─ $(CS)/ready + │ ├─ $(CS)/prometheus.installed + │ │ └─ $(CS)/ready + │ ├─ $(IS)/controller.id + │ │ └─ cmd/**/*.go internal/**/*.go go.mod go.sum Dockerfile + │ └─ config/** (excluding config/crd/*) + ├─ $(CS)/gitea.installed + │ └─ $(CS)/ready + └─ $(CS)/prometheus.installed +``` + +What this buys you in practice: + +| Change you make | Steps that actually rerun | +|---|---| +| Edit a Go source file | image rebuild → load into kind → controller redeploy → tests | +| Edit a test file only | tests only | +| Edit a CRD manifest | CRD apply → controller redeploy → tests | +| Edit RBAC/manager config | controller redeploy → tests | +| Edit `gitea-values.yaml` | gitea reinstall → port-forwards restart → tests | +| Edit `ensure-prometheus-operator.sh` | prometheus reinstall → port-forwards restart → tests | +| Change `CERT_MANAGER_VERSION` | cert-manager reinstall → controller redeploy → tests | +| Change nothing | everything skipped — only the test binary is checked | + +--- + +## Stamp targets, step by step + +### 1 — Cluster ready + +```make +$(CS)/ready: test/e2e/kind/start-cluster.sh test/e2e/kind/cluster-template.yaml + mkdir -p $(CS) + KIND_CLUSTER=$(CLUSTER_FROM_CTX) bash test/e2e/kind/start-cluster.sh + kubectl --context $(CTX) get ns >/dev/null + touch $@ +``` + +### 2 — cert-manager + +```make +$(CS)/cert-manager.installed: $(CS)/ready + mkdir -p $(CS) + kubectl --context $(CTX) apply -f 
$(CERT_MANAGER_MANIFEST_URL) | grep -v unchanged + kubectl --context $(CTX) -n cert-manager rollout status deploy/cert-manager --timeout=$(CERT_MANAGER_WAIT_TIMEOUT) + kubectl --context $(CTX) -n cert-manager rollout status deploy/cert-manager-webhook --timeout=$(CERT_MANAGER_WAIT_TIMEOUT) + kubectl --context $(CTX) -n cert-manager rollout status deploy/cert-manager-cainjector --timeout=$(CERT_MANAGER_WAIT_TIMEOUT) + touch $@ +``` + +### 3 — Gitea + +```make +$(CS)/gitea.installed: $(CS)/ready test/e2e/gitea-values.yaml + mkdir -p $(CS) + $(HELM) repo add gitea-charts https://dl.gitea.com/charts/ 2>/dev/null || true + $(HELM) repo update gitea-charts + kubectl --context $(CTX) create namespace $(GITEA_NAMESPACE) --dry-run=client -o yaml \ + | kubectl --context $(CTX) apply -f - + $(HELM) --kube-context $(CTX) upgrade --install gitea gitea-charts/gitea \ + --namespace $(GITEA_NAMESPACE) \ + --version $(GITEA_CHART_VERSION) \ + --values test/e2e/gitea-values.yaml + kubectl --context $(CTX) -n $(GITEA_NAMESPACE) rollout status deploy/gitea --timeout=300s + touch $@ +``` + +### 4 — Prometheus Operator + +```make +$(CS)/prometheus.installed: $(CS)/ready test/e2e/scripts/ensure-prometheus-operator.sh + mkdir -p $(CS) + KUBECONTEXT=$(CTX) bash test/e2e/scripts/ensure-prometheus-operator.sh + touch $@ +``` + +### 5 — Image build + +For local images (no registry push), `RepoDigests` is empty. Use the image ID instead — it changes whenever the build output changes. + +```make +$(IS)/controller.id: $(GO_SOURCES) Dockerfile + mkdir -p $(IS) + $(CONTAINER_TOOL) build -t $(E2E_LOCAL_IMAGE) . + $(CONTAINER_TOOL) inspect --format='{{.Id}}' $(E2E_LOCAL_IMAGE) > $@ +``` + +### 6 — Image loaded into cluster + +A separate stamp for loading the image into a specific cluster. This is used by both the main e2e flow (via `controller.deployed`) and the quickstart targets. 
+ +```make +$(CS)/image.loaded: $(IS)/controller.id $(CS)/ready + $(KIND) load docker-image $(E2E_LOCAL_IMAGE) --name $(CLUSTER_FROM_CTX) + touch $@ +``` + +### 7 — CRDs applied + +```make +$(CS)/crds.applied: $(CS)/ready $(shell find config/crd -type f) + mkdir -p $(CS) + kubectl --context $(CTX) apply -k config/crd + kubectl --context $(CTX) wait --for=condition=Established crd --all --timeout=120s + touch $@ +``` + +### 8 — Controller deployed + +`DEPLOY_INPUTS` includes `prometheus.installed` because the controller's ServiceMonitor depends on the Prometheus Operator CRDs being present. + +```make +DEPLOY_INPUTS := $(CS)/crds.applied $(CS)/cert-manager.installed $(CS)/prometheus.installed \ + $(IS)/controller.id \ + $(shell find config -type f -not -path 'config/crd/*') + +$(CS)/controller.deployed: $(DEPLOY_INPUTS) + mkdir -p $(CS) + $(KIND) load docker-image $(E2E_LOCAL_IMAGE) --name $(CLUSTER_FROM_CTX) + kubectl --context $(CTX) delete validatingwebhookconfiguration \ + gitops-reverser-validating-webhook-configuration --ignore-not-found=true + cd config && $(KUSTOMIZE) edit set image gitops-reverser=$(E2E_LOCAL_IMAGE) + $(KUSTOMIZE) build config | kubectl --context $(CTX) apply -f - + kubectl --context $(CTX) -n sut rollout status deploy/gitops-reverser --timeout=180s + touch $@ +``` + +### 9 — Port-forwards + +`setup-port-forwards.sh` uses bare `kubectl` (no `--context` flag), so the current kubeconfig context must be set first with `kubectl config use-context $(CTX)`. + +This target stays a **file target, not `.PHONY`**. The file target skips the recipe when the stamp is newer than all prerequisites. The health-check-first pattern inside the recipe handles the case where forwards are already alive from a previous run but the stamp needed refreshing. 
+ +```make +$(CS)/portforward.running: $(CS)/controller.deployed $(CS)/gitea.installed $(CS)/prometheus.installed + mkdir -p $(CS) + if curl -fsS http://localhost:13000/api/healthz >/dev/null 2>&1 && \ + curl -fsS http://localhost:19090/-/healthy >/dev/null 2>&1; then \ + echo "port-forwards already healthy, skipping restart"; \ + touch $@; exit 0; \ + fi + kubectl config use-context $(CTX) + bash test/e2e/scripts/setup-port-forwards.sh + curl -fsS http://localhost:13000/api/healthz >/dev/null || { echo "Gitea health check failed"; exit 1; } + curl -fsS http://localhost:19090/-/healthy >/dev/null || { echo "Prometheus health check failed"; exit 1; } + touch $@ +``` + +The three execution paths this covers: + +| Situation | What happens | +|---|---| +| Stamp newer than all prerequisites | Make skips the recipe entirely — zero curl calls | +| Stamp stale, but forwards still alive (e.g. controller redeployed, forwards survived) | Fast path: two curls pass → touch stamp → done | +| Stamp stale and forwards are dead | Slow path: restart script → two curls verify → touch stamp | + +### 10 — E2E tests with result stamp + +```make +$(CS)/e2e.passed: $(CS)/portforward.running $(shell find test/e2e -name '*.go') + mkdir -p $(CS) + KIND_CLUSTER=$(CLUSTER_FROM_CTX) PROJECT_IMAGE=$(E2E_LOCAL_IMAGE) \ + go test ./test/e2e/ -v -ginkgo.v + touch $@ + +.PHONY: test-e2e +test-e2e: $(CS)/e2e.passed +``` + +### 11 — Cluster teardown invalidates all stamps + +```make +.PHONY: cleanup-cluster +cleanup-cluster: + @if $(KIND) get clusters 2>/dev/null | grep -q "^$(KIND_CLUSTER)$$"; then \ + $(KIND) delete cluster --name $(KIND_CLUSTER); \ + fi + rm -rf .stamps/cluster/kind-$(KIND_CLUSTER) +``` + +`KIND_CLUSTER` defaults to `$(KIND_CLUSTER_E2E)` but can be overridden: `make cleanup-cluster KIND_CLUSTER=my-other-cluster`. 
+ +--- + +## Quickstart tests + +Quickstart tests (`test-e2e-quickstart-helm`, `test-e2e-quickstart-manifest`) validate the user-facing install paths (Helm chart and generated `dist/install.yaml`). They use dedicated clusters to keep them isolated from the main e2e suite. + +### Design: always start with a clean cluster + +Quickstart tests **always delete and recreate their cluster** at the start of every run. This avoids a class of failures where Helm tries to install on top of resources that were previously applied with `kubectl apply` (server-side apply), which causes `metadata.managedFields must be nil` errors. Since `run-quickstart.sh` resets install state but doesn't delete CRDs or ClusterRoles, a clean cluster is the only reliable guarantee. + +### CTX override pattern + +Quickstart targets reuse the generic stamp targets via recursive Make with `CTX` overridden: + +```sh +$(MAKE) CTX=kind-gitops-reverser-test-e2e-quickstart-helm \ + .stamps/cluster/kind-gitops-reverser-test-e2e-quickstart-helm/cert-manager.installed +``` + +The inner Make re-evaluates `CS` and `CLUSTER_FROM_CTX` with the overridden `CTX`, so the stamp file rules match correctly. 
+ +### Helm quickstart target + +```make +.PHONY: test-e2e-quickstart-helm +test-e2e-quickstart-helm: ## Run quickstart smoke test (Helm install) - always starts with a clean cluster + $(MAKE) cleanup-cluster KIND_CLUSTER=$(KIND_CLUSTER_QUICKSTART_HELM) + $(MAKE) CTX=$(KUBECONTEXT_QS_HELM) \ + .stamps/cluster/$(KUBECONTEXT_QS_HELM)/cert-manager.installed \ + .stamps/cluster/$(KUBECONTEXT_QS_HELM)/gitea.installed + @if [ -z "$(PROJECT_IMAGE)" ]; then \ + $(MAKE) CTX=$(KUBECONTEXT_QS_HELM) \ + .stamps/cluster/$(KUBECONTEXT_QS_HELM)/image.loaded; \ + fi + kubectl config use-context $(KUBECONTEXT_QS_HELM) + PROJECT_IMAGE=$(if $(PROJECT_IMAGE),$(PROJECT_IMAGE),$(E2E_LOCAL_IMAGE)) \ + bash test/e2e/scripts/run-quickstart.sh helm +``` + +### Manifest quickstart target + +```make +.PHONY: test-e2e-quickstart-manifest +test-e2e-quickstart-manifest: ## Run quickstart smoke test (manifest install) - always starts with a clean cluster + $(MAKE) cleanup-cluster KIND_CLUSTER=$(KIND_CLUSTER_QUICKSTART_MANIFEST) + $(MAKE) CTX=$(KUBECONTEXT_QS_MANIFEST) \ + .stamps/cluster/$(KUBECONTEXT_QS_MANIFEST)/cert-manager.installed \ + .stamps/cluster/$(KUBECONTEXT_QS_MANIFEST)/gitea.installed + @if [ -z "$(PROJECT_IMAGE)" ]; then \ + $(MAKE) build-installer; \ + $(MAKE) CTX=$(KUBECONTEXT_QS_MANIFEST) \ + .stamps/cluster/$(KUBECONTEXT_QS_MANIFEST)/image.loaded; \ + fi + kubectl config use-context $(KUBECONTEXT_QS_MANIFEST) + PROJECT_IMAGE=$(if $(PROJECT_IMAGE),$(PROJECT_IMAGE),$(E2E_LOCAL_IMAGE)) \ + bash test/e2e/scripts/run-quickstart.sh manifest +``` + +The manifest target also calls `make build-installer` (regenerates `dist/install.yaml`) before loading the image if `PROJECT_IMAGE` is not set. + +### `run-quickstart.sh` is self-contained + +`run-quickstart.sh` manages its own Gitea port-forward via an internal `ensure_gitea_api_port_forward()` function. It does **not** call `setup-port-forwards.sh` and does not need Prometheus. 
The Makefile only needs to ensure cert-manager, Gitea, and the image are loaded before invoking the script. + +--- + +## Targeted test runs with Ginkgo focus flags + +The entire e2e suite lives in one `Describe("Manager", ...)` block. You can run subsets without touching the test file: + +```make +.PHONY: test-e2e-gitprovider +test-e2e-gitprovider: $(CS)/portforward.running + KIND_CLUSTER=$(KIND_CLUSTER_E2E) PROJECT_IMAGE=$(E2E_LOCAL_IMAGE) \ + go test ./test/e2e/ -v -ginkgo.v -ginkgo.focus="GitProvider" + +.PHONY: test-e2e-watchrule +test-e2e-watchrule: $(CS)/portforward.running + KIND_CLUSTER=$(KIND_CLUSTER_E2E) PROJECT_IMAGE=$(E2E_LOCAL_IMAGE) \ + go test ./test/e2e/ -v -ginkgo.v -ginkgo.focus="WatchRule" + +.PHONY: test-e2e-encryption +test-e2e-encryption: $(CS)/portforward.running + KIND_CLUSTER=$(KIND_CLUSTER_E2E) PROJECT_IMAGE=$(E2E_LOCAL_IMAGE) \ + go test ./test/e2e/ -v -ginkgo.v -ginkgo.focus="encrypt|SOPS|age" +``` + +These are `.PHONY` (always rerun the test binary) but still benefit from stamp-based infra: they won't rebuild the image or reinstall Gitea unless something actually changed. + +--- + +## What stamps cannot protect you from + +Stamps assume the world doesn't change behind Make's back: + +- **Manual `kubectl delete` of a CRD or deployment**: the stamp still exists. Fix: `make cleanup-cluster` to reset stamps. +- **Kind cluster restart / node failure**: `$(CS)/ready` still exists. The recipe guards against this with a live `kubectl get ns` call. +- **Port-forward dies between test runs**: the `.portforward.running` stamp is stale. The curl health-check at the start of the recipe catches this. +- **cert-manager version bump without touching `CERT_MANAGER_VERSION`**: the stamp won't invalidate. Encoding the version in the stamp path (`.../cert-manager-$(CERT_MANAGER_VERSION).installed`) would prevent this, but is not currently implemented. 
+ +--- + +## Summary + +| What | Current (after stamps) | +|---|---| +| `make test-e2e` with nothing changed | Runs tests only | +| Go code change | Rebuilds image → redeploys → tests | +| Test file change only | Tests only | +| CRD change | Reapplies CRDs → redeploys → tests | +| Gitea config change | Reinstalls Gitea → tests | +| `make test-e2e-gitprovider` | Runs GitProvider tests only (infra skipped if unchanged) | +| `make test-e2e-quickstart-helm` | Deletes cluster → fresh cluster → cert-manager + gitea + image → quickstart helm test | +| `make test-e2e-quickstart-manifest` | Deletes cluster → fresh cluster → cert-manager + gitea + image → quickstart manifest test | diff --git a/docs/make-e2e-further.md b/docs/make-e2e-further.md new file mode 100644 index 00000000..c5b3317e --- /dev/null +++ b/docs/make-e2e-further.md @@ -0,0 +1,163 @@ +# E2E: move suite setup into Make stamps (clean plan) + +## Goal + +Make `go test ./test/e2e` run **only test logic**, not cluster provisioning or stable suite setup. + +Principle: +- If a step is deterministic and its completion can be captured by a file mtime, it belongs in a Make stamp. +- If it must be unique per run (e.g., unique repo name), it stays in `BeforeAll`. + +## Target outcome + +After this change, `BeforeAll` does only: +- Gitea per-run repo setup (unique repo + checkout dir) +- Prometheus client init + availability check (pure Go + port-forward) + +Everything else becomes stamp-based and runs *outside* the test binary. + +## Changes (implementation checklist) + +## Proposed dependency graph (parallel-friendly) + +This graph is the target Make stamp topology to enable `make -j` parallelism while keeping dependencies correct. 
+ +```mermaid +flowchart TD + GO[Go sources + Dockerfile] --> IMG[IS/controller.id] + + READY[CS/ready] --> CRDS[CS/crds.applied] + READY --> CM[CS/cert-manager.installed] + READY --> PROM[CS/prometheus.installed] + READY --> GITEA[CS/gitea.installed] + READY --> SUTNS[CS/sut.namespace.ready] + + SUTNS --> AGE[CS/age-key.applied] + + IMG --> CTRL[CS/controller.deployed] + CRDS --> CTRL + CM --> CTRL + PROM --> CTRL + SUTNS --> CTRL + + CTRL --> PF[CS/portforward.running] + GITEA --> PF + PROM --> PF + + PF --> E2E[CS/e2e.passed] + AGE --> E2E + TESTSRC[test/e2e/**/*.go] --> E2E +``` + +Parallelism notes: +- `IS/controller.id` can run in parallel with `CS/ready`. +- After `CS/ready`, `CS/crds.applied`, `CS/cert-manager.installed`, `CS/prometheus.installed`, `CS/gitea.installed`, + and `CS/sut.namespace.ready` can run in parallel. +- `CS/age-key.applied` only needs the `sut` namespace; it should not wait for controller rollout. + +### 1) Replace suite Age key generation with a stamp + Go tool + +Why: +- The suite key does not need to change per run. +- The Secret YAML should be materialized as a real file (auditable, diffable). +- The test should not generate keys or apply Secrets. + +Actions: +1. Add a small Go tool: + - Path: `test/e2e/tools/gen-age-key/main.go` + - Output files: + - `--key-file` writes an `age` identity in `age-keygen`-compatible format. + - `--secret-file` writes a Kubernetes Secret manifest containing `stringData.identity.agekey`. +2. Add a cluster-scoped stamp: + +```make +# Where tests read the private key from (overridable). 
+E2E_AGE_KEY_FILE ?= /tmp/e2e-age-key.txt + +$(CS)/sut.namespace.ready: $(CS)/ready config/namespace.yaml + mkdir -p $(CS) + kubectl --context $(CTX) apply -f config/namespace.yaml + kubectl --context $(CTX) label --overwrite ns sut pod-security.kubernetes.io/enforce=restricted + touch $@ + +$(CS)/age-key.applied: $(CS)/sut.namespace.ready test/e2e/tools/gen-age-key/main.go + mkdir -p $(CS) + go run ./test/e2e/tools/gen-age-key \ + --key-file $(CS)/age-key.txt \ + --secret-file $(CS)/age-key-secret.yaml \ + --namespace sut \ + --secret-name sops-age-key + kubectl --context $(CTX) apply -f $(CS)/age-key-secret.yaml + touch $@ +``` + +3. Stop copying keys around by making the test configurable: + - In `test/e2e/e2e_test.go`, replace the hardcoded key path with: + - `E2E_AGE_KEY_FILE` env var (default `/tmp/e2e-age-key.txt`). + - In `Makefile` set `E2E_AGE_KEY_FILE=$(CS)/age-key.txt` for the test run. + +### 2) Move namespace creation/labeling out of `BeforeAll` + +Namespace creation already happens via Kustomize (`config/namespace.yaml`). + +Make it explicit and parallel-friendly by adding a namespace stamp that runs immediately after cluster readiness: +- `$(CS)/sut.namespace.ready` (create + label) + +Then: +- remove namespace creation + label commands from `BeforeAll` +- ensure `controller.deployed` depends on `$(CS)/sut.namespace.ready` (so the label is applied before pods are created) + +### 3) Remove certificate Secret polling from `BeforeAll` + +Delete `waitForCertificateSecrets()` usage and the helper entirely. + +Rationale: +- The controller `Deployment` mounts `admission-server-cert` and `audit-server-cert` as Secret volumes. +- Pods cannot become Ready (and `kubectl rollout status` cannot succeed) until those Secrets exist. + +### 4) Run preventive CRD cleanup only when the suite actually runs + +Do not put CRD cleanup in `controller.deployed` (it won’t re-run when the stamp is fresh). 
+ +Instead, run it in the `$(CS)/e2e.passed` recipe immediately before `go test`: +- `kubectl --context $(CTX) delete crd icecreamorders.shop.example.com --ignore-not-found=true` + +This guarantees: +- Cleanup happens exactly when the test suite runs. +- It does not rely on stamp invalidation. + +### 5) Wire the new stamps into the existing chain + +Keep `portforward.running` focused on port-forward readiness. + +Preferred wiring: +- Add `$(CS)/age-key.applied` (and `$(CS)/namespace.configured` if used) as dependencies of `$(CS)/e2e.passed`. + +Example: + +```make +$(CS)/e2e.passed: $(CS)/portforward.running $(CS)/age-key.applied $(shell find test/e2e -name '*.go') + mkdir -p $(CS) + kubectl --context $(CTX) delete crd icecreamorders.shop.example.com --ignore-not-found=true + KIND_CLUSTER=$(CLUSTER_FROM_CTX) PROJECT_IMAGE=$(E2E_IMAGE) \ + E2E_AGE_KEY_FILE=$(CS)/age-key.txt \ + go test ./test/e2e/ -v -ginkgo.v + touch $@ +``` + +## Code edits summary + +- `Makefile`: add `age-key.applied` (and optionally `namespace.configured`) stamps; wire into `e2e.passed`; export `E2E_AGE_KEY_FILE`. +- `test/e2e/tools/gen-age-key/main.go`: new tool to write key file + Secret YAML. +- `test/e2e/e2e_test.go`: remove suite-side namespace/label, age secret setup, and cert-secret wait; read key path from env. +- `test/e2e/helpers.go`: delete `setupSOPSAgeSecret` and `waitForCertificateSecrets` (and related imports). 
+ +## Validation + +Run: +- `make lint` +- `make test` +- `docker info` (must work before e2e) +- `make test-e2e` +- `make test-e2e-quickstart-manifest` +- `make test-e2e-quickstart-helm` diff --git a/internal/controller/gittarget_controller.go b/internal/controller/gittarget_controller.go index 8dba237b..9e9721f2 100644 --- a/internal/controller/gittarget_controller.go +++ b/internal/controller/gittarget_controller.go @@ -911,6 +911,7 @@ func (r *GitTargetReconciler) handleFetchError( namespacedName k8stypes.NamespacedName, ) (ctrl.Result, error) { if client.IgnoreNotFound(err) == nil { + r.cleanupDeletedGitTarget(namespacedName, log) log.Info("GitTarget not found, was likely deleted", "namespacedName", namespacedName) return ctrl.Result{}, nil } @@ -918,6 +919,24 @@ func (r *GitTargetReconciler) handleFetchError( return ctrl.Result{}, err } +func (r *GitTargetReconciler) cleanupDeletedGitTarget( + namespacedName k8stypes.NamespacedName, + log logr.Logger, +) { + if r.EventRouter == nil { + return + } + + gitDest := types.NewResourceReference(namespacedName.Name, namespacedName.Namespace) + + r.EventRouter.UnregisterGitTargetEventStream(gitDest) + if r.EventRouter.ReconcilerManager != nil { + _ = r.EventRouter.ReconcilerManager.DeleteReconciler(gitDest) + } + + log.V(1).Info("Cleaned up in-memory state for deleted GitTarget", "gitDest", gitDest.String()) +} + func clampIntToInt32(value int) int32 { if value < 0 { return 0 diff --git a/internal/git/branch_worker.go b/internal/git/branch_worker.go index 474f5f33..71fe8e7a 100644 --- a/internal/git/branch_worker.go +++ b/internal/git/branch_worker.go @@ -20,6 +20,8 @@ package git import ( "context" + "crypto/sha256" + "encoding/hex" "errors" "fmt" "os" @@ -109,6 +111,26 @@ func NewBranchWorker( } } +func repoCacheKey(remoteURL string) string { + sum := sha256.Sum256([]byte(strings.TrimSpace(remoteURL))) + return hex.EncodeToString(sum[:16]) +} + +func (w *BranchWorker) repoRootPath() string { + return filepath.Join( 
+ "/tmp", + "gitops-reverser-workers", + w.GitProviderNamespace, + w.GitProviderRef, + w.Branch, + "repos", + ) +} + +func (w *BranchWorker) repoPathForRemote(remoteURL string) string { + return filepath.Join(w.repoRootPath(), repoCacheKey(remoteURL)) +} + // Start begins processing events. func (w *BranchWorker) Start(parentCtx context.Context) error { w.mu.Lock() @@ -171,9 +193,11 @@ func (w *BranchWorker) ListResourcesInPath(path string) ([]itypes.ResourceIdenti return nil, fmt.Errorf("failed to initialize repository: %w", err) } - // Use the worker's managed repository path - repoPath := filepath.Join("/tmp", "gitops-reverser-workers", - w.GitProviderNamespace, w.GitProviderRef, w.Branch) + provider, err := w.getGitProvider(w.ctx) + if err != nil { + return nil, fmt.Errorf("failed to get GitProvider: %w", err) + } + repoPath := w.repoPathForRemote(provider.Spec.URL) return w.listResourceIdentifiersInPath(repoPath, path) } @@ -288,8 +312,7 @@ func (w *BranchWorker) prepareBootstrapRepository( return "", nil, fmt.Errorf("failed to get auth: %w", err) } - repoPath := filepath.Join("/tmp", "gitops-reverser-workers", - w.GitProviderNamespace, w.GitProviderRef, w.Branch) + repoPath := w.repoPathForRemote(provider.Spec.URL) pullReport, err := PrepareBranch(ctx, provider.Spec.URL, repoPath, w.Branch, auth) if err != nil { return "", nil, fmt.Errorf("failed to prepare repository: %w", err) @@ -375,7 +398,6 @@ func (w *BranchWorker) listResourceIdentifiersInPath( // processEvents is the main event processing loop. 
func (w *BranchWorker) processEvents() { - // Get GitProvider provider, err := w.getGitProvider(w.ctx) if err != nil { w.Log.Error(err, "Failed to get GitProvider, worker exiting") @@ -394,7 +416,7 @@ func (w *BranchWorker) processEvents() { for { select { case <-w.ctx.Done(): - w.handleShutdown(provider, eventBuffer) + w.handleShutdown(eventBuffer) return case event := <-w.eventQueue: @@ -404,14 +426,14 @@ func (w *BranchWorker) processEvents() { // Check limits if len(eventBuffer) >= maxCommits || bufferByteCount >= maxBytesBytes { - w.commitAndPush(provider, eventBuffer) + w.commitAndPush(eventBuffer) eventBuffer = nil bufferByteCount = 0 } case <-pushTicker.C: if len(eventBuffer) > 0 { - w.commitAndPush(provider, eventBuffer) + w.commitAndPush(eventBuffer) eventBuffer = nil bufferByteCount = 0 } @@ -421,10 +443,7 @@ func (w *BranchWorker) processEvents() { // commitAndPush processes a batch of events. // Events may have different paths but all go to same branch. -func (w *BranchWorker) commitAndPush( - provider *configv1alpha1.GitProvider, - events []Event, -) { +func (w *BranchWorker) commitAndPush(events []Event) { w.repoMu.Lock() defer w.repoMu.Unlock() @@ -433,14 +452,19 @@ func (w *BranchWorker) commitAndPush( log.Info("Starting git commit and push", "branch", w.Branch) + provider, err := w.getGitProvider(w.ctx) + if err != nil { + log.Error(err, "Failed to get GitProvider") + return + } + auth, err := getAuthFromSecret(w.ctx, w.Client, provider) if err != nil { log.Error(err, "Failed to get auth") return } - repoPath := filepath.Join("/tmp", "gitops-reverser-workers", - w.GitProviderNamespace, w.GitProviderRef, w.Branch) + repoPath := w.repoPathForRemote(provider.Spec.URL) var result *WriteEventsResult for _, event := range events { @@ -503,12 +527,11 @@ func (w *BranchWorker) commitAndPush( // handleShutdown finalizes processing when context is canceled. 
func (w *BranchWorker) handleShutdown( - provider *configv1alpha1.GitProvider, eventBuffer []Event, ) { w.Log.Info("Handling shutdown, flushing buffer") if len(eventBuffer) > 0 { - w.commitAndPush(provider, eventBuffer) + w.commitAndPush(eventBuffer) } } @@ -623,6 +646,9 @@ func (w *BranchWorker) SyncAndGetMetadata(ctx context.Context) (*PullReport, err // syncWithRemote fetches latest changes from remote to detect drift. // This is now called by SyncAndGetMetadata() during controller reconciliation. func (w *BranchWorker) syncWithRemote(ctx context.Context) (*PullReport, error) { + w.repoMu.Lock() + defer w.repoMu.Unlock() + provider, err := w.getGitProvider(ctx) if err != nil { return nil, fmt.Errorf("failed to get GitProvider: %w", err) @@ -633,8 +659,7 @@ func (w *BranchWorker) syncWithRemote(ctx context.Context) (*PullReport, error) return nil, fmt.Errorf("failed to get auth: %w", err) } - repoPath := filepath.Join("/tmp", "gitops-reverser-workers", - w.GitProviderNamespace, w.GitProviderRef, w.Branch) + repoPath := w.repoPathForRemote(provider.Spec.URL) // PrepareBranch handles both initial and update cases report, err := PrepareBranch(ctx, provider.Spec.URL, repoPath, w.Branch, auth) @@ -666,8 +691,7 @@ func (w *BranchWorker) ensureRepositoryInitialized(ctx context.Context) error { return fmt.Errorf("failed to get auth: %w", err) } - repoPath := filepath.Join("/tmp", "gitops-reverser-workers", - w.GitProviderNamespace, w.GitProviderRef, w.Branch) + repoPath := w.repoPathForRemote(provider.Spec.URL) // Use new PrepareBranch abstraction pullReport, err := PrepareBranch(ctx, provider.Spec.URL, repoPath, w.Branch, auth) diff --git a/internal/git/branch_worker_test.go b/internal/git/branch_worker_test.go index b36d1423..ca365f91 100644 --- a/internal/git/branch_worker_test.go +++ b/internal/git/branch_worker_test.go @@ -58,6 +58,17 @@ func setupBranchWorkerTest() (*BranchWorker, func()) { return worker, cleanup } +func TestRepoCacheKey_DeterministicAndDistinct(t 
*testing.T) { + a := repoCacheKey("https://example.com/foo.git") + b := repoCacheKey("https://example.com/foo.git") + c := repoCacheKey("https://example.com/bar.git") + d := repoCacheKey(" https://example.com/foo.git ") + + require.Equal(t, a, b, "same URL should produce same cache key") + require.NotEqual(t, a, c, "different URL should produce different cache key") + require.Equal(t, a, d, "cache key should ignore surrounding whitespace") +} + // TestListResourcesInPath_BasicFunctionality verifies ListResourcesInPath can be called. func TestListResourcesInPath_BasicFunctionality(t *testing.T) { worker, cleanup := setupBranchWorkerTest() diff --git a/internal/git/content_writer.go b/internal/git/content_writer.go index 7357e7be..dc60d85a 100644 --- a/internal/git/content_writer.go +++ b/internal/git/content_writer.go @@ -23,6 +23,7 @@ import ( "crypto/sha256" "errors" "fmt" + "strings" "sync" "github.com/ConfigButler/gitops-reverser/internal/sanitize" @@ -45,6 +46,9 @@ type secretMarker struct { type contentWriter struct { encryptor Encryptor + // encryptionScope partitions encryption cache entries so cached bytes never cross + // repo/path/key boundaries (e.g. different GitTargets or rotated identities). 
+ encryptionScope string mu sync.RWMutex secretCache map[string][]byte @@ -62,10 +66,11 @@ func newContentWriter() *contentWriter { } } -func (w *contentWriter) setEncryptor(encryptor Encryptor) { +func (w *contentWriter) setEncryptor(encryptor Encryptor, scope string) { w.mu.Lock() defer w.mu.Unlock() w.encryptor = encryptor + w.encryptionScope = scope } // buildContentForWrite renders event content to stable ordered YAML and applies @@ -87,7 +92,6 @@ func (w *contentWriter) encryptSecretContent(ctx context.Context, event Event, p meta := buildResourceMeta(event) identityKey := secretIdentityKey(meta.Identifier) digest := sha256.Sum256(plain) - cacheKey := fmt.Sprintf("%s:%x", identityKey, digest) currentMarker := secretMarker{ UID: meta.UID, ResourceVersion: meta.ResourceVersion, @@ -96,7 +100,18 @@ func (w *contentWriter) encryptSecretContent(ctx context.Context, event Event, p w.mu.RLock() encryptor := w.encryptor - cached, ok := w.cachedEncryptedSecret(ctx, identityKey, cacheKey, currentMarker) + scope := w.encryptionScope + w.mu.RUnlock() + + scopedIdentityKey := identityKey + if strings.TrimSpace(scope) != "" { + scopedIdentityKey = fmt.Sprintf("%s:%s", scope, identityKey) + } + + cacheKey := fmt.Sprintf("%s:%x", scopedIdentityKey, digest) + + w.mu.RLock() + cached, ok := w.cachedEncryptedSecret(ctx, scopedIdentityKey, cacheKey, currentMarker) w.mu.RUnlock() if ok { return cached, nil @@ -122,7 +137,7 @@ func (w *contentWriter) encryptSecretContent(ctx context.Context, event Event, p w.mu.Lock() w.secretCache[cacheKey] = append([]byte(nil), encrypted...) 
- w.secretMarker[identityKey] = currentMarker + w.secretMarker[scopedIdentityKey] = currentMarker w.mu.Unlock() return encrypted, nil diff --git a/internal/git/content_writer_test.go b/internal/git/content_writer_test.go index c627704d..88a4bfac 100644 --- a/internal/git/content_writer_test.go +++ b/internal/git/content_writer_test.go @@ -135,7 +135,7 @@ func TestBuildContentForWrite_SecretEncryptionCacheMarkerReuse(t *testing.T) { writer := newContentWriter() enc := &stubEncryptor{result: []byte("encrypted: true\nsops:\n version: 3.9.0\n")} - writer.setEncryptor(enc) + writer.setEncryptor(enc, "test-scope") obj := &unstructured.Unstructured{ Object: map[string]interface{}{ @@ -177,7 +177,7 @@ func TestBuildContentForWrite_SecretUIDChangeForcesReencrypt(t *testing.T) { writer := newContentWriter() enc := &stubEncryptor{result: []byte("encrypted: true\nsops:\n version: 3.9.0\n")} - writer.setEncryptor(enc) + writer.setEncryptor(enc, "test-scope") makeSecret := func(uid string) *unstructured.Unstructured { return &unstructured.Unstructured{ @@ -220,7 +220,7 @@ func TestBuildContentForWrite_SecretUIDChangeForcesReencrypt(t *testing.T) { func TestBuildContentForWrite_SecretEncryptionFailure(t *testing.T) { writer := newContentWriter() - writer.setEncryptor(&stubEncryptor{err: errors.New("boom")}) + writer.setEncryptor(&stubEncryptor{err: errors.New("boom")}, "test-scope") event := Event{ Identifier: types.ResourceIdentifier{ @@ -249,3 +249,44 @@ func TestBuildContentForWrite_SecretEncryptionFailure(t *testing.T) { require.Error(t, err) assert.Contains(t, err.Error(), "secret encryption failed") } + +func TestBuildContentForWrite_SecretEncryptionScopeChangeForcesReencrypt(t *testing.T) { + writer := newContentWriter() + + enc := &stubEncryptor{result: []byte("encrypted: true\nsops:\n version: 3.9.0\n")} + writer.setEncryptor(enc, "scope-a") + + event := Event{ + Identifier: types.ResourceIdentifier{ + Group: "", + Version: "v1", + Resource: "secrets", + Namespace: 
"default", + Name: "my-secret", + }, + Object: &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "v1", + "kind": "Secret", + "metadata": map[string]interface{}{ + "name": "my-secret", + "namespace": "default", + }, + "data": map[string]interface{}{ + "password": "cGxhaW4=", + }, + }, + }, + } + + _, err := writer.buildContentForWrite(context.Background(), event) + require.NoError(t, err) + _, err = writer.buildContentForWrite(context.Background(), event) + require.NoError(t, err) + assert.Equal(t, 1, enc.callCount, "same scope should reuse cached encrypted content") + + writer.setEncryptor(enc, "scope-b") + _, err = writer.buildContentForWrite(context.Background(), event) + require.NoError(t, err) + assert.Equal(t, 2, enc.callCount, "scope change should force re-encryption") +} diff --git a/internal/git/encryption.go b/internal/git/encryption.go index 627d4aa7..7768fbe9 100644 --- a/internal/git/encryption.go +++ b/internal/git/encryption.go @@ -206,7 +206,7 @@ func configureSecretEncryptionWriter( cfg *ResolvedEncryptionConfig, ) error { if cfg == nil { - writer.setEncryptor(nil) + writer.setEncryptor(nil, "") return nil } @@ -216,13 +216,38 @@ func configureSecretEncryptionWriter( if err != nil { return err } - writer.setEncryptor(NewSOPSEncryptorWithEnv(defaultSOPSBinaryPath, "", workDir, environment)) + scope := secretEncryptionCacheScope(workDir, cfg) + writer.setEncryptor(NewSOPSEncryptorWithEnv(defaultSOPSBinaryPath, "", workDir, environment), scope) return nil default: return fmt.Errorf("unsupported encryption provider %q", cfg.Provider) } } +func secretEncryptionCacheScope(workDir string, cfg *ResolvedEncryptionConfig) string { + if cfg == nil { + return "" + } + + hasher := sha256.New() + hasher.Write([]byte(strings.TrimSpace(cfg.Provider))) + hasher.Write([]byte{0}) + hasher.Write([]byte(strings.TrimSpace(workDir))) + hasher.Write([]byte{0}) + + for _, recipient := range cfg.AgeRecipients { + 
hasher.Write([]byte(strings.TrimSpace(recipient))) + hasher.Write([]byte{0}) + } + for _, identity := range cfg.AgeIdentities { + hasher.Write([]byte(strings.TrimSpace(identity))) + hasher.Write([]byte{0}) + } + + sum := hasher.Sum(nil) + return hex.EncodeToString(sum[:16]) +} + func buildSOPSEnvironment(workDir string, cfg *ResolvedEncryptionConfig) (map[string]string, error) { environment := cloneEnvironment(cfg.Environment) if len(cfg.AgeIdentities) == 0 { diff --git a/internal/git/git.go b/internal/git/git.go index a99479d7..f2016368 100644 --- a/internal/git/git.go +++ b/internal/git/git.go @@ -30,6 +30,7 @@ import ( "strings" "time" + billyutil "github.com/go-git/go-billy/v5/util" "github.com/go-git/go-git/v5" "github.com/go-git/go-git/v5/config" "github.com/go-git/go-git/v5/plumbing" @@ -600,10 +601,20 @@ func cleanWorktree(r *git.Repository) error { return fmt.Errorf("failed to get worktree: %w", err) } - // Use Clean to remove all untracked files. - // Since the index is empty, this means *all* files. 
- if err := w.Clean(&git.CleanOptions{Dir: true}); err != nil { - return fmt.Errorf("failed to clean worktree: %w", err) + entries, err := w.Filesystem.ReadDir(".") + if err != nil { + return fmt.Errorf("failed to read worktree root: %w", err) + } + + for _, entry := range entries { + name := entry.Name() + if name == ".git" { + continue + } + + if err := billyutil.RemoveAll(w.Filesystem, name); err != nil { + return fmt.Errorf("failed to remove %q from worktree: %w", name, err) + } } return nil diff --git a/internal/git/git_operations_test.go b/internal/git/git_operations_test.go index 8ae001e7..369b9cf5 100644 --- a/internal/git/git_operations_test.go +++ b/internal/git/git_operations_test.go @@ -359,6 +359,51 @@ func TestWriteEvents_FirstCommitOnEmptyRepo(t *testing.T) { assert.FileExists(t, filePath) } +func TestMakeHeadUnborn_CleansWorktreeIncludingTrackedFiles(t *testing.T) { + tempDir := t.TempDir() + repoPath := filepath.Join(tempDir, "repo") + + repo, err := git.PlainInit(repoPath, false) + require.NoError(t, err) + + worktree, err := repo.Worktree() + require.NoError(t, err) + + err = os.WriteFile(filepath.Join(repoPath, "tracked.txt"), []byte("tracked"), 0600) + require.NoError(t, err) + + _, err = worktree.Add("tracked.txt") + require.NoError(t, err) + + _, err = worktree.Commit("Initial commit", &git.CommitOptions{ + Author: &object.Signature{Name: "Test", Email: "test@example.com", When: time.Now()}, + }) + require.NoError(t, err) + + err = worktree.Checkout(&git.CheckoutOptions{ + Branch: plumbing.NewBranchReferenceName("main"), + Create: true, + }) + require.NoError(t, err) + + err = os.MkdirAll(filepath.Join(repoPath, "dir/sub"), 0750) + require.NoError(t, err) + + err = os.WriteFile(filepath.Join(repoPath, "dir/sub/untracked.txt"), []byte("untracked"), 0600) + require.NoError(t, err) + + err = os.WriteFile(filepath.Join(repoPath, ".sops.yaml"), []byte("dummy"), 0600) + require.NoError(t, err) + + err = makeHeadUnborn(context.Background(), repo, 
plumbing.NewBranchReferenceName("main")) + require.NoError(t, err) + + assert.DirExists(t, filepath.Join(repoPath, ".git")) + assert.NoFileExists(t, filepath.Join(repoPath, "tracked.txt")) + assert.NoFileExists(t, filepath.Join(repoPath, ".sops.yaml")) + assert.NoDirExists(t, filepath.Join(repoPath, "dir")) +} + func TestWriteEvents_BranchCreationAndPush(t *testing.T) { tempDir := t.TempDir() diff --git a/internal/git/secret_write_test.go b/internal/git/secret_write_test.go index 4256cd90..0fb8a461 100644 --- a/internal/git/secret_write_test.go +++ b/internal/git/secret_write_test.go @@ -113,7 +113,7 @@ func TestWriteEvents_SecretWritesSOPSPath(t *testing.T) { } writer := newContentWriter() - writer.setEncryptor(&stubEncryptor{result: []byte("encrypted: true\nsops:\n version: 3.9.0\n")}) + writer.setEncryptor(&stubEncryptor{result: []byte("encrypted: true\nsops:\n version: 3.9.0\n")}, "test-scope") _, err = WriteEventsWithContentWriter(context.Background(), writer, repoPath, []Event{event}, "master", nil) require.NoError(t, err) diff --git a/internal/reconcile/folder_reconciler.go b/internal/reconcile/folder_reconciler.go index 3cc674fe..82422e9b 100644 --- a/internal/reconcile/folder_reconciler.go +++ b/internal/reconcile/folder_reconciler.go @@ -170,17 +170,17 @@ func (r *FolderReconciler) findDifferences( // Build sets for efficient lookup for _, resource := range clusterResources { - clusterSet[resource.String()] = resource + clusterSet[resource.Key()] = resource } for _, resource := range gitResources { - gitSet[resource.String()] = resource + gitSet[resource.Key()] = resource } // Find resources to create (in cluster but not in Git) var toCreate []types.ResourceIdentifier for _, resource := range clusterResources { - if _, exists := gitSet[resource.String()]; !exists { + if _, exists := gitSet[resource.Key()]; !exists { toCreate = append(toCreate, resource) } } @@ -188,7 +188,7 @@ func (r *FolderReconciler) findDifferences( // Find resources to delete (in Git 
but not in cluster) var toDelete []types.ResourceIdentifier for _, resource := range gitResources { - if _, exists := clusterSet[resource.String()]; !exists { + if _, exists := clusterSet[resource.Key()]; !exists { toDelete = append(toDelete, resource) } } @@ -196,7 +196,7 @@ func (r *FolderReconciler) findDifferences( // Find resources that exist in both cluster and Git var existingInBoth []types.ResourceIdentifier for _, resource := range clusterResources { - if _, exists := gitSet[resource.String()]; exists { + if _, exists := gitSet[resource.Key()]; exists { existingInBoth = append(existingInBoth, resource) } } diff --git a/internal/reconcile/git_target_event_stream.go b/internal/reconcile/git_target_event_stream.go index 9e4f4a5b..7c0b3987 100644 --- a/internal/reconcile/git_target_event_stream.go +++ b/internal/reconcile/git_target_event_stream.go @@ -107,7 +107,7 @@ func (s *GitTargetEventStream) OnWatchEvent(event git.Event) { case LiveProcessing: // Check for duplicates using event hash eventHash := s.computeEventHash(event) - resourceKey := event.Identifier.String() + resourceKey := event.Identifier.Key() if lastHash, exists := s.processedEventHashes[resourceKey]; exists && lastHash == eventHash { s.mu.Unlock() @@ -149,7 +149,7 @@ func (s *GitTargetEventStream) OnReconciliationComplete() { // Process all buffered events for _, event := range bufferedEvents { eventHash := s.computeEventHash(event) - resourceKey := event.Identifier.String() + resourceKey := event.Identifier.Key() s.processEvent(event, eventHash, resourceKey) } diff --git a/internal/types/identifier.go b/internal/types/identifier.go index 7170f9eb..778f5f79 100644 --- a/internal/types/identifier.go +++ b/internal/types/identifier.go @@ -59,6 +59,19 @@ func NewResourceIdentifier(group, version, resource, namespace, name string) Res } } +// Key returns a stable, fully-qualified identifier suitable for map keys and deduplication. 
+// +// Format (namespaced): "group/version/resource/namespace/name" +// Format (cluster-scoped): "group/version/resource/name" +// +// For core resources, Group is empty and the key begins with "/" (e.g., "/v1/secrets/ns/name"). +func (r ResourceIdentifier) Key() string { + if r.Namespace != "" { + return fmt.Sprintf("%s/%s/%s/%s/%s", r.Group, r.Version, r.Resource, r.Namespace, r.Name) + } + return fmt.Sprintf("%s/%s/%s/%s", r.Group, r.Version, r.Resource, r.Name) +} + // ToGitPath generates the Git repository file path following Kubernetes API structure. func (r ResourceIdentifier) ToGitPath() string { var basePath string diff --git a/internal/watch/manager.go b/internal/watch/manager.go index e725a917..c49b7218 100644 --- a/internal/watch/manager.go +++ b/internal/watch/manager.go @@ -69,7 +69,7 @@ type Manager struct { // Deduplication: tracks last seen content hash per resource to skip status-only changes lastSeenMu sync.RWMutex - lastSeenHash map[string]uint64 // resourceKey → content hash + lastSeenHash map[string]uint64 // resourceKey → content hash (key uses types.ResourceIdentifier.Key) // Dynamic informer lifecycle management informersMu sync.Mutex @@ -295,8 +295,8 @@ func (m *Manager) isDuplicateContent( } currentHash := xxhash.Sum64(yaml) - // Resource key: namespace/name (or just name for cluster-scoped) - resourceKey := id.String() + // Resource key: fully-qualified identifier (types.ResourceIdentifier.Key). 
+ resourceKey := id.Key() // Check against last seen m.lastSeenMu.RLock() @@ -1320,7 +1320,7 @@ func (m *Manager) clearDeduplicationCacheForGVRs(gvrs []GVR) { } // Remove hash entries for resources of these GVRs - // Key format: "group/version/resource/namespace/name" + // Key format: "group/version/resource/namespace/name" (group may be empty for core resources) for key := range m.lastSeenHash { if resourceMatchesGVRs(key, gvrSet) { delete(m.lastSeenHash, key) @@ -1332,7 +1332,8 @@ func (m *Manager) clearDeduplicationCacheForGVRs(gvrs []GVR) { } // resourceMatchesGVRs checks if a resource key matches any GVR in the set. -// Key format: "group/version/resource/namespace/name". +// Key format: "group/version/resource/namespace/name" or "group/version/resource/name" (cluster-scoped). +// Group may be empty for core resources, which yields a key like "/v1/secrets/ns/name". func resourceMatchesGVRs(resourceKey string, gvrSet map[GVR]struct{}) bool { // Parse key components parts := splitResourceKey(resourceKey) @@ -1361,8 +1362,8 @@ func resourceMatchesGVRs(resourceKey string, gvrSet map[GVR]struct{}) bool { // splitResourceKey splits a resource key into components. // Format: "group/version/resource/namespace/name" or "group/version/resource/name" (cluster-scoped). func splitResourceKey(key string) []string { - // ResourceIdentifier.String() produces: "group/version/resource/namespace/name" - // We just need the first 3 parts for GVR matching. + // types.ResourceIdentifier.Key() produces: "group/version/resource/namespace/name" (or ".../name" cluster-scoped). + // We just need the first 3 parts for GVR matching, including an empty group for core resources. 
parts := make([]string, 0, resourceKeyCapacity) current := "" for _, ch := range key { diff --git a/test/e2e/E2E_DEBUGGING.md b/test/e2e/E2E_DEBUGGING.md index ba3fa69a..baff1222 100644 --- a/test/e2e/E2E_DEBUGGING.md +++ b/test/e2e/E2E_DEBUGGING.md @@ -4,6 +4,14 @@ After running e2e tests, the infrastructure remains running for debugging purposes. +### Kind Cluster Names by Test Type + +- `make test-e2e` uses `KIND_CLUSTER_E2E` (default: `gitops-reverser-test-e2e`) +- `make test-e2e-quickstart-helm` uses `KIND_CLUSTER_QUICKSTART_HELM` (default: `gitops-reverser-test-e2e-quickstart-helm`) +- `make test-e2e-quickstart-manifest` uses `KIND_CLUSTER_QUICKSTART_MANIFEST` (default: `gitops-reverser-test-e2e-quickstart-manifest`) + +This separation avoids cross-test contamination between end-to-end and quickstart install smoke tests. + ### Port-Forward Management The `test-e2e` target automatically starts port-forwards, so services are immediately accessible: @@ -22,17 +30,17 @@ This exposes: make cleanup-port-forwards ``` -**Note:** The `make test-e2e` and `make e2e-setup` targets automatically run `setup-port-forwards`, so services are ready immediately after setup. +**Note:** The `make test-e2e` and `make setup-e2e` targets automatically run `setup-port-forwards`, so services are ready immediately after setup. ## Useful Prometheus Queries ### Check Pod Status ```promql # Are both controller pods being scraped? 
-up{job="gitops-reverser-metrics"} +up{job="gitops-reverser"} # Count of active pods -count(up{job="gitops-reverser-metrics"}) +count(up{job="gitops-reverser"}) ``` ### Webhook Events @@ -48,13 +56,13 @@ gitopsreverser_events_received_total{role!="leader"} ### Resource Metrics ```promql # CPU usage -process_cpu_seconds_total{job="gitops-reverser-metrics"} +process_cpu_seconds_total{job="gitops-reverser"} # Memory usage -process_resident_memory_bytes{job="gitops-reverser-metrics"} +process_resident_memory_bytes{job="gitops-reverser"} # Goroutines -go_goroutines{job="gitops-reverser-metrics"} +go_goroutines{job="gitops-reverser"} ``` ## Network Architecture @@ -65,8 +73,8 @@ Host Machine (port 13000, 19090) DevContainer ↕ (kubectl port-forward) Kind Cluster - ├─ prometheus-e2e namespace - │ └─ Prometheus (scrapes metrics via HTTPS + bearer token) + ├─ prometheus-operator namespace + │ └─ Prometheus Operator + Prometheus (scrapes metrics via ServiceMonitor) ├─ gitea-e2e namespace │ └─ Gitea (Git server) └─ sut namespace (System Under Test) @@ -93,7 +101,7 @@ Kind Cluster 4. **View controller logs:** ```bash - kubectl logs -n sut -l control-plane=controller-manager --tail=100 + kubectl logs -n sut -l control-plane=gitops-reverser --tail=100 ``` 5. 
**Check Prometheus scrape status:** @@ -102,11 +110,13 @@ Kind Cluster ## Cleanup ```bash -# Clean up all e2e infrastructure -make e2e-cleanup +# Clean up all E2E Kind clusters (all test types) +make cleanup-e2e-clusters -# Or individually: -make cleanup-prometheus-e2e +# Or clean one specific cluster name: +make cleanup-cluster KIND_CLUSTER=gitops-reverser-test-e2e + +# Infra cleanup inside the active cluster: make cleanup-gitea-e2e ``` @@ -115,6 +125,10 @@ make cleanup-gitea-e2e ```bash make setup-port-forwards # Start port-forwards (Gitea:13000, Prometheus:19090) make cleanup-port-forwards # Stop all port-forwards -make e2e-setup # Setup Gitea + Prometheus + port-forwards +make ensure-prometheus-operator +make setup-e2e # Setup Gitea + Prometheus Operator (+ cert-manager) make test-e2e # Run e2e tests (includes port-forwards) -make e2e-cleanup # Clean up all infrastructure +make test-e2e-quickstart-helm +make test-e2e-quickstart-manifest +make cleanup-e2e-clusters # Delete all E2E test clusters +``` diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index bdd973ad..ad119805 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -21,30 +21,12 @@ package e2e import ( "fmt" "os" - "os/exec" "testing" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - - "github.com/ConfigButler/gitops-reverser/test/utils" -) - -var ( - // projectImage is the name of the image to be tested. - // In CI, this is provided via PROJECT_IMAGE environment variable. - // For local testing, defaults to locally built image. - projectImage = getProjectImage() ) -// getProjectImage returns the project image name from environment or default. -func getProjectImage() string { - if img := os.Getenv("PROJECT_IMAGE"); img != "" { - return img - } - return "gitops-reverser:e2e-local" -} - // TestE2E runs the end-to-end (e2e) test suite for the project. 
These tests execute in an isolated, // temporary environment to validate project changes with the purpose of being used in CI jobs. // The default setup requires Kind, builds/loads the Manager Docker image locally, and installs @@ -56,18 +38,13 @@ func TestE2E(t *testing.T) { } var _ = BeforeSuite(func() { - // Check if image is pre-built (CI environment) - if os.Getenv("PROJECT_IMAGE") != "" { - By(fmt.Sprintf("using pre-built image: %s", projectImage)) - // Image is already built and loaded in CI pipeline - return + if img := os.Getenv("PROJECT_IMAGE"); img == "" { + // In local runs, the Makefile guarantees the cluster and image are ready + // before go test is invoked. Nothing to do here. + By("local run: cluster and image prepared by Makefile") + } else { + By(fmt.Sprintf("using pre-built image: %s", img)) } - - // IDE/direct go test path: ensure cluster exists and local image is built+loaded via Makefile. - By("PROJECT_IMAGE is not set; preparing cluster/image through Makefile for local run") - cmd := exec.Command("make", "setup-cluster", "e2e-build-load-image", fmt.Sprintf("PROJECT_IMAGE=%s", projectImage)) - _, err := utils.Run(cmd) - ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to build/load manager image via Makefile") }) var _ = AfterSuite(func() { diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index d9288c7f..c3067340 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -41,7 +41,7 @@ import ( // giteaRepoURLTemplate is the URL template for test Gitea repositories. 
const giteaRepoURLTemplate = "http://gitea-http.gitea-e2e.svc.cluster.local:13000/testorg/%s.git" const giteaSSHURLTemplate = "ssh://git@gitea-ssh.gitea-e2e.svc.cluster.local:2222/testorg/%s.git" -const e2eAgeKeyPath = "/tmp/e2e-age-key.txt" +const defaultE2EAgeKeyPath = "/tmp/e2e-age-key.txt" var testRepoName string var checkoutDir string @@ -59,53 +59,10 @@ func getRepoURLSSH() string { var _ = Describe("Manager", Ordered, func() { var controllerPodName string // Name of first controller pod for logging - // Before running the tests, set up the environment by creating the namespace, - // enforce the restricted security policy to the namespace, installing CRDs, - // deploying the controller, and setting up Gitea. + // Before running the tests, set up per-run test fixtures. BeforeAll(func() { - By("preventive namespace cleanup") var err error - // Wait for the namespace to be fully deleted to prevent race conditions with lingering pods - cmd := exec.Command("kubectl", "delete", "ns", namespace, "--wait=true", "--ignore-not-found=true") - _, _ = utils.Run(cmd) - - By("preventive CRD cleanup") - cmd = exec.Command("kubectl", "delete", "crd", "icecreamorders.shop.example.com", "--ignore-not-found=true") - _, _ = utils.Run(cmd) - - By("creating manager namespace") - cmd = exec.Command("kubectl", "create", "ns", namespace) - _, err = utils.Run(cmd) - if err != nil { - // Namespace might already exist from Gitea setup - check if it's AlreadyExists error - By("checking if namespace already exists") - checkCmd := exec.Command("kubectl", "get", "ns", namespace) - _, checkErr := utils.Run(checkCmd) - if checkErr != nil { - Expect(err).NotTo(HaveOccurred(), "Failed to create namespace and namespace doesn't exist") - } - } - - By("labeling the namespace to enforce the restricted security policy") - cmd = exec.Command("kubectl", "label", "--overwrite", "ns", namespace, - "pod-security.kubernetes.io/enforce=restricted") - _, err = utils.Run(cmd) - 
Expect(err).NotTo(HaveOccurred(), "Failed to label namespace with restricted policy") - - setupSOPSAgeSecret(e2eAgeKeyPath) - - By("installing CRDs") - cmd = exec.Command("make", "install") - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed to install CRDs") - - By("deploying the controller-manager") - cmd = exec.Command("make", "deploy", fmt.Sprintf("IMG=%s", projectImage)) - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed to deploy the controller-manager") - - By("waiting for certificate secrets to be created by cert-manager") - waitForCertificateSecrets() + var cmd *exec.Cmd By("setting up Gitea test environment with unique repository") companyStart := time.Date(2025, 5, 12, 0, 0, 0, 0, time.UTC) @@ -176,11 +133,11 @@ var _ = Describe("Manager", Ordered, func() { Context("Manager", func() { It("should run successfully", func() { - By("validating that the controller-manager pods are running as expected") + By("validating that the gitops-reverser pods are running as expected") verifyControllerUp := func(g Gomega) { - // Get the names of the controller-manager pods + // Get the names of the gitops-reverser pods cmd := exec.Command("kubectl", "get", - "pods", "-l", "control-plane=controller-manager", + "pods", "-l", "control-plane=gitops-reverser", "-o", "go-template={{ range .items }}"+ "{{ if not .metadata.deletionTimestamp }}"+ "{{ .metadata.name }}"+ @@ -189,11 +146,11 @@ var _ = Describe("Manager", Ordered, func() { ) podOutput, err := utils.Run(cmd) - g.Expect(err).NotTo(HaveOccurred(), "Failed to retrieve controller-manager pod information") + g.Expect(err).NotTo(HaveOccurred(), "Failed to retrieve gitops-reverser pod information") podNames := utils.GetNonEmptyLines(podOutput) g.Expect(podNames).To(HaveLen(1), "expected exactly 1 controller pod running") controllerPodName = podNames[0] // Use first pod for logging - g.Expect(controllerPodName).To(ContainSubstring("controller-manager")) + 
g.Expect(controllerPodName).To(ContainSubstring("gitops-reverser")) // Validate all pods' status for _, podName := range podNames { @@ -226,7 +183,7 @@ var _ = Describe("Manager", Ordered, func() { // Skip warning lines if !strings.HasPrefix(line, "Warning:") && !strings.Contains(line, "deprecated") && - strings.Contains(line, "controller-manager") { + strings.Contains(line, "gitops-reverser") { podSet[line] = struct{}{} } } @@ -263,7 +220,7 @@ var _ = Describe("Manager", Ordered, func() { for _, line := range lines { if !strings.HasPrefix(line, "Warning:") && !strings.Contains(line, "deprecated") && - strings.Contains(line, "controller-manager") { + strings.Contains(line, "gitops-reverser") { podSet[line] = struct{}{} } } @@ -311,25 +268,32 @@ var _ = Describe("Manager", Ordered, func() { cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("controller-runtime.metrics\tServing metrics server"), - "Metrics server not yet started") + jsonMetricsLogLine := "\"logger\":\"controller-runtime.metrics\"," + + "\"msg\":\"Serving metrics server\"" + g.Expect(output).To( + SatisfyAny( + ContainSubstring("controller-runtime.metrics\tServing metrics server"), + ContainSubstring(jsonMetricsLogLine), + ), + "Metrics server not yet started", + ) } Eventually(verifyMetricsServerStarted).Should(Succeed()) By("waiting for Prometheus to scrape controller metrics") - waitForMetric("up{job='gitops-reverser-metrics'}", + waitForMetric("sum(up{job='gitops-reverser'})", func(v float64) bool { return v == 1 }, "metrics endpoint should be up") By("verifying basic process metrics are exposed") - waitForMetric("process_cpu_seconds_total{job='gitops-reverser-metrics'}", + waitForMetric("sum(process_cpu_seconds_total{job='gitops-reverser'})", func(v float64) bool { return v > 0 }, "process metrics should exist") By("verifying metrics from the controller pod") - 
podCount, err := queryPrometheus("count(up{job='gitops-reverser-metrics'})") + podCount, err := queryPrometheus("sum(up{job='gitops-reverser'})") Expect(err).NotTo(HaveOccurred()) - Expect(podCount).To(Equal(1.0), "Should scrape from 1 controller pod") + Expect(podCount).To(BeEquivalentTo(1), "Should scrape from 1 controller pod") fmt.Printf("✅ Metrics collection verified from %.0f pods\n", podCount) fmt.Printf("📊 Inspect metrics: %s\n", getPrometheusURL()) @@ -594,7 +558,7 @@ var _ = Describe("Manager", Ordered, func() { HaveOccurred(), fmt.Sprintf(".sops.yaml must exist at %s", bootstrapSOPSFile), ) - ageKey, ageKeyErr := readSOPSAgeKeyFromFile(e2eAgeKeyPath) + ageKey, ageKeyErr := readSOPSAgeKeyFromFile(getE2EAgeKeyPath()) g.Expect(ageKeyErr).NotTo(HaveOccurred(), "Should read age private key") recipient, recipientErr := deriveAgeRecipient(ageKey) g.Expect(recipientErr).NotTo(HaveOccurred(), "Should derive age recipient") @@ -807,7 +771,7 @@ var _ = Describe("Manager", Ordered, func() { By("waiting for controller reconciliation of ConfigMap event") verifyReconciliationLogs := func(g Gomega) { // Get controller logs from all pods (single-pod mode still uses label selector). 
- cmd := exec.Command("kubectl", "logs", "-l", "control-plane=controller-manager", + cmd := exec.Command("kubectl", "logs", "-l", "control-plane=gitops-reverser", "-n", namespace, "--tail=500", "--prefix=true") output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) @@ -1155,7 +1119,7 @@ var _ = Describe("Manager", Ordered, func() { By("waiting for controller reconciliation of CRD instance event") verifyReconciliationLogs := func(g Gomega) { - cmd := exec.Command("kubectl", "logs", "-l", "control-plane=controller-manager", + cmd := exec.Command("kubectl", "logs", "-l", "control-plane=gitops-reverser", "-n", namespace, "--tail=500", "--prefix=true") output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) @@ -1594,6 +1558,13 @@ func readSOPSAgeKeyFromFile(path string) (string, error) { return "", fmt.Errorf("no AGE-SECRET-KEY found in %s", path) } +func getE2EAgeKeyPath() string { + if value := strings.TrimSpace(os.Getenv("E2E_AGE_KEY_FILE")); value != "" { + return value + } + return defaultE2EAgeKeyPath +} + func deriveAgeRecipient(identityString string) (string, error) { identity, err := age.ParseX25519Identity(strings.TrimSpace(identityString)) if err != nil { @@ -1605,7 +1576,7 @@ func deriveAgeRecipient(identityString string) (string, error) { func decryptWithControllerSOPS(ciphertext []byte, ageKey string) (string, error) { podCmd := exec.Command( "kubectl", "get", "pods", - "-l", "control-plane=controller-manager", + "-l", "control-plane=gitops-reverser", "-n", namespace, "-o", "jsonpath={.items[0].metadata.name}", ) @@ -1740,7 +1711,7 @@ func showControllerLogs(context string) { By(fmt.Sprintf("📋 Controller logs %s:", context)) // Get the controller pod name dynamically - cmd := exec.Command("kubectl", "get", "pods", "-l", "control-plane=controller-manager", + cmd := exec.Command("kubectl", "get", "pods", "-l", "control-plane=gitops-reverser", "-o", "jsonpath={.items[0].metadata.name}", "-n", namespace) podName, err := utils.Run(cmd) if 
err != nil { diff --git a/test/e2e/helpers.go b/test/e2e/helpers.go index fae8517c..ac1ecb1e 100644 --- a/test/e2e/helpers.go +++ b/test/e2e/helpers.go @@ -26,13 +26,11 @@ import ( "bytes" "context" "fmt" - "os" "os/exec" "strings" "text/template" "time" - "filippo.io/age" . "github.com/onsi/ginkgo/v2" //nolint:staticcheck // Ginkgo standard practice . "github.com/onsi/gomega" //nolint:staticcheck // Ginkgo standard practice "github.com/prometheus/client_golang/api" @@ -170,30 +168,6 @@ func applyFromTemplate(templatePath string, data interface{}, namespace string, return err } -// waitForCertificateSecrets waits for cert-manager to create the required certificate secrets -// This prevents race conditions where controller pods try to mount secrets before they exist -func waitForCertificateSecrets() { - By("waiting for webhook certificate secret to be created by cert-manager") - Eventually(func(g Gomega) { - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) //nolint:mnd // reasonable timeout - defer cancel() - cmd := exec.CommandContext(ctx, "kubectl", "get", "secret", "admission-server-cert", "-n", namespace) - _, err := utils.Run(cmd) - g.Expect(err).NotTo(HaveOccurred(), "admission-server-cert secret should exist") - }, 60*time.Second, 2*time.Second).Should(Succeed()) //nolint:mnd // reasonable timeout for cert-manager - - By("waiting for dedicated audit certificate secret to be created by cert-manager") - Eventually(func(g Gomega) { - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) //nolint:mnd // reasonable timeout - defer cancel() - cmd := exec.CommandContext(ctx, "kubectl", "get", "secret", "audit-server-cert", "-n", namespace) - _, err := utils.Run(cmd) - g.Expect(err).NotTo(HaveOccurred(), "audit-server-cert secret should exist") - }, 60*time.Second, 2*time.Second).Should(Succeed()) //nolint:mnd // reasonable timeout for cert-manager - - By("✅ All certificate secrets are ready") -} - // getBaseFolder returns the 
baseFolder used by GitDestination in e2e tests. // Must satisfy the CRD validation (POSIX-like relative path, no traversal). func getBaseFolder() string { @@ -280,31 +254,3 @@ func cleanupClusterWatchRule(name string) { "--ignore-not-found=true") _, _ = utils.Run(cmd) } - -func setupSOPSAgeSecret(keyPath string) { - By("creating SOPS age key and encryption secret for e2e") - - identity, err := age.GenerateX25519Identity() - Expect(err).NotTo(HaveOccurred(), "Failed to generate age identity") - - // Keep file format compatible with existing key reader in e2e tests. - keyFileContent := fmt.Sprintf("# created: e2e\n# public key: %s\n%s\n", identity.Recipient(), identity) - err = os.WriteFile(keyPath, []byte(keyFileContent), 0600) - Expect(err).NotTo(HaveOccurred(), "Failed to write e2e age key file") - - manifest := fmt.Sprintf(`apiVersion: v1 -kind: Secret -metadata: - name: %s - namespace: %s -type: Opaque -stringData: - identity.agekey: %q -`, e2eEncryptionRefName, namespace, identity.String()) - - ctx := context.Background() - applyCmd := exec.CommandContext(ctx, "kubectl", "apply", "-f", "-") - applyCmd.Stdin = strings.NewReader(manifest) - _, err = utils.Run(applyCmd) - Expect(err).NotTo(HaveOccurred(), "Failed to apply SOPS encryption secret") -} diff --git a/test/e2e/kind/start-cluster.sh b/test/e2e/kind/start-cluster.sh index bd3ba28f..c60a57d3 100755 --- a/test/e2e/kind/start-cluster.sh +++ b/test/e2e/kind/start-cluster.sh @@ -9,23 +9,6 @@ CONFIG_FILE="test/e2e/kind/cluster.ignore.yaml" KIND_CREATE_LOG_FILE="${TMPDIR:-/tmp}/kind-create-${CLUSTER_NAME}.log" POD_SUBNET="${KIND_POD_SUBNET:-10.244.0.0/16}" -# Check if HOST_PROJECT_PATH is set -if [ -z "$HOST_PROJECT_PATH" ]; then - echo "❌ ERROR: HOST_PROJECT_PATH environment variable is not set" - echo "This should be set in .devcontainer/devcontainer.json" - exit 1 -fi - -echo "🔧 Using HOST_PROJECT_PATH: $HOST_PROJECT_PATH" - -# Use envsubst to replace ${HOST_PROJECT_PATH} in template -echo "📝 Generating Kind 
cluster configuration from template..." -envsubst < "$TEMPLATE_FILE" > "$CONFIG_FILE" - -echo "✅ Generated configuration:" -cat "$CONFIG_FILE" -echo "" - create_primary_cluster() { kind create cluster --name "$CLUSTER_NAME" --config "$CONFIG_FILE" --wait 5m --retain 2>&1 | tee "$KIND_CREATE_LOG_FILE" } @@ -82,6 +65,21 @@ run_dood_self_heal() { if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then echo "♻️ Reusing existing Kind cluster '$CLUSTER_NAME' (no delete/recreate)" else + # HOST_PROJECT_PATH is only needed when creating a new cluster (for host-path volume mounts). + # Clusters pre-created by CI (helm/kind-action) don't need it. + if [ -z "${HOST_PROJECT_PATH:-}" ]; then + echo "❌ ERROR: HOST_PROJECT_PATH environment variable is not set" + echo "This should be set in .devcontainer/devcontainer.json or passed by CI" + exit 1 + fi + + echo "🔧 Using HOST_PROJECT_PATH: $HOST_PROJECT_PATH" + echo "📝 Generating Kind cluster configuration from template..." + envsubst < "$TEMPLATE_FILE" > "$CONFIG_FILE" + echo "✅ Generated configuration:" + cat "$CONFIG_FILE" + echo "" + echo "🚀 Creating Kind cluster '$CLUSTER_NAME' with audit webhook support..." if create_primary_cluster; then echo "✅ Kind cluster created successfully" @@ -102,11 +100,15 @@ current_server="$(kubectl config view --minify -o jsonpath='{.clusters[0].cluste if [[ "$current_server" =~ ^https://(127\.0\.0\.1|localhost|0\.0\.0\.0):([0-9]+)$ ]]; then apiserver_port="${BASH_REMATCH[2]}" - echo "🔁 Rewriting kubeconfig server endpoint to host.docker.internal:${apiserver_port}..." - kubectl config set-cluster "$current_cluster_name" \ - --server="https://host.docker.internal:${apiserver_port}" \ - --tls-server-name=localhost >/dev/null - echo "✅ kubeconfig endpoint updated for devcontainer networking" + if getent hosts host.docker.internal >/dev/null 2>&1; then + echo "🔁 Rewriting kubeconfig server endpoint to host.docker.internal:${apiserver_port}..." 
+ kubectl config set-cluster "$current_cluster_name" \ + --server="https://host.docker.internal:${apiserver_port}" \ + --tls-server-name=localhost >/dev/null + echo "✅ kubeconfig endpoint updated for devcontainer networking" + else + echo "ℹ️ host.docker.internal not resolvable; keeping server as ${current_server} (--network host or native environment)" + fi else echo "ℹ️ kubeconfig server is '$current_server' (no rewrite needed)" fi diff --git a/test/e2e/prometheus/deployment.yaml b/test/e2e/prometheus/deployment.yaml deleted file mode 100644 index 22ced0de..00000000 --- a/test/e2e/prometheus/deployment.yaml +++ /dev/null @@ -1,104 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prometheus - namespace: prometheus-e2e ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-config - namespace: prometheus-e2e -data: - prometheus.yml: | - global: - scrape_interval: 5s - evaluation_interval: 5s - scrape_timeout: 4s - - scrape_configs: - # Scrape gitops-reverser metrics from the single controller Service in 'sut' - - job_name: 'gitops-reverser-metrics' - scheme: http - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - static_configs: - - targets: - - gitops-reverser-service.sut.svc.cluster.local:8443 ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus - namespace: prometheus-e2e - labels: - app: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app: prometheus - template: - metadata: - labels: - app: prometheus - spec: - serviceAccountName: prometheus - securityContext: - runAsNonRoot: true - runAsUser: 65534 - fsGroup: 65534 - containers: - - name: prometheus - image: prom/prometheus:v2.48.0 - args: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.retention.time=2h' - - '--storage.tsdb.path=/prometheus' - - '--web.enable-lifecycle' - ports: - - name: http - containerPort: 9090 - livenessProbe: - httpGet: - path: /-/healthy - port: http - initialDelaySeconds: 10 - 
readinessProbe: - httpGet: - path: /-/ready - port: http - initialDelaySeconds: 5 - volumeMounts: - - name: config - mountPath: /etc/prometheus - - name: storage - mountPath: /prometheus - resources: - requests: - cpu: 100m - memory: 128Mi - limits: - cpu: 500m - memory: 512Mi - volumes: - - name: config - configMap: - name: prometheus-config - - name: storage - emptyDir: {} ---- -apiVersion: v1 -kind: Service -metadata: - name: prometheus - namespace: prometheus-e2e - labels: - app: prometheus -spec: - type: ClusterIP - selector: - app: prometheus - ports: - - name: http - port: 19090 - targetPort: http diff --git a/test/e2e/prometheus/namespace.yaml b/test/e2e/prometheus/namespace.yaml deleted file mode 100644 index 2b45c439..00000000 --- a/test/e2e/prometheus/namespace.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: prometheus-e2e - labels: - app: prometheus-e2e \ No newline at end of file diff --git a/test/e2e/prometheus/rbac.yaml b/test/e2e/prometheus/rbac.yaml deleted file mode 100644 index fe0ca085..00000000 --- a/test/e2e/prometheus/rbac.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Allow Prometheus to scrape metrics from sut namespace -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: prometheus-e2e-scraper -rules: - - apiGroups: [""] - resources: - - nodes - - nodes/proxy - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: - - configmaps - verbs: ["get"] - - nonResourceURLs: ["/metrics"] - verbs: ["get"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: prometheus-e2e-scraper -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-e2e-scraper -subjects: - - kind: ServiceAccount - name: prometheus - namespace: prometheus-e2e ---- -# Allow Prometheus service account to access metrics endpoints -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: 
prometheus-metrics-reader -rules: - - nonResourceURLs: - - /metrics - verbs: - - get ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: prometheus-metrics-reader -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-metrics-reader -subjects: - - kind: ServiceAccount - name: prometheus - namespace: prometheus-e2e \ No newline at end of file diff --git a/test/e2e/scripts/ensure-prometheus-operator.sh b/test/e2e/scripts/ensure-prometheus-operator.sh new file mode 100755 index 00000000..a94b5391 --- /dev/null +++ b/test/e2e/scripts/ensure-prometheus-operator.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -euo pipefail + +PROM_OPERATOR_VERSION="${PROM_OPERATOR_VERSION:-0.89.0}" +PROM_OPERATOR_NAMESPACE="${PROM_OPERATOR_NAMESPACE:-prometheus-operator}" + +is_fully_installed() { + kubectl get deployment prometheus-operator -n "${PROM_OPERATOR_NAMESPACE}" >/dev/null 2>&1 && + kubectl get crd prometheuses.monitoring.coreos.com >/dev/null 2>&1 && + kubectl get crd servicemonitors.monitoring.coreos.com >/dev/null 2>&1 +} + +if is_fully_installed; then + echo "✅ Prometheus Operator already installed and configured in ${PROM_OPERATOR_NAMESPACE}" + exit 0 +fi + +echo "Installing Prometheus Operator v${PROM_OPERATOR_VERSION} in namespace ${PROM_OPERATOR_NAMESPACE}..." 
+kubectl create namespace "${PROM_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +tmp_dir="$(mktemp -d)" +cleanup() { + rm -rf "${tmp_dir}" +} +trap cleanup EXIT + +cat > "${tmp_dir}/kustomization.yaml" </dev/null || true kubectl delete clusterrolebinding \ - gitops-reverser-manager-rolebinding \ + gitops-reverser \ gitops-reverser-proxy-rolebinding \ --ignore-not-found=true >/dev/null || true kubectl delete validatingwebhookconfiguration gitops-reverser-validating-webhook-configuration \ diff --git a/test/e2e/scripts/setup-port-forwards.sh b/test/e2e/scripts/setup-port-forwards.sh index a9021e13..1c54ead1 100755 --- a/test/e2e/scripts/setup-port-forwards.sh +++ b/test/e2e/scripts/setup-port-forwards.sh @@ -3,7 +3,8 @@ set -euo pipefail # Configuration GITEA_NAMESPACE=${GITEA_NAMESPACE:-gitea-e2e} -PROMETHEUS_NAMESPACE=${PROMETHEUS_NAMESPACE:-prometheus-e2e} +PROMETHEUS_NAMESPACE=${PROMETHEUS_NAMESPACE:-prometheus-operator} +PROMETHEUS_INSTANCE_NAME=${PROMETHEUS_INSTANCE_NAME:-gitops-reverser} # Cleanup old port-forwards echo "🧹 Cleaning up old port-forwards..." @@ -13,45 +14,48 @@ sleep 1 echo "🔌 Setting up port-forwards for e2e testing..." -# Wait for pods to be ready before attempting port-forwards echo "⏳ Waiting for Prometheus pod to be ready..." kubectl wait --for=condition=ready pod \ - -l app=prometheus \ + -l "prometheus=${PROMETHEUS_INSTANCE_NAME}" \ -n "$PROMETHEUS_NAMESPACE" \ - --timeout=60s || { + --timeout=180s || { echo "❌ Prometheus pod failed to become ready" + kubectl get pods -n "$PROMETHEUS_NAMESPACE" -l "prometheus=${PROMETHEUS_INSTANCE_NAME}" || true exit 1 } +echo "✅ Prometheus pod is ready" + echo "⏳ Waiting for Gitea pod to be ready..." 
kubectl wait --for=condition=ready pod \ -l app.kubernetes.io/name=gitea \ -n "$GITEA_NAMESPACE" \ - --timeout=60s || { + --timeout=180s || { echo "❌ Gitea pod failed to become ready" exit 1 } -echo "✅ All pods are ready" +echo "✅ Gitea pod is ready" # Generic function to setup a port-forward with verification -# Args: service_name namespace service port health_check_url +# Args: service_name namespace service local_port remote_port setup_port_forward() { local service_name="$1" local namespace="$2" local service="$3" - local port="$4" - local health_url="$5" + local local_port="$4" + local remote_port="$5" # Check if already running - if ps aux | grep -E "kubectl.*port-forward.*${service}.*${port}" | grep -v grep >/dev/null 2>&1; then + if ps aux | grep -E "kubectl.*port-forward.*${service}.*${local_port}:${remote_port}" | grep -v grep >/dev/null 2>&1; then echo "✅ ${service_name} port-forward already running" return 0 fi echo "🔌 Starting ${service_name} port-forward..." local pf_log="/tmp/${service_name}-pf.log" - kubectl port-forward --address 127.0.0.1 -n "$namespace" "svc/${service}" "${port}:${port}" > "$pf_log" 2>&1 & + kubectl port-forward --address 127.0.0.1 \ + -n "$namespace" "svc/${service}" "${local_port}:${remote_port}" > "$pf_log" 2>&1 & local pf_pid=$! 
# Give it time to establish @@ -71,7 +75,7 @@ setup_port_forward() { fi # Try to connect to the port - if timeout 2 bash -c "echo >/dev/tcp/localhost/${port}" 2>/dev/null; then + if timeout 2 bash -c "echo >/dev/tcp/localhost/${local_port}" 2>/dev/null; then echo "✅ ${service_name} port-forward verified and working" return 0 fi @@ -87,8 +91,8 @@ setup_port_forward() { } # Setup port-forwards -setup_port_forward "Prometheus" "$PROMETHEUS_NAMESPACE" "prometheus" "19090" "http://localhost:19090/-/ready" -setup_port_forward "Gitea" "$GITEA_NAMESPACE" "gitea-http" "13000" "http://localhost:13000/api/v1/version" +setup_port_forward "Prometheus" "$PROMETHEUS_NAMESPACE" "prometheus-operated" "19090" "9090" +setup_port_forward "Gitea" "$GITEA_NAMESPACE" "gitea-http" "13000" "13000" echo "" echo "📊 Prometheus: http://localhost:19090" diff --git a/test/e2e/scripts/setup-prometheus.sh b/test/e2e/scripts/setup-prometheus.sh deleted file mode 100755 index 46565591..00000000 --- a/test/e2e/scripts/setup-prometheus.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -set -e - -echo "Setting up Prometheus for e2e testing..." - -# Create namespace -echo "Creating prometheus-e2e namespace..." -kubectl create namespace prometheus-e2e --dry-run=client -o yaml | kubectl apply -f - - -# Apply RBAC -echo "Applying Prometheus RBAC..." -kubectl apply -f test/e2e/prometheus/rbac.yaml - -# Deploy Prometheus -echo "Deploying Prometheus..." -kubectl apply -f test/e2e/prometheus/deployment.yaml - -echo "Restarting Prometheus deployment to pick up ConfigMap changes..." 
-kubectl rollout restart deployment/prometheus -n prometheus-e2e -kubectl rollout status deployment/prometheus -n prometheus-e2e --timeout=120s - -echo "✅ Prometheus manifests deployed" diff --git a/test/e2e/tools/gen-age-key/main.go b/test/e2e/tools/gen-age-key/main.go new file mode 100644 index 00000000..fa83b949 --- /dev/null +++ b/test/e2e/tools/gen-age-key/main.go @@ -0,0 +1,96 @@ +/* +SPDX-License-Identifier: Apache-2.0 + +Copyright 2025 ConfigButler + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "flag" + "fmt" + "os" + "path/filepath" + "time" + + "filippo.io/age" +) + +const ( + keyFileMode = 0o600 + secretFileMode = 0o644 + dirMode = 0o750 +) + +func main() { + var keyFile string + var secretFile string + var namespace string + var secretName string + + flag.StringVar(&keyFile, "key-file", "", "path to write age identity file") + flag.StringVar(&secretFile, "secret-file", "", "path to write Kubernetes Secret manifest") + flag.StringVar(&namespace, "namespace", "", "Secret namespace") + flag.StringVar(&secretName, "secret-name", "", "Secret name") + flag.Parse() + + if keyFile == "" || secretFile == "" || namespace == "" || secretName == "" { + exitf("all flags are required: --key-file --secret-file --namespace --secret-name") + } + + identity, err := age.GenerateX25519Identity() + if err != nil { + exitf("generate age identity: %v", err) + } + + keyContent := fmt.Sprintf( + "# created: %s\n# public key: %s\n%s\n", + 
time.Now().UTC().Format(time.RFC3339), + identity.Recipient(), + identity.String(), + ) + if err := writeFile(keyFile, []byte(keyContent), keyFileMode); err != nil { + exitf("write key file: %v", err) + } + + secretContent := fmt.Sprintf(`apiVersion: v1 +kind: Secret +metadata: + name: %s + namespace: %s +type: Opaque +stringData: + identity.agekey: | + %s +`, secretName, namespace, identity.String()) + if err := writeFile(secretFile, []byte(secretContent), secretFileMode); err != nil { + exitf("write secret file: %v", err) + } +} + +func writeFile(path string, content []byte, mode os.FileMode) error { + parent := filepath.Dir(path) + if parent != "" && parent != "." { + if err := os.MkdirAll(parent, dirMode); err != nil { + return err + } + } + return os.WriteFile(path, content, mode) +} + +func exitf(format string, args ...any) { + _, _ = fmt.Fprintf(os.Stderr, format+"\n", args...) + os.Exit(1) +} diff --git a/test/improving.md b/test/improving.md new file mode 100644 index 00000000..d5ee5e97 --- /dev/null +++ b/test/improving.md @@ -0,0 +1,29 @@ +I've learned a lot on the Makefile and running tests: the Tiltfile really got me into this direction and now it makes me wonder how I should 'thank it'. + +At this moment I have 3 seperate e2e tests that all create their own cluster. + +I think / feel that they could be combined, and even could be ran in parallel. + +I find it really important to test my quickstart steps: to show to people that it just works. + +But the basics are the same, and they should be able to handle multiple gitops-reversers at the same time: + +* Kind cluster +* Gitea (one can have different repos!) +* Prometheus (one can spin up different prometheus instances, or just apply different labels, altough it does become a bit more fuzzy to get that right) +* One namespace with a unqiue name per testrun. +* One Makefile target that would allow us to run the three e2e tests at the same time (in parallel where possible). 
+
+What would be needed to get that right? We need to continue our quest for removing repetition: getting back the essence, properly defining what is really dependent on what.
+
+The first next step would be to analyse test/e2e/scripts/run-quickstart.sh -> I'm sure a lot can be removed and can be picked up by the 'normal' stuff.
+
+It would be really cool to create a new namespace:
+"run-e2e-full-{same-number-as-repo-basedupon-time}"
+"run-e2e-quickstart-manifests-{same-number-as-repo-basedupon-time}"
+"run-e2e-quickstart-helm-{same-number-as-repo-basedupon-time}"
+
+I'm also wondering if the tests for the quickstart shouldn't be in a go e2e_test.go like file -> in the end we are just doing the same type of assertions.
+
+It would be really awesome if we could redo all tests in the proper way.