diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 8619215b2..b8d2e558c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -32,7 +32,11 @@ jobs: steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: - ref: ${{ github.event.workflow_run.head_sha || github.sha }} + # For pull_request events, use the branch HEAD SHA (not the ephemeral merge + # commit that github.sha resolves to), which is directly fetchable by SHA. + # For workflow_run events fall back to the triggering HEAD SHA. + # For push/workflow_dispatch fall back to github.sha. + ref: ${{ github.event.pull_request.head.sha || github.event.workflow_run.head_sha || github.sha }} - name: Set up Go uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index e3682d08e..711c3d3b8 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -7,9 +7,10 @@ name: Docker Build, Publish & Test # - Enhanced PR handling with dedicated scanning # - Improved workflow orchestration with supply-chain-verify.yml # -# PHASE 1 OPTIMIZATION (February 2026): + + # PHASE 1 OPTIMIZATION (February 2026): # - PR images now pushed to GHCR registry (enables downstream workflow consumption) -# - Immutable PR tagging: pr-{number}-{short-sha} (prevents race conditions) +# - Stable PR tagging: pr-{number} (freshness gate prevents stale scans) # - Feature branch tagging: {sanitized-branch-name}-{short-sha} (enables unique testing) # - Tag sanitization per spec Section 3.2 (handles special chars, slashes, etc.) # - Mandatory security scanning for PR images (blocks on CRITICAL/HIGH vulnerabilities) @@ -30,7 +31,7 @@ on: types: [completed] concurrency: - group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event_name == 'workflow_run' && github.event.workflow_run.head_branch || github.head_ref || github.ref_name }} + group: ${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.ref_name }} cancel-in-progress: true permissions: @@ -46,7 +47,7 @@ env: TRIGGER_HEAD_SHA: ${{ github.event_name == 'workflow_run' && github.event.workflow_run.head_sha || github.sha }} TRIGGER_REF: ${{ github.event_name == 'workflow_run' && format('refs/heads/{0}', github.event.workflow_run.head_branch) || github.ref }} TRIGGER_HEAD_REF: ${{ github.event_name == 'workflow_run' && github.event.workflow_run.head_branch || github.head_ref }} - TRIGGER_PR_NUMBER: ${{ github.event_name == 'workflow_run' && join(github.event.workflow_run.pull_requests.*.number, '') || format('{0}', github.event.pull_request.number) }} + TRIGGER_PR_NUMBER: ${{ github.event_name == 'pull_request' && format('{0}', github.event.pull_request.number) || '' }} TRIGGER_ACTOR: ${{ github.event_name == 'workflow_run' && github.event.workflow_run.actor.login || github.actor }} TRIGGER_BASE_REF: ${{ github.event_name == 'workflow_run' && '' || github.base_ref }} @@ -67,7 +68,6 @@ jobs: outputs: skip_build: ${{ steps.skip.outputs.skip_build }} digest: ${{ steps.build-and-push.outputs.digest }} - pr_image_ref: ${{ steps.pr-image-ref.outputs.image_ref }} steps: - name: Checkout repository @@ -160,9 +160,11 @@ jobs: echo "Force building on feature branch (PR)" fi - echo "skip_build=$should_skip" >> "$GITHUB_OUTPUT" - echo "is_feature_push=$is_feature_push" >> "$GITHUB_OUTPUT" - echo "force_refresh=$force_refresh" >> "$GITHUB_OUTPUT" + { + echo "skip_build=$should_skip" + echo "is_feature_push=$is_feature_push" + echo "force_refresh=$force_refresh" + } >> "$GITHUB_OUTPUT" - name: Set up QEMU if: steps.skip.outputs.skip_build != 'true' @@ -249,6 +251,77 @@ jobs: fi fi + - name: Resolve security scan image reference (contract) + if: steps.skip.outputs.skip_build != 'true' + id: pr_image_ref_output + run: | + set -euo pipefail + + sanitize_branch() { + local raw_branch="$1" + local sanitized_branch + + sanitized_branch=$(echo "${raw_branch}" | tr '[:upper:]' '[:lower:]') + sanitized_branch=${sanitized_branch//[^a-z0-9-]/-} + while [[ "${sanitized_branch}" == *"--"* ]]; do + sanitized_branch=${sanitized_branch//--/-} + done + sanitized_branch=${sanitized_branch##[^a-z0-9]*} + sanitized_branch=${sanitized_branch%%[^a-z0-9-]*} + + if [[ -z "${sanitized_branch}" ]]; then + sanitized_branch="branch" + fi + + echo "${sanitized_branch}" + } + + IMAGE_TAG="" + + if [[ "${TRIGGER_EVENT}" == "pull_request" ]]; then + PR_NUMBER="${TRIGGER_PR_NUMBER}" + + if [[ "${GITHUB_EVENT_NAME}" == "workflow_run" ]]; then + mapfile -t WORKFLOW_RUN_PR_NUMBERS < <(jq -r '.workflow_run.pull_requests[].number // empty' "$GITHUB_EVENT_PATH") + + if [[ "${#WORKFLOW_RUN_PR_NUMBERS[@]}" != "1" ]]; then + echo "โŒ ERROR: Expected exactly one workflow_run pull request number, found ${#WORKFLOW_RUN_PR_NUMBERS[@]}" + exit 1 + fi + + PR_NUMBER="${WORKFLOW_RUN_PR_NUMBERS[0]}" + fi + + if [[ -z "${PR_NUMBER}" || "${PR_NUMBER}" == "null" ]]; then + echo "โŒ ERROR: Missing pull request number for stable security scan tag" + exit 1 + fi + + IMAGE_TAG="pr-${PR_NUMBER}" + elif [[ "${TRIGGER_REF}" == refs/heads/nightly ]]; then + IMAGE_TAG="nightly" + elif [[ "${TRIGGER_REF}" == refs/heads/main ]]; then + IMAGE_TAG="main" + elif [[ "${TRIGGER_REF}" == refs/heads/development ]]; then + IMAGE_TAG="development" + else + BRANCH_NAME="${TRIGGER_HEAD_BRANCH:-${TRIGGER_REF#refs/heads/}}" + IMAGE_TAG="branch-$(sanitize_branch "${BRANCH_NAME}")" + fi + + IMAGE_REF="${GHCR_REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}" + + echo "image_tag=${IMAGE_TAG}" >> "$GITHUB_OUTPUT" + echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT" + echo "Resolved security scan image reference: ${IMAGE_REF}" + + { + echo "## PR Image Reference Diagnostics" + echo "" + echo "- Image tag: ${IMAGE_TAG}" + echo "- Emitted image_ref: ${IMAGE_REF}" + } >> "$GITHUB_STEP_SUMMARY" + - name: Generate Docker metadata id: meta uses: docker/metadata-action@80c7e94dd9b9319bd5eb7a0e0fe9291e23a2a2e9 # v6.1.0 @@ -264,10 +337,9 @@ jobs: type=raw,value=dev,enable=${{ env.TRIGGER_REF == 'refs/heads/development' }} type=raw,value=nightly,enable=${{ env.TRIGGER_REF == 'refs/heads/nightly' }} type=raw,value=beta,enable=${{ env.TRIGGER_EVENT == 'pull_request' && env.TRIGGER_BASE_REF == 'development' }} - type=raw,value=${{ steps.branch-tags.outputs.pr_feature_branch_sha_tag }},enable=${{ env.TRIGGER_EVENT == 'pull_request' && steps.branch-tags.outputs.pr_feature_branch_sha_tag != '' }} + type=raw,value=${{ steps.pr_image_ref_output.outputs.image_tag }},enable=${{ steps.pr_image_ref_output.outputs.image_tag != '' }} type=raw,value=${{ steps.branch-tags.outputs.feature_branch_tag }},enable=${{ env.TRIGGER_EVENT != 'pull_request' && startsWith(env.TRIGGER_REF, 'refs/heads/feature/') && steps.branch-tags.outputs.feature_branch_tag != '' }} type=raw,value=${{ steps.branch-tags.outputs.branch_sha_tag }},enable=${{ env.TRIGGER_EVENT != 'pull_request' && steps.branch-tags.outputs.branch_sha_tag != '' }} - type=raw,value=pr-${{ env.TRIGGER_PR_NUMBER }}-{{sha}},enable=${{ env.TRIGGER_EVENT == 'pull_request' }},prefix=,suffix= type=sha,format=short,prefix=,suffix=,enable=${{ env.TRIGGER_EVENT != 'pull_request' && (env.TRIGGER_REF == 'refs/heads/main' || env.TRIGGER_REF == 'refs/heads/development' || env.TRIGGER_REF == 'refs/heads/nightly') }} flavor: | latest=false @@ -277,21 +349,18 @@ jobs: io.charon.build.timestamp=${{ github.event.repository.updated_at }} io.charon.feature.branch=${{ steps.branch-tags.outputs.feature_branch_tag }} - - name: Resolve PR image reference + - name: Assert PR output contract if: steps.skip.outputs.skip_build != 'true' && env.TRIGGER_EVENT == 'pull_request' - id: pr-image-ref run: | - IMAGE_REF=$(echo "${{ steps.meta.outputs.tags }}" | head -n 1) - if [[ -z "$IMAGE_REF" ]]; then - echo "โŒ ERROR: No PR image reference found in metadata tags" + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" + if [[ -z "${IMAGE_REF}" ]]; then + echo "โŒ ERROR: image_ref output contract violated (empty output)" exit 1 fi - echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT" - echo "Resolved PR image reference: ${IMAGE_REF}" # Phase 1 Optimization: Build once, test many - # - For PRs: Multi-platform (amd64, arm64) + immutable tags (pr-{number}-{short-sha}) + # - For PRs: Multi-platform (amd64, arm64) + stable security-scan tag (pr-{number}) # - For feature branches: Multi-platform (amd64, arm64) + sanitized tags ({branch}-{short-sha}) - # - For main/dev: Multi-platform (amd64, arm64) for production + # - For main/dev/nightly: Multi-platform (amd64, arm64) for production # - Always push to registry (enables downstream workflow consumption) # - Retry logic handles transient registry failures (3 attempts, 10s wait) # See: docs/plans/current_spec.md Section 4.1 @@ -355,52 +424,40 @@ jobs: # This enables backward compatibility with workflows that use artifacts if [[ "${{ env.TRIGGER_EVENT }}" == "pull_request" ]]; then echo "๐Ÿ“ฅ Pulling image back for artifact creation..." - FIRST_TAG=$(echo "${{ steps.meta.outputs.tags }}" | head -n1) - docker pull "${FIRST_TAG}" - echo "โœ… Image pulled: ${FIRST_TAG}" + PR_IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" + if [[ -z "${PR_IMAGE_REF}" ]]; then + echo "โŒ ERROR: image_ref output contract violated during pull-back" + exit 1 + fi + docker pull "${PR_IMAGE_REF}" + echo "โœ… Image pulled: ${PR_IMAGE_REF}" fi - # Critical Fix: Use exact tag from metadata instead of manual reconstruction - # WHY: docker/build-push-action with load:true applies the exact tags from - # docker/metadata-action. Manual reconstruction can cause mismatches due to: - # - Case sensitivity variations (owner name normalization) - # - Tag format differences in Buildx internal behavior - # - Registry prefix inconsistencies - # - # SOLUTION: Extract the first tag from metadata output (which is the PR tag) - # and use it directly with docker save. This guarantees we reference the - # exact image that was loaded into the local Docker daemon. - # - # VALIDATION: Added defensive checks to fail fast with diagnostics if: - # 1. No tag found in metadata output - # 2. Image doesn't exist locally after build - # 3. Artifact creation fails + # Use the stable security scan reference so the saved artifact matches the + # image that the PR scan job will pull and validate. - name: Save Docker Image as Artifact if: success() && steps.skip.outputs.skip_build != 'true' && env.TRIGGER_EVENT == 'pull_request' run: | - # Extract the first tag from metadata action (PR tag) - IMAGE_TAG=$(echo "${{ steps.meta.outputs.tags }}" | head -n 1) + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" - if [[ -z "${IMAGE_TAG}" ]]; then - echo "โŒ ERROR: No image tag found in metadata output" - echo "Metadata tags output:" - echo "${{ steps.meta.outputs.tags }}" + if [[ -z "${IMAGE_REF}" ]]; then + echo "โŒ ERROR: image_ref output contract violated (empty image ref)" exit 1 fi - echo "๐Ÿ” Detected image tag: ${IMAGE_TAG}" + echo "๐Ÿ” Detected image ref: ${IMAGE_REF}" # Verify the image exists locally - if ! docker image inspect "${IMAGE_TAG}" >/dev/null 2>&1; then - echo "โŒ ERROR: Image ${IMAGE_TAG} not found locally" + if ! docker image inspect "${IMAGE_REF}" >/dev/null 2>&1; then + echo "โŒ ERROR: Image ${IMAGE_REF} not found locally" echo "๐Ÿ“‹ Available images:" docker images exit 1 fi # Save the image using the exact tag from metadata - echo "๐Ÿ’พ Saving image: ${IMAGE_TAG}" - docker save "${IMAGE_TAG}" -o /tmp/charon-pr-image.tar + echo "๐Ÿ’พ Saving image: ${IMAGE_REF}" + docker save "${IMAGE_REF}" -o /tmp/charon-pr-image.tar # Verify the artifact was created echo "โœ… Artifact created:" @@ -424,9 +481,9 @@ jobs: # Determine the image reference based on event type if [ "${{ env.TRIGGER_EVENT }}" = "pull_request" ]; then - IMAGE_REF="${{ steps.pr-image-ref.outputs.image_ref }}" + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" if [ -z "${IMAGE_REF}" ]; then - echo "โŒ ERROR: Failed to load PR image reference from pr-image-ref output" + echo "โŒ ERROR: Failed to load PR image reference from image_ref output" exit 1 fi echo "Using PR image: $IMAGE_REF" @@ -451,9 +508,9 @@ jobs: # Determine the image reference based on event type if [ "${{ env.TRIGGER_EVENT }}" = "pull_request" ]; then - IMAGE_REF="${{ steps.pr-image-ref.outputs.image_ref }}" + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" if [ -z "${IMAGE_REF}" ]; then - echo "โŒ ERROR: Failed to load PR image reference from pr-image-ref output" + echo "โŒ ERROR: Failed to load PR image reference from image_ref output" exit 1 fi echo "Using PR image: $IMAGE_REF" @@ -518,9 +575,9 @@ jobs: # Determine the image reference based on event type if [ "${{ env.TRIGGER_EVENT }}" = "pull_request" ]; then - IMAGE_REF="${{ steps.pr-image-ref.outputs.image_ref }}" + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" if [ -z "${IMAGE_REF}" ]; then - echo "โŒ ERROR: Failed to load PR image reference from pr-image-ref output" + echo "โŒ ERROR: Failed to load PR image reference from image_ref output" exit 1 fi echo "Using PR image: $IMAGE_REF" @@ -717,12 +774,15 @@ jobs: needs: build-and-push if: needs.build-and-push.outputs.skip_build != 'true' && needs.build-and-push.result == 'success' && github.event_name == 'pull_request' runs-on: ubuntu-latest - timeout-minutes: 10 + timeout-minutes: 20 permissions: contents: read packages: read security-events: write steps: + - name: Checkout repository for Trivy ignore rules + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - name: Normalize image name run: | IMAGE_NAME=$(echo "${{ env.IMAGE_NAME }}" | tr '[:upper:]' '[:lower:]') @@ -731,12 +791,22 @@ jobs: - name: Load PR image reference id: pr-image run: | - IMAGE_REF="${{ needs.build-and-push.outputs.pr_image_ref }}" - if [[ -z "$IMAGE_REF" ]]; then - echo "โŒ ERROR: Missing PR image reference from build-and-push outputs" + DIGEST="${{ needs.build-and-push.outputs.digest }}" + + if [[ -z "$DIGEST" ]]; then + echo "โŒ ERROR: build-and-push digest output is empty; cannot proceed without immutable reference" + exit 1 + fi + + if ! [[ "$DIGEST" =~ ^sha256:[0-9a-f]{64}$ ]]; then + echo "โŒ ERROR: Invalid digest format (expected sha256:<64 hex chars>): ${DIGEST}" exit 1 fi + + IMAGE_REF="ghcr.io/${IMAGE_NAME}@${DIGEST}" + echo "digest=${DIGEST}" >> "$GITHUB_OUTPUT" echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT" + echo "Using immutable digest reference: ${IMAGE_REF}" - name: Log in to GitHub Container Registry uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 @@ -745,27 +815,36 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Validate image freshness + - name: Inspect PR image metadata + id: image-metadata run: | - echo "๐Ÿ” Validating image freshness for PR #${{ env.TRIGGER_PR_NUMBER }}..." - echo "Expected SHA: ${{ env.TRIGGER_HEAD_SHA }}" + echo "๐Ÿ” Inspecting image metadata for PR #${{ env.TRIGGER_PR_NUMBER }}..." echo "Image: ${{ steps.pr-image.outputs.image_ref }}" - # Pull image to inspect + # Pull the immutable digest reference to inspect labels. docker pull "${{ steps.pr-image.outputs.image_ref }}" - # Extract commit SHA from image label LABEL_SHA=$(docker inspect "${{ steps.pr-image.outputs.image_ref }}" \ --format '{{index .Config.Labels "org.opencontainers.image.revision"}}') echo "Image label SHA: ${LABEL_SHA}" + echo "label_sha=${LABEL_SHA}" >> "$GITHUB_OUTPUT" + + - name: Validate image freshness + run: | + LABEL_SHA="${{ steps.image-metadata.outputs.label_sha }}" + + if [[ -z "${LABEL_SHA}" ]]; then + echo "โŒ ERROR: Missing image revision label" + exit 1 + fi if [[ "${LABEL_SHA}" != "${{ env.TRIGGER_HEAD_SHA }}" ]]; then - echo "โš ๏ธ WARNING: Image SHA mismatch!" + echo "โŒ ERROR: Image SHA mismatch!" echo " Expected: ${{ env.TRIGGER_HEAD_SHA }}" echo " Got: ${LABEL_SHA}" - echo "Image may be stale. Resuming for triage (Bypassing failure)." - # exit 1 + echo "Image may be stale. Failing closed." + exit 1 fi echo "โœ… Image freshness validated" @@ -775,6 +854,8 @@ jobs: with: image-ref: ${{ steps.pr-image.outputs.image_ref }} format: 'table' + scanners: 'vuln' + trivyignores: '.trivyignore' severity: 'CRITICAL,HIGH' exit-code: '0' version: 'v0.70.0' @@ -786,8 +867,12 @@ jobs: image-ref: ${{ steps.pr-image.outputs.image_ref }} format: 'sarif' output: 'trivy-pr-results.sarif' + scanners: 'vuln' + trivyignores: '.trivyignore' severity: 'CRITICAL,HIGH' - exit-code: '1' # Intended to block, but continued on error for now + # Keep scanning strict for CRITICAL/HIGH; fail is enforced explicitly + # at the end so SARIF upload and summaries still run. + exit-code: '1' version: 'v0.70.0' continue-on-error: true @@ -813,14 +898,24 @@ jobs: env: IMAGE_REF: ${{ steps.pr-image.outputs.image_ref }} run: | - CADDY_VERSION=$(docker run --rm --pull=never "${IMAGE_REF}" caddy version 2>/dev/null || echo "unknown") + CADDY_VERSION="unknown" + CADDY_VERSION_SOURCE="fallback-not-available" + + # Prefer absolute entrypoint to avoid shell/command resolution differences. + if CADDY_VERSION=$(timeout 30s docker run --rm --pull=never --entrypoint /usr/bin/caddy "${IMAGE_REF}" version 2>/dev/null); then + CADDY_VERSION_SOURCE="--entrypoint /usr/bin/caddy" + elif CADDY_VERSION=$(timeout 30s docker run --rm --pull=never --entrypoint caddy "${IMAGE_REF}" version 2>/dev/null); then + CADDY_VERSION_SOURCE="--entrypoint caddy" + fi + { echo "## PR Trivy SARIF Traceability" echo "" echo "- Category: ${{ env.TRIVY_SARIF_CATEGORY }}" echo "- Image: ${IMAGE_REF}" - echo "- Digest: n/a (tag-based PR image)" + echo "- Digest: ${IMAGE_REF#*@}" echo "- Caddy version: ${CADDY_VERSION}" + echo "- Caddy version source: ${CADDY_VERSION_SOURCE}" } >> "$GITHUB_STEP_SUMMARY" - name: Create scan summary @@ -830,7 +925,127 @@ jobs: echo "## ๐Ÿ”’ PR Image Security Scan" echo "" echo "- **Image**: ${{ steps.pr-image.outputs.image_ref }}" + echo "- **Image Digest**: ${{ steps.pr-image.outputs.digest }}" + echo "- **Image Revision Label SHA**: ${{ steps.image-metadata.outputs.label_sha || 'missing' }}" echo "- **PR**: #${{ env.TRIGGER_PR_NUMBER }}" echo "- **Commit**: ${{ env.TRIGGER_HEAD_SHA }}" echo "- **Scan Status**: ${{ steps.trivy-scan.outcome == 'success' && 'โœ… No critical vulnerabilities' || 'โŒ Vulnerabilities detected' }}" } >> "$GITHUB_STEP_SUMMARY" + + - name: Diagnose unsuppressed PR Trivy blockers + if: always() + continue-on-error: true + run: | + SARIF_PATH="trivy-pr-results.sarif" + FALLBACK_REASON="" + FINDINGS_COUNT="0" + UNIQUE_IDS_CSV="none" + PARSER_EXIT_CODE="" + PARSER_HINT="" + + echo "Unsuppressed HIGH/CRITICAL findings (from ${SARIF_PATH}):" + + if [[ ! -f "${SARIF_PATH}" ]]; then + FALLBACK_REASON="file missing" + elif [[ ! -r "${SARIF_PATH}" ]]; then + FALLBACK_REASON="file unreadable" + elif ! jq -e '.' "${SARIF_PATH}" >/dev/null 2>&1; then + FALLBACK_REASON="invalid JSON" + elif ! jq -e '(.runs | type) == "array"' "${SARIF_PATH}" >/dev/null 2>&1; then + FALLBACK_REASON="unexpected schema" + else + PARSER_ERR_FILE=$(mktemp) + if IDS_JSON=$(jq -c ' + [ + .runs[]? as $run + | (($run.results // [])[]?) as $result + | select((($result.suppressions // []) | length) == 0) + | { + id: ( + $result.ruleId + // ($result.rule // {} | .id) + // ( + if ($result.ruleIndex != null and (($run.tool.driver.rules? // null) | type) == "array") then + ($run.tool.driver.rules[$result.ruleIndex].id // empty) + else + empty + end + ) + // "unknown" + ) + } + ] + ' "${SARIF_PATH}" 2>"${PARSER_ERR_FILE}"); then + FINDINGS_COUNT=$(echo "${IDS_JSON}" | jq 'length' 2>/dev/null || echo "0") + UNIQUE_IDS_CSV=$(echo "${IDS_JSON}" | jq -r '[.[].id | select(. != "" and . != null)] | unique | join(", ")' 2>/dev/null || echo "none") + if [[ -z "${UNIQUE_IDS_CSV}" ]]; then + UNIQUE_IDS_CSV="none" + fi + + if [[ "${FINDINGS_COUNT}" -gt 0 ]]; then + if ! jq -r ' + .runs[]? as $run + | (($run.results // [])[]?) as $result + | select((($result.suppressions // []) | length) == 0) + | "- \(( + $result.ruleId + // ($result.rule // {} | .id) + // ( + if ($result.ruleIndex != null and (($run.tool.driver.rules? // null) | type) == \"array\") then + ($run.tool.driver.rules[$result.ruleIndex].id // \"unknown\") + else + \"unknown\" + end + ) + )) | package: \(( + ($result.message.text // \"\") + | (try capture(\"(?i)(?:Package|PkgName|Pkg|Library)\\\\s*[:=]\\\\s*`?(?[A-Za-z0-9._+:+-]+)`?\").pkg catch \"n/a\") + ))" + ' "${SARIF_PATH}"; then + echo "- unable to render parsed findings" + fi + else + echo "- none" + fi + else + FALLBACK_REASON="parser command failure" + PARSER_EXIT_CODE="$?" + PARSER_HINT=$(head -n 1 "${PARSER_ERR_FILE}" | tr -d '\r' || true) + if [[ -z "${PARSER_HINT}" ]]; then + PARSER_HINT="no parser stderr available" + fi + fi + + rm -f "${PARSER_ERR_FILE}" + fi + + { + echo "## PR Trivy Unsuppressed Blockers" + echo "" + if [[ -n "${FALLBACK_REASON}" ]]; then + echo "- Diagnostics status: fallback" + echo "- Fallback reason: ${FALLBACK_REASON}" + if [[ "${FALLBACK_REASON}" == "parser command failure" ]]; then + echo "- Parser exit code: ${PARSER_EXIT_CODE:-unknown}" + echo "- Parser hint: ${PARSER_HINT}" + fi + echo "- Count: unknown" + echo "- Blocker IDs: unknown" + else + echo "- Diagnostics status: parsed" + echo "- Count: ${FINDINGS_COUNT}" + if [[ "${FINDINGS_COUNT}" -gt 0 ]]; then + echo "- Blocker IDs: ${UNIQUE_IDS_CSV}" + else + echo "- Blocker IDs: none" + fi + fi + } >> "$GITHUB_STEP_SUMMARY" + + - name: Enforce PR Trivy security gate + if: always() + run: | + if [[ "${{ steps.trivy-scan.outcome }}" != "success" ]]; then + echo "โŒ Blocking merge: PR image has CRITICAL/HIGH vulnerabilities or scan failed" + exit 1 + fi diff --git a/.gitignore b/.gitignore index 5b3674d79..aff059433 100644 --- a/.gitignore +++ b/.gitignore @@ -326,3 +326,6 @@ backend/test_out.txt backend/cf_coverage.txt backend/***_coverage.txt backend/***_cov.txt +.tmp/caddy-binary-pin-cleanup +.tmp/caddy-binary-pin-cleanup-local.tar +.tmp/*** \ No newline at end of file diff --git a/.trivyignore b/.trivyignore index fdd90a138..f5ede014e 100644 --- a/.trivyignore +++ b/.trivyignore @@ -103,3 +103,13 @@ CVE-2026-33997 # See also: .grype.yaml for full justification # exp: 2026-04-30 GHSA-pxq6-2prw-chj9 + +# CVE-2026-41889 / GHSA-j88v-2chj-qfwx: pgx/v4 panic on crafted PostgreSQL wire payload (DoS) +# Severity: LOW (CVSS 3.7) โ€” Package: github.com/jackc/pgx/v4, embedded in CrowdSec binaries +# Fix path requires upstream migration to pgx/v5; CrowdSec currently vendors pgx/v4 in bundled components. +# Charon uses SQLite by default; PostgreSQL wire-protocol path is not reachable in standard deployment. +# Review by: 2026-05-25 +# See also: .grype.yaml for full justification +# exp: 2026-05-25 +CVE-2026-41889 +GHSA-j88v-2chj-qfwx diff --git a/Dockerfile b/Dockerfile index bc26ea794..1714dfd3c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,8 +26,8 @@ ARG CROWDSEC_RELEASE_SHA256=704e37121e7ac215991441cef0d8732e33fa3b1a2b2b88b53a0b ARG EXPR_LANG_VERSION=1.17.8 # renovate: datasource=go depName=golang.org/x/net ARG XNET_VERSION=0.55.0 -# renovate: datasource=go depName=github.com/smallstep/certificates -ARG SMALLSTEP_CERTIFICATES_VERSION=0.30.0 +# renovate: datasource=go depName=golang.org/x/crypto +ARG XCRYPTO_VERSION=0.52.0 # renovate: datasource=npm depName=npm ARG NPM_VERSION=11.11.1 @@ -241,7 +241,7 @@ ARG CORAZA_CADDY_VERSION ARG XCADDY_VERSION=0.4.6 ARG EXPR_LANG_VERSION ARG XNET_VERSION -ARG SMALLSTEP_CERTIFICATES_VERSION +ARG XCRYPTO_VERSION # hadolint ignore=DL3018 RUN apk add --no-cache bash git @@ -289,6 +289,7 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ go get github.com/expr-lang/expr@v${EXPR_LANG_VERSION}; \ # renovate: datasource=go depName=github.com/hslatman/ipstore go get github.com/hslatman/ipstore@v0.4.0; \ + go get golang.org/x/crypto@v${XCRYPTO_VERSION}; \ go get golang.org/x/net@v${XNET_VERSION}; \ # CVE-2026-33186: gRPC-Go auth bypass (fixed in v1.79.3) # CVE-2026-34986: go-jose/v4 transitive fix (requires grpc >= v1.80.0) @@ -316,10 +317,6 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ # remove once caddy-security ships a release built with goxmldsig >= v1.6.0. # renovate: datasource=go depName=github.com/russellhaering/goxmldsig go get github.com/russellhaering/goxmldsig@v1.6.0; \ - # CVE-2026-30836: smallstep/certificates 0.30.0-rc3 vulnerability - # Fix available at v0.30.0. Pin here so the Caddy binary is patched immediately; - # remove once caddy-security ships a release built with smallstep/certificates >= v0.30.0. - go get github.com/smallstep/certificates@v${SMALLSTEP_CERTIFICATES_VERSION}; \ # CVE-2026-32952: go-ntlmssp DoS via malicious NTLM challenge response # Affects /usr/bin/caddy (transitive dependency). Fix available at v0.1.1. # renovate: datasource=go depName=github.com/Azure/go-ntlmssp @@ -339,8 +336,17 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ echo "Unsupported CADDY_PATCH_SCENARIO=${CADDY_PATCH_SCENARIO}"; \ exit 1; \ fi; \ + # Final re-pin: enforce requested Caddy core version after plugin/security updates. + go get github.com/caddyserver/caddy/v2@v${CADDY_TARGET_VERSION}; \ # Clean up go.mod and ensure all dependencies are resolved go mod tidy; \ + # Hard assertion: fail if module graph resolves to a different Caddy core version. + ACTUAL_CADDY_VERSION="$(go list -m -f "{{.Version}}" github.com/caddyserver/caddy/v2)"; \ + if [ "$ACTUAL_CADDY_VERSION" != "v${CADDY_TARGET_VERSION}" ]; then \ + echo "ERROR: Resolved Caddy version ${ACTUAL_CADDY_VERSION} does not match target v${CADDY_TARGET_VERSION}"; \ + exit 1; \ + fi; \ + echo "Verified Caddy module version: ${ACTUAL_CADDY_VERSION}"; \ echo "Dependencies patched successfully"; \ # Remove any temporary binaries from initial xcaddy run rm -f /tmp/caddy-initial; \ diff --git a/SECURITY.md b/SECURITY.md index d93510438..e20a57db8 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -27,7 +27,7 @@ public disclosure. ## Known Vulnerabilities -Last reviewed: 2026-05-20 +Last reviewed: 2026-05-25 ### [HIGH] CVE-2026-31790 ยท OpenSSL Vulnerability in Alpine Base Image @@ -284,6 +284,45 @@ database server is untrusted. EPSS score not yet available. Monitor https://github.com/jackc/pgproto3 for a fix release. Upgrade the indirect dependency once a patched version is available. Pre-existing; not introduced by PR #1031. +--- + +### [LOW] CVE-2026-41889 ยท pgx/v4 Panic via Crafted PostgreSQL Wire Payload + +| Field | Value | +|--------------|-------| +| **ID** | CVE-2026-41889 (GHSA-j88v-2chj-qfwx) | +| **Severity** | Low ยท 3.7 | +| **Status** | Awaiting Upstream | + +**What** +`github.com/jackc/pgx/v4` may panic when decoding a crafted PostgreSQL wire payload, +which can cause a denial-of-service condition in affected clients. + +**Who** + +- Discovered by: Automated scan (Trivy image scan) +- Reported: 2026-05-25 +- Affects: Deployments using PostgreSQL-backed CrowdSec code paths (non-default) + +**Where** + +- Component: `github.com/jackc/pgx/v4` (transitive dependency in bundled CrowdSec components) +- Versions affected: pgx/v4 prior to upstream remediation + +**When** + +- Discovered: 2026-05-25 +- Disclosed (if public): Public +- Target fix: When upstream dependencies migrate to a patched path (pgx/v5) + +**How** +Exploitation requires a crafted PostgreSQL protocol payload delivered to an affected pgx/v4 +decode path. Charon defaults to SQLite, so standard deployments do not expose this path. + +**Planned Remediation** +Track upstream CrowdSec dependency updates and remove suppression once pgx/v4 is no longer +present in bundled components. + ## Patched Vulnerabilities ### โœ… [HIGH] CVE-2026-34040 ยท Docker AuthZ Plugin Bypass via Oversized Request Body diff --git a/agent/muzzle/muzzle.go b/agent/muzzle/muzzle.go index 562670578..189a4460d 100644 --- a/agent/muzzle/muzzle.go +++ b/agent/muzzle/muzzle.go @@ -20,13 +20,51 @@ const forbiddenResponse = "HTTP/1.1 403 Forbidden\r\nContent-Length: 0\r\nConnec // allowedPatterns enumerates the Docker API paths that agents may access. // Matching uses path.Match after stripping query parameters; each pattern // uses `*` to match any single path segment (never crosses a slash). +// +// Both versioned (/v*/...) and unversioned (/...) forms are listed because +// Docker clients such as Dockhand send unversioned requests (e.g. GET /containers/json) +// while the canonical Docker CLI sends versioned requests (e.g. GET /v1.47/containers/json). var allowedPatterns = []string{ + "/_ping", // no version prefix (Docker < 1.24 / direct health check) + "/v*/_ping", // versioned ping for Docker client health checks + + // Container listing and inspection โ€” unversioned (RC8 fix) + versioned + "/containers/json", "/v*/containers/json", + "/containers/*/json", "/v*/containers/*/json", + "/containers/*/logs", + "/v*/containers/*/logs", + "/containers/*/stats", + "/v*/containers/*/stats", + "/containers/*/top", + "/v*/containers/*/top", + + // Daemon info โ€” unversioned + versioned + "/info", "/v*/info", + "/images/json", "/v*/images/json", + "/version", "/v*/version", + "/events", "/v*/events", + + // Volumes โ€” unversioned + versioned + "/volumes", + "/v*/volumes", + "/volumes/*", + "/v*/volumes/*", + + // Networks โ€” unversioned + versioned + "/networks", + "/v*/networks", + "/networks/*", + "/v*/networks/*", + + // System disk usage โ€” unversioned + versioned + "/system/df", + "/v*/system/df", } // Filter is an HTTP allowlist filter for Docker socket proxy streams. @@ -38,8 +76,20 @@ func New() *Filter { } // Allow returns true if method+reqPath is on the allowlist. -// Only GET is permitted; all other methods are rejected immediately. +// Only GET is permitted, except HEAD which is allowed on /_ping and /v*/_ping +// (Docker SDK connectivity check). func (f *Filter) Allow(method, reqPath string) bool { + // HEAD is permitted only for /_ping (Docker SDK connectivity check). + if strings.EqualFold(method, http.MethodHead) { + cleanPath := path.Clean(reqPath) + for _, p := range []string{"/_ping", "/v*/_ping"} { + if matched, _ := path.Match(p, cleanPath); matched { + return true + } + } + return false + } + if !strings.EqualFold(method, http.MethodGet) { return false } @@ -79,6 +129,10 @@ func (f *Filter) ServeProxy(dst string, r io.Reader, w io.Writer) error { } defer conn.Close() + // Ensure Docker closes the socket after the response so ServeProxy can + // terminate cleanly instead of waiting on an idle keep-alive connection. + req.Close = true + // Forward the full request (headers + body) to the Docker socket. if err := req.Write(conn); err != nil { return fmt.Errorf("muzzle: forward request to docker: %w", err) diff --git a/agent/muzzle/muzzle_test.go b/agent/muzzle/muzzle_test.go index cd305f88c..391aba031 100644 --- a/agent/muzzle/muzzle_test.go +++ b/agent/muzzle/muzzle_test.go @@ -1,9 +1,18 @@ package muzzle_test import ( + "bufio" "bytes" + "errors" + "fmt" + "io" + "net" + "net/http" + "path/filepath" "strings" + "sync" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -11,6 +20,72 @@ import ( "github.com/Wikid82/charon/agent/muzzle" ) +var errWriterClosed = errors.New("writer closed") + +type closableWriter struct { + mu sync.Mutex + buf bytes.Buffer + closed bool + firstWrite chan struct{} + once sync.Once +} + +func newClosableWriter() *closableWriter { + return &closableWriter{firstWrite: make(chan struct{})} +} + +func (w *closableWriter) Write(p []byte) (int, error) { + w.once.Do(func() { close(w.firstWrite) }) + + w.mu.Lock() + defer w.mu.Unlock() + if w.closed { + return 0, errWriterClosed + } + return w.buf.Write(p) +} + +func (w *closableWriter) Close() { + w.mu.Lock() + defer w.mu.Unlock() + w.closed = true +} + +func (w *closableWriter) String() string { + w.mu.Lock() + defer w.mu.Unlock() + return w.buf.String() +} + +func startUnixHTTPServer(t *testing.T, handler func(net.Conn)) (string, func()) { + t.Helper() + + sockPath := filepath.Join(t.TempDir(), "docker.sock") + ln, err := net.Listen("unix", sockPath) + require.NoError(t, err) + + done := make(chan struct{}) + go func() { + defer close(done) + conn, acceptErr := ln.Accept() + if acceptErr != nil { + return + } + handler(conn) + }() + + cleanup := func() { + _ = ln.Close() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("unix server goroutine did not exit") + } + } + + return sockPath, cleanup +} + func TestFilter_Allow(t *testing.T) { f := muzzle.New() @@ -19,26 +94,71 @@ func TestFilter_Allow(t *testing.T) { reqPath string allowed bool }{ - // Allowed: read-only GET endpoints - {"GET", "/v1.41/containers/json", true}, - {"GET", "/v1.41/info", true}, - {"GET", "/v1.41/version", true}, - {"GET", "/v1.41/images/json", true}, - {"GET", "/v1.41/events", true}, - {"GET", "/v1.44/containers/abc123/json", true}, + // --- Allowed: health check --- + {"GET", "/_ping", true}, + {"GET", "/v1.44/_ping", true}, + {"HEAD", "/_ping", true}, + {"HEAD", "/v1.47/_ping", true}, + {"HEAD", "/containers/json", false}, // HEAD blocked for non-ping paths + {"HEAD", "/v1.47/containers/json", false}, // HEAD blocked for non-ping paths + + // --- Allowed: GET on versioned paths --- + {"GET", "/v1.47/containers/json", true}, {"GET", "/v1.24/containers/json", true}, + {"GET", "/v1.47/containers/abc123/json", true}, + {"GET", "/v1.47/containers/abc123/logs", true}, + {"GET", "/v1.47/containers/abc123/stats", true}, + {"GET", "/v1.47/containers/abc123/top", true}, + {"GET", "/v1.47/info", true}, + {"GET", "/v1.47/images/json", true}, + {"GET", "/v1.47/version", true}, + {"GET", "/v1.47/events", true}, + {"GET", "/v1.47/volumes", true}, + {"GET", "/v1.47/volumes/myvolume", true}, + {"GET", "/v1.47/networks", true}, + {"GET", "/v1.47/networks/mynet", true}, + {"GET", "/v1.47/system/df", true}, + {"GET", "/v1.41/volumes/myvol", true}, + {"GET", "/v1.41/networks/mynet", true}, + + // --- Allowed: GET on UNVERSIONED paths (RC8/RC9 fix) --- + {"GET", "/containers/json", true}, + {"GET", "/containers/abc123/json", true}, + {"GET", "/containers/abc123/logs", true}, + {"GET", "/containers/abc123/stats", true}, + {"GET", "/containers/abc123/top", true}, + {"GET", "/info", true}, + {"GET", "/images/json", true}, + {"GET", "/version", true}, + {"GET", "/events", true}, + {"GET", "/volumes", true}, + {"GET", "/volumes/myvolume", true}, + {"GET", "/networks", true}, + {"GET", "/networks/mynet", true}, + {"GET", "/system/df", true}, - // Blocked: mutating methods + // --- Blocked: mutating methods --- + {"POST", "/containers/create", false}, + {"DELETE", "/containers/abc123", false}, + {"PUT", "/containers/abc123/start", false}, + {"PATCH", "/v1.47/containers/abc123", false}, {"POST", "/v1.41/containers/create", false}, {"DELETE", "/v1.41/containers/abc", false}, {"PUT", "/v1.41/networks/abc", false}, {"PATCH", "/v1.41/containers/abc/update", false}, - // Blocked: paths not on allowlist + // --- Blocked: paths not on allowlist --- + {"GET", "/containers/abc123/start", false}, + {"GET", "/containers/abc123/stop", false}, + {"GET", "/exec/abc123/start", false}, + {"GET", "/build", false}, + {"GET", "/v1.47/exec/abc123", false}, {"GET", "/v1.41/exec/abc/start", false}, {"GET", "/v1.41/containers/prune", false}, - {"GET", "/v1.41/networks", false}, - {"GET", "/v1.41/volumes", false}, + + // --- Path traversal: path.Clean normalises before matching --- + {"GET", "/v1.47/../containers/json", true}, // resolves to /containers/json โ€” allowed + {"GET", "/containers/../../etc/passwd", false}, // resolves to /etc/passwd โ€” blocked } for _, tt := range tests { @@ -81,3 +201,166 @@ func TestFilter_ServeProxy_Blocked_PUT(t *testing.T) { require.Error(t, err) assert.Contains(t, buf.String(), "403") } + +func TestFilter_ServeProxy_Blocked_UnversionedPost(t *testing.T) { + f := muzzle.New() + + reqStr := "POST /containers/create HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n" + var buf bytes.Buffer + + err := f.ServeProxy("/tmp/nonexistent.sock", strings.NewReader(reqStr), &buf) + require.Error(t, err) + assert.Contains(t, buf.String(), "403") +} + +func TestServeProxy_ConnectionCloseSetOnRequest(t *testing.T) { + f := muzzle.New() + + reqSeen := make(chan *http.Request, 1) + serverErr := make(chan error, 1) + + sockPath, cleanup := startUnixHTTPServer(t, func(conn net.Conn) { + defer conn.Close() + + req, err := http.ReadRequest(bufio.NewReader(conn)) + if err != nil { + serverErr <- err + return + } + reqSeen <- req + + _, err = io.WriteString(conn, "HTTP/1.1 200 OK\r\nContent-Length: 5\r\nConnection: close\r\n\r\nhello") + if err != nil { + serverErr <- err + } + }) + defer cleanup() + + var out bytes.Buffer + req := "GET /containers/json HTTP/1.1\r\nHost: localhost\r\n\r\n" + err := f.ServeProxy(sockPath, strings.NewReader(req), &out) + require.NoError(t, err) + + seen := <-reqSeen + select { + case err := <-serverErr: + require.NoError(t, err) + default: + } + assert.True(t, seen.Close) + assert.Equal(t, "close", strings.ToLower(seen.Header.Get("Connection"))) + assert.Contains(t, out.String(), "200 OK") + assert.Contains(t, out.String(), "hello") +} + +func TestServeProxy_CompletesAfterDockerResponse(t *testing.T) { + f := muzzle.New() + serverErr := make(chan error, 1) + body := `{"status":"ok"}` + + sockPath, cleanup := startUnixHTTPServer(t, func(conn net.Conn) { + defer conn.Close() + + _, err := http.ReadRequest(bufio.NewReader(conn)) + if err != nil { + serverErr <- err + return + } + + _, err = io.WriteString(conn, fmt.Sprintf("HTTP/1.1 200 OK\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s", len(body), body)) + if err != nil { + serverErr <- err + } + }) + defer cleanup() + + var out bytes.Buffer + req := "GET /containers/json HTTP/1.1\r\nHost: localhost\r\n\r\n" + + done := make(chan error, 1) + go func() { + done <- f.ServeProxy(sockPath, strings.NewReader(req), &out) + }() + + select { + case err := <-done: + require.NoError(t, err) + case <-time.After(2 * time.Second): + t.Fatal("ServeProxy did not return after complete response") + } + + assert.Contains(t, out.String(), body) + select { + case err := <-serverErr: + require.NoError(t, err) + default: + } +} + +func TestServeProxy_StreamingResponseTerminatesOnWriterClose(t *testing.T) { + f := muzzle.New() + + serverWriteErr := make(chan error, 1) + serverErr := make(chan error, 1) + + sockPath, cleanup := startUnixHTTPServer(t, func(conn net.Conn) { + defer conn.Close() + + _, err := http.ReadRequest(bufio.NewReader(conn)) + if err != nil { + serverErr <- err + return + } + + _, err = io.WriteString(conn, "HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\nConnection: close\r\n\r\n") + if err != nil { + serverErr <- err + return + } + + for { + if _, writeErr := io.WriteString(conn, "6\r\nhello!\r\n"); writeErr != nil { + serverWriteErr <- writeErr + return + } + time.Sleep(10 * time.Millisecond) + } + }) + defer cleanup() + + w := newClosableWriter() + req := "GET /events HTTP/1.1\r\nHost: localhost\r\n\r\n" + + done := make(chan error, 1) + go func() { + done <- f.ServeProxy(sockPath, strings.NewReader(req), w) + }() + + select { + case <-w.firstWrite: + case <-time.After(2 * time.Second): + t.Fatal("stream did not start") + } + + w.Close() + + select { + case err := <-done: + require.Error(t, err) + case <-time.After(2 * time.Second): + t.Fatal("ServeProxy did not return after writer close") + } + + select { + case err := <-serverWriteErr: + require.Error(t, err) + case <-time.After(2 * time.Second): + t.Fatal("mock docker stream did not observe closed connection") + } + + select { + case err := <-serverErr: + require.NoError(t, err) + default: + } +} diff --git a/backend/internal/api/routes/routes.go b/backend/internal/api/routes/routes.go index 37415ea1c..ddb2de51d 100644 --- a/backend/internal/api/routes/routes.go +++ b/backend/internal/api/routes/routes.go @@ -510,6 +510,7 @@ func RegisterWithDeps(ctx context.Context, router *gin.Engine, db *gorm.DB, cfg dockerHandler := handlers.NewDockerHandler(dockerService, remoteServerService) if orthrusServer != nil { dockerHandler.SetOrthrusResolver(orthrusServer) + uptimeService.SetOrthrusResolver(orthrusServer) } dockerHandler.RegisterRoutes(management) diff --git a/backend/internal/orthrus/muzzle.go b/backend/internal/orthrus/muzzle.go index 227fa149a..9cec42541 100644 --- a/backend/internal/orthrus/muzzle.go +++ b/backend/internal/orthrus/muzzle.go @@ -2,6 +2,7 @@ package orthrus import ( "net/http" + "path" "regexp" "strings" @@ -24,10 +25,27 @@ var versionPrefixRe = regexp.MustCompile(`^/v\d+\.\d+`) // allowedDockerPaths is the set of Docker API paths that are safe to expose to agents. // Path matching is performed after stripping the version prefix. var allowedDockerPaths = map[string]struct{}{ + "/_ping": {}, "/containers/json": {}, "/images/json": {}, "/info": {}, "/version": {}, + "/events": {}, + "/volumes": {}, + "/networks": {}, + "/system/df": {}, +} + +// allowedDockerPatterns covers dynamic-segment paths such as +// /containers/{id}/json, /volumes/{name}, and /networks/{id}. +// Matching uses path.Match after the version prefix has been stripped. +var allowedDockerPatterns = []string{ + "/containers/*/json", + "/containers/*/logs", + "/containers/*/stats", + "/containers/*/top", + "/volumes/*", + "/networks/*", } // Muzzle is an http.Handler wrapper that restricts Docker socket access @@ -42,8 +60,17 @@ func NewMuzzle(next http.Handler) *Muzzle { } // ServeHTTP implements http.Handler. Only GET requests to allowlisted paths -// are forwarded; all others receive 403 Forbidden. +// are forwarded; HEAD is also permitted for /_ping (Docker client health checks). +// All other methods or paths receive 403 Forbidden. func (m *Muzzle) ServeHTTP(w http.ResponseWriter, r *http.Request) { + stripped := versionPrefixRe.ReplaceAllString(r.URL.Path, "") + + // HEAD /_ping is permitted alongside GET for Docker client health checks. + if r.Method == http.MethodHead && stripped == "/_ping" { + m.next.ServeHTTP(w, r) + return + } + if r.Method != http.MethodGet { logger.Log().WithField("method", util.SanitizeForLog(r.Method)).WithField("path", sanitizePath(r.URL.Path)). Warn("orthrus: muzzle blocked non-GET Docker request") @@ -51,12 +78,24 @@ func (m *Muzzle) ServeHTTP(w http.ResponseWriter, r *http.Request) { return } - stripped := versionPrefixRe.ReplaceAllString(r.URL.Path, "") if _, ok := allowedDockerPaths[stripped]; ok { m.next.ServeHTTP(w, r) return } + // Check dynamic path patterns for container/volume/network inspection. + // Normalize to an absolute path by trimming stray slashes and re-anchoring + // to "/" so that path.Match works correctly. Traversal sequences such as ".." + // are left unresolved intentionally โ€” they will not match any allowed pattern + // and will be blocked, which is the safe behavior. + cleanPath := "/" + strings.Trim(stripped, "/") + for _, pat := range allowedDockerPatterns { + if matched, err := path.Match(pat, cleanPath); err == nil && matched { + m.next.ServeHTTP(w, r) + return + } + } + logger.Log().WithField("method", util.SanitizeForLog(r.Method)).WithField("path", sanitizePath(r.URL.Path)). Warn("orthrus: muzzle blocked disallowed Docker path") http.Error(w, "Forbidden", http.StatusForbidden) diff --git a/backend/internal/orthrus/muzzle_test.go b/backend/internal/orthrus/muzzle_test.go index 894d5f8d5..92ac4e83f 100644 --- a/backend/internal/orthrus/muzzle_test.go +++ b/backend/internal/orthrus/muzzle_test.go @@ -16,10 +16,15 @@ func passthroughHandler() http.Handler { func TestMuzzle_AllowlistedGET_Passthrough(t *testing.T) { allowed := []string{ + "/_ping", "/containers/json", "/images/json", "/info", "/version", + "/events", + "/volumes", + "/networks", + "/system/df", } m := NewMuzzle(passthroughHandler()) @@ -40,6 +45,11 @@ func TestMuzzle_VersionPrefixStripped_Passthrough(t *testing.T) { "/v1.40/images/json", "/v1.41/info", "/v1.42/version", + "/v1.47/_ping", + "/v1.44/events", + "/v1.44/volumes", + "/v1.44/networks", + "/v1.47/system/df", } m := NewMuzzle(passthroughHandler()) @@ -81,13 +91,60 @@ func TestMuzzle_DELETE_Blocked(t *testing.T) { assert.Equal(t, http.StatusForbidden, rr.Code) } +func TestMuzzle_HEAD_Ping_Passthrough(t *testing.T) { + m := NewMuzzle(passthroughHandler()) + + for _, path := range []string{"/_ping", "/v1.44/_ping"} { + t.Run(path, func(t *testing.T) { + req := httptest.NewRequest(http.MethodHead, path, http.NoBody) + rr := httptest.NewRecorder() + m.ServeHTTP(rr, req) + assert.Equal(t, http.StatusOK, rr.Code) + }) + } +} + +func TestMuzzle_HEAD_NonPing_Blocked(t *testing.T) { + m := NewMuzzle(passthroughHandler()) + req := httptest.NewRequest(http.MethodHead, "/containers/json", http.NoBody) + rr := httptest.NewRecorder() + m.ServeHTTP(rr, req) + assert.Equal(t, http.StatusForbidden, rr.Code) +} + +func TestMuzzle_DynamicPaths_Passthrough(t *testing.T) { + paths := []string{ + "/containers/abc123/json", + "/v1.44/containers/abc123/json", + "/containers/abc123/logs", + "/v1.47/containers/abc123/logs", + "/containers/abc123/stats", + "/v1.47/containers/abc123/stats", + "/containers/abc123/top", + "/v1.47/containers/abc123/top", + "/volumes/myvolume", + "/v1.44/volumes/myvolume", + "/networks/mynet", + "/v1.44/networks/mynet", + } + + m := NewMuzzle(passthroughHandler()) + + for _, p := range paths { + t.Run(p, func(t *testing.T) { + req := httptest.NewRequest(http.MethodGet, p, http.NoBody) + rr := httptest.NewRecorder() + m.ServeHTTP(rr, req) + assert.Equal(t, http.StatusOK, rr.Code) + }) + } +} + func TestMuzzle_UnknownPath_Blocked(t *testing.T) { paths := []string{ "/containers/create", "/exec/abc/start", "/containers/abc/kill", - "/networks/create", - "/_ping", } m := NewMuzzle(passthroughHandler()) diff --git a/backend/internal/orthrus/server.go b/backend/internal/orthrus/server.go index 024331c2e..d1a063c5c 100644 --- a/backend/internal/orthrus/server.go +++ b/backend/internal/orthrus/server.go @@ -91,6 +91,15 @@ func (s *OrthrusServer) HandleWebSocket(c *gin.Context) { return } + // Displace the prior session for this UUID BEFORE binding the new proxy + // listeners so the old session's extListener is closed and its port is freed + // before StartDockerProxy / StartExternalProxy attempt to bind. + if old, loaded := s.sessions.LoadAndDelete(agent.UUID); loaded { + if oldSess, ok := old.(*AgentSession); ok { + _ = oldSess.Close() + } + } + if err := session.StartDockerProxy(); err != nil { logger.Log().WithField("uuid", util.SanitizeForLog(agent.UUID)). WithError(err).Warn("orthrus: failed to start docker proxy listener") @@ -188,8 +197,13 @@ func (s *OrthrusServer) watchHeartbeat(agentUUID string, sess *AgentSession) { case <-ticker.C: if !sess.IsAlive() { _ = sess.Close() // stops runProxyListener goroutine; idempotent - s.markOffline(agentUUID) - s.sessions.Delete(agentUUID) + // Only remove from the map and mark offline when this goroutine's + // session pointer is still the current one. A stale goroutine + // holding an old pointer will find CompareAndDelete returns false + // and exits without corrupting the new session's state. + if s.sessions.CompareAndDelete(agentUUID, sess) { + s.markOffline(agentUUID) + } return } } diff --git a/backend/internal/orthrus/server_coverage_test.go b/backend/internal/orthrus/server_coverage_test.go index 6135e7cf5..f940981e0 100644 --- a/backend/internal/orthrus/server_coverage_test.go +++ b/backend/internal/orthrus/server_coverage_test.go @@ -368,3 +368,58 @@ func TestOrthrusServer_HandleWebSocket_ExternalProxyFails(t *testing.T) { assert.False(t, status.Active) assert.NotEmpty(t, status.Error) } +// TestHandleWebSocket_DisplacesExistingSession covers server.go:98-100 โ€” +// the displacement block that closes the old session when a new connection +// arrives for an agent UUID that already has an active session in the map. +func TestHandleWebSocket_DisplacesExistingSession(t *testing.T) { + db := setupServerTestDB(t) + srv, err := NewOrthrusServer(db, setupTestCA(t)) + require.NoError(t, err) + srv.heartbeatTimeout = 200 * time.Millisecond + + token := "ch_orthrus_displace01" //nolint:gosec // G101: test credential + hash, err := bcrypt.GenerateFromPassword([]byte(token), bcrypt.MinCost) + require.NoError(t, err) + + agent := &models.OrthrusAgent{ + UUID: "displace-uuid", + Name: "displace-agent", + AuthKeyHash: string(hash), + Status: models.OrthrusStatusPending, + } + require.NoError(t, db.Create(agent).Error) + + // Create an "old" session and store it in the sessions map to simulate a + // prior connection that has not yet been cleaned up. + oldConn, oldCleanup := testWSPair(t) + defer oldCleanup() + oldSess, err := NewAgentSession("displace-uuid", "displace-agent", oldConn) + require.NoError(t, err) + srv.sessions.Store("displace-uuid", oldSess) + + gin.SetMode(gin.TestMode) + router := gin.New() + router.GET("/ws", srv.HandleWebSocket) + ts := httptest.NewServer(router) + t.Cleanup(srv.Stop) + t.Cleanup(ts.Close) + + wsURL := "ws" + strings.TrimPrefix(ts.URL, "http") + "/ws" + header := http.Header{"Authorization": []string{"Bearer " + token}} + conn, dialResp, err := gorillaws.DefaultDialer.Dial(wsURL, header) + require.NoError(t, err) + if dialResp != nil { + _ = dialResp.Body.Close() + } + defer func() { _ = conn.Close() }() + + // Wait for the new session to be stored, which means HandleWebSocket has run + // past the displacement block and stored the replacement session. + assert.Eventually(t, func() bool { + raw, ok := srv.GetSession("displace-uuid") + return ok && raw != oldSess + }, 2*time.Second, 20*time.Millisecond, "new session should replace old session") + + // The old session must have been closed by the displacement block (lines 98-100). + assert.False(t, oldSess.IsAlive(), "old session must be closed by displacement") +} diff --git a/backend/internal/orthrus/server_test.go b/backend/internal/orthrus/server_test.go index 1898ad263..d6de28b5a 100644 --- a/backend/internal/orthrus/server_test.go +++ b/backend/internal/orthrus/server_test.go @@ -5,6 +5,7 @@ import ( "net/http/httptest" "path/filepath" "testing" + "time" "github.com/gin-gonic/gin" "golang.org/x/crypto/bcrypt" @@ -189,3 +190,118 @@ func TestOrthrusServer_HandleWebSocket_InvalidToken(t *testing.T) { assert.Equal(t, http.StatusUnauthorized, w.Code) } + +// TestWatchHeartbeat_StaleGoroutine_DoesNotEvictNewSession is a regression test +// for the session race condition: a stale watchHeartbeat goroutine (holding a +// reference to an old, dead session) must not evict or mark offline a newer +// session that has already been stored for the same agent UUID. +func TestWatchHeartbeat_StaleGoroutine_DoesNotEvictNewSession(t *testing.T) { + db := setupServerTestDB(t) + srv, err := NewOrthrusServer(db, setupTestCA(t)) + require.NoError(t, err) + // Short timeout so the ticker fires immediately in the test. + srv.heartbeatTimeout = time.Millisecond + + const agentUUID = "race-regression-uuid" + + // Insert the agent in the DB with status online so we can verify markOffline + // is not called. + agent := &models.OrthrusAgent{ + UUID: agentUUID, + Name: "race-agent", + Status: models.OrthrusStatusOnline, + } + require.NoError(t, db.Create(agent).Error) + + // sess1: already closed โ€” represents a stale session whose watchHeartbeat + // goroutine is still running after a newer session has replaced it. + conn1, done1 := testWSPair(t) + defer done1() + sess1, err := NewAgentSession(agentUUID, "race-agent", conn1) + require.NoError(t, err) + require.NoError(t, sess1.Close()) + require.False(t, sess1.IsAlive()) + + // sess2: alive โ€” represents the current (newer) reconnect stored in the map. + conn2, done2 := testWSPair(t) + defer done2() + sess2, err := NewAgentSession(agentUUID, "race-agent", conn2) + require.NoError(t, err) + t.Cleanup(func() { _ = sess2.Close() }) + srv.sessions.Store(agentUUID, sess2) + + // Run the stale watchHeartbeat (for sess1) and wait for it to exit. + // With CompareAndDelete, it finds sess1 โ‰  sess2 in the map, so it returns + // false and skips markOffline โ€” sess2 stays in the map. + done := make(chan struct{}) + go func() { + defer close(done) + srv.watchHeartbeat(agentUUID, sess1) + }() + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("watchHeartbeat did not exit within deadline") + } + + // sess2 must still be present in the map. + raw, ok := srv.sessions.Load(agentUUID) + require.True(t, ok, "sess2 should still be in the sessions map") + assert.Same(t, sess2, raw.(*AgentSession), "sess2 pointer must be unchanged") + + // The agent must NOT have been marked offline. + var stored models.OrthrusAgent + require.NoError(t, db.Where("uuid = ?", agentUUID).First(&stored).Error) + assert.Equal(t, models.OrthrusStatusOnline, stored.Status, + "stale goroutine must not flip agent status to offline") +} + +// TestWatchHeartbeat_CurrentSession_MarksOfflineAndEvictsFromMap exercises the +// CompareAndDelete true-branch: when the session pointer in the map matches the +// goroutine's pointer, the agent is marked offline and the map entry is removed. +func TestWatchHeartbeat_CurrentSession_MarksOfflineAndEvictsFromMap(t *testing.T) { + db := setupServerTestDB(t) + srv, err := NewOrthrusServer(db, setupTestCA(t)) + require.NoError(t, err) + srv.heartbeatTimeout = time.Millisecond + + const agentUUID = "current-session-uuid" + agent := &models.OrthrusAgent{ + UUID: agentUUID, + Name: "current-agent", + Status: models.OrthrusStatusOnline, + } + require.NoError(t, db.Create(agent).Error) + + conn, wsCleanup := testWSPair(t) + defer wsCleanup() + + sess, err := NewAgentSession(agentUUID, "current-agent", conn) + require.NoError(t, err) + require.NoError(t, sess.Close()) + require.False(t, sess.IsAlive()) + + // Store the SAME pointer so CompareAndDelete returns true. + srv.sessions.Store(agentUUID, sess) + + doneCh := make(chan struct{}) + go func() { + defer close(doneCh) + srv.watchHeartbeat(agentUUID, sess) + }() + + select { + case <-doneCh: + case <-time.After(2 * time.Second): + t.Fatal("watchHeartbeat did not exit within deadline") + } + + _, ok := srv.sessions.Load(agentUUID) + assert.False(t, ok, "session must be evicted when CompareAndDelete succeeds") + + var stored models.OrthrusAgent + require.NoError(t, db.Where("uuid = ?", agentUUID).First(&stored).Error) + assert.Equal(t, models.OrthrusStatusOffline, stored.Status, + "agent must be marked offline when CompareAndDelete succeeds") +} diff --git a/backend/internal/orthrus/session.go b/backend/internal/orthrus/session.go index 0727dfa59..a965a94b1 100644 --- a/backend/internal/orthrus/session.go +++ b/backend/internal/orthrus/session.go @@ -289,6 +289,7 @@ func (s *AgentSession) StartExternalProxy(port int) error { pr.Out.Host = "" }, FlushInterval: -1, + Transport: &http.Transport{DisableKeepAlives: true}, } srv := &http.Server{ diff --git a/backend/internal/orthrus/session_test.go b/backend/internal/orthrus/session_test.go index 0174fb6f9..1e87bb458 100644 --- a/backend/internal/orthrus/session_test.go +++ b/backend/internal/orthrus/session_test.go @@ -1,10 +1,15 @@ package orthrus import ( + "fmt" + "io" + "net" "net/http" "net/http/httptest" "strings" + "sync/atomic" "testing" + "time" "github.com/gorilla/websocket" "github.com/stretchr/testify/assert" @@ -73,3 +78,48 @@ func TestAgentSession_Close_SetsNotAlive(t *testing.T) { require.NoError(t, sess.Close()) assert.False(t, sess.IsAlive()) } + +func TestStartExternalProxy_TransportDisablesKeepAlives(t *testing.T) { + serverConn, done := testWSPair(t) + defer done() + + sess, err := NewAgentSession("keepalive-uuid", "keepalive-agent", serverConn) + require.NoError(t, err) + defer func() { _ = sess.Close() }() + + var connCount atomic.Int32 + mock := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = io.WriteString(w, `{"ok":true}`) + })) + mock.Config.ConnState = func(_ net.Conn, state http.ConnState) { + if state == http.StateNew { + connCount.Add(1) + } + } + mock.Start() + defer mock.Close() + + mockPort := mock.Listener.Addr().(*net.TCPAddr).Port + + sess.mu.Lock() + sess.proxyPort = mockPort + sess.mu.Unlock() + + extPort := findFreePort(t) + require.NoError(t, sess.StartExternalProxy(extPort)) + + assert.Eventually(t, func() bool { + return sess.GetExternalProxyStatus().Active + }, 2*time.Second, 10*time.Millisecond) + + client := &http.Client{Timeout: 2 * time.Second} + for i := 0; i < 2; i++ { + resp, reqErr := client.Get(fmt.Sprintf("http://127.0.0.1:%d/containers/json", extPort)) + require.NoError(t, reqErr) + require.Equal(t, http.StatusOK, resp.StatusCode) + _, _ = io.Copy(io.Discard, resp.Body) + require.NoError(t, resp.Body.Close()) + } + + assert.Equal(t, int32(2), connCount.Load()) +} diff --git a/backend/internal/services/uptime_service.go b/backend/internal/services/uptime_service.go index e0bbc2c08..02946fba4 100644 --- a/backend/internal/services/uptime_service.go +++ b/backend/internal/services/uptime_service.go @@ -8,6 +8,7 @@ import ( "net" "net/http" "net/url" + "reflect" "strconv" "strings" "sync" @@ -21,9 +22,16 @@ import ( "gorm.io/gorm" ) +// orthrusStatusChecker allows UptimeService to query Orthrus session liveness +// without a direct dependency on the orthrus package. +type orthrusStatusChecker interface { + GetProxyAddr(agentUUID string) (string, bool) +} + type UptimeService struct { DB *gorm.DB NotificationService *NotificationService + orthrusResolver orthrusStatusChecker // nil when Orthrus feature is disabled // Batching: track pending notifications pendingNotifications map[string]*pendingHostNotification notificationMutex sync.Mutex @@ -77,6 +85,22 @@ func NewUptimeService(db *gorm.DB, ns *NotificationService) *UptimeService { } } +// SetOrthrusResolver injects the Orthrus session resolver. +// Uses the typed-nil guard pattern established in DockerHandler. +func (s *UptimeService) SetOrthrusResolver(r orthrusStatusChecker) { + if r == nil { + s.orthrusResolver = nil + return + } + + rv := reflect.ValueOf(r) + if (rv.Kind() == reflect.Ptr || rv.Kind() == reflect.Interface) && rv.IsNil() { + s.orthrusResolver = nil + return + } + s.orthrusResolver = r +} + // extractPort extracts the port from a URL or host:port string func extractPort(urlStr string) string { // Try parsing as URL first @@ -270,6 +294,16 @@ func (s *UptimeService) SyncMonitors() error { // The upstream host for grouping upstreamHost := server.Host + // Orthrus-managed servers: connectivity is measured by session liveness, not TCP. + if server.ConnectionType == models.ConnectionTypeOrthrus { + if server.OrthrusAgentUUID == nil || *server.OrthrusAgentUUID == "" { + continue // No agent linked โ€” cannot create a meaningful monitor + } + targetType = "orthrus" + targetURL = *server.OrthrusAgentUUID // Agent UUID as the monitor identifier + // upstreamHost remains server.Host (Tailscale IP) โ€” correct for grouping/display + } + switch err { case gorm.ErrRecordNotFound: // Find or create UptimeHost @@ -382,6 +416,8 @@ func (s *UptimeService) CheckAll() { tcpMonitors := make([]models.UptimeMonitor, 0, len(monitors)) nonTCPMonitors := make([]models.UptimeMonitor, 0, len(monitors)) + // "orthrus" type is not "tcp", so it falls into nonTCPMonitors and + // continues to run checkMonitor independently even when the host is down. for _, monitor := range monitors { normalizedType := strings.ToLower(strings.TrimSpace(monitor.Type)) if normalizedType == "tcp" { @@ -484,6 +520,22 @@ func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) return } + // Fast-path: if every monitor for this host is Orthrus-type, skip the + // TCP pre-check entirely โ€” individual checkMonitor calls determine status. + hasDialable := false + for _, m := range monitors { + if strings.ToLower(m.Type) != "orthrus" { + hasDialable = true + break + } + } + if !hasDialable { + return + } + + // Track whether any non-Orthrus monitor with a valid port was attempted. + attempted := false + // Try to connect to any of the monitor ports with retry logic success := false var msg string @@ -508,6 +560,11 @@ func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) } for _, monitor := range monitors { + // Orthrus liveness is checked per-monitor via session state, not TCP pre-check. + if strings.ToLower(monitor.Type) == "orthrus" { + continue + } + var port string // Use actual backend port from ProxyHost if available @@ -522,6 +579,7 @@ func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) continue } + attempted = true logger.Log().WithFields(map[string]any{ "monitor": monitor.Name, "extracted_port": extractPort(monitor.URL), @@ -554,6 +612,12 @@ func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) } } + // If every monitor for this host is Orthrus-type, there are no dialable ports. + // Skip the TCP pre-check; individual checkMonitor() calls determine status. + if !attempted { + return + } + latency := time.Since(start).Milliseconds() oldStatus := host.Status var newStatus string @@ -807,6 +871,23 @@ func (s *UptimeService) checkMonitor(monitor models.UptimeMonitor) { } else { msg = err.Error() } + case "orthrus": + agentUUID := monitor.URL + if s.orthrusResolver == nil { + msg = "Orthrus subsystem unavailable" + break + } + if agentUUID == "" { + msg = "Monitor missing agent UUID" + break + } + _, ok := s.orthrusResolver.GetProxyAddr(agentUUID) + if ok { + success = true + msg = "Orthrus session active" + } else { + msg = "Orthrus agent not connected" + } default: msg = "Unknown monitor type" } diff --git a/backend/internal/services/uptime_service_test.go b/backend/internal/services/uptime_service_test.go index e3e5c2aa5..93cb0159e 100644 --- a/backend/internal/services/uptime_service_test.go +++ b/backend/internal/services/uptime_service_test.go @@ -1,6 +1,7 @@ package services import ( + "context" "fmt" "net" "net/http" @@ -1890,3 +1891,461 @@ func TestCheckMonitor_TCP_AcceptsRFC1918Address(t *testing.T) { db.First(&result, "id = ?", monitor.ID) assert.Equal(t, "up", result.Status, "TCP monitor to loopback should report up") } + +// --- Orthrus uptime monitoring tests --- + +type mockOrthrusResolver struct { + addr string + ok bool +} + +func (m *mockOrthrusResolver) GetProxyAddr(_ string) (string, bool) { + return m.addr, m.ok +} + +func TestUptimeService_SetOrthrusResolver(t *testing.T) { + t.Run("normal set", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + mock := &mockOrthrusResolver{addr: "127.0.0.1:1234", ok: true} + us.SetOrthrusResolver(mock) + assert.Equal(t, mock, us.orthrusResolver) + }) + + t.Run("nil resolver clears field", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.SetOrthrusResolver(&mockOrthrusResolver{}) + us.SetOrthrusResolver(nil) + assert.Nil(t, us.orthrusResolver) + }) + + t.Run("typed nil resolver clears field", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "127.0.0.1:1234", ok: true}) + + var typedNil *mockOrthrusResolver + us.SetOrthrusResolver(typedNil) + + assert.Nil(t, us.orthrusResolver) + }) + + t.Run("resolver replacement", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + r1 := &mockOrthrusResolver{addr: "a", ok: true} + r2 := &mockOrthrusResolver{addr: "b", ok: false} + us.SetOrthrusResolver(r1) + us.SetOrthrusResolver(r2) + assert.Equal(t, r2, us.orthrusResolver) + }) +} + +func TestSyncMonitors_OrthrusRemoteServer_CreatesOrthrusMonitor(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + agentUUID := "test-agent-uuid-1234" + server := models.RemoteServer{ + UUID: "remote-orthrus-1", + Name: "Orthrus Server", + Host: "100.99.23.57", + Port: 2375, + Scheme: "http", + Enabled: true, + ConnectionType: models.ConnectionTypeOrthrus, + OrthrusAgentUUID: &agentUUID, + } + require.NoError(t, db.Create(&server).Error) + + require.NoError(t, us.SyncMonitors()) + + var monitor models.UptimeMonitor + require.NoError(t, db.Where("remote_server_id = ?", server.ID).First(&monitor).Error) + assert.Equal(t, "orthrus", monitor.Type) + assert.Equal(t, agentUUID, monitor.URL) + assert.Equal(t, "100.99.23.57", monitor.UpstreamHost) +} + +func TestSyncMonitors_OrthrusRemoteServer_MigratesExistingTCPMonitor(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + agentUUID := "migrate-agent-uuid" + server := models.RemoteServer{ + UUID: "remote-migrate-1", + Name: "Migrate Server", + Host: "100.99.23.58", + Port: 2375, + Scheme: "http", + Enabled: true, + ConnectionType: models.ConnectionTypeOrthrus, + OrthrusAgentUUID: &agentUUID, + } + require.NoError(t, db.Create(&server).Error) + + legacyMonitor := models.UptimeMonitor{ + ID: "legacy-tcp-monitor", + RemoteServerID: &server.ID, + Name: server.Name, + Type: "tcp", + URL: fmt.Sprintf("%s:%d", server.Host, server.Port), + UpstreamHost: server.Host, + Enabled: true, + Status: "up", + } + require.NoError(t, db.Create(&legacyMonitor).Error) + + require.NoError(t, us.SyncMonitors()) + + var updated models.UptimeMonitor + require.NoError(t, db.Where("remote_server_id = ?", server.ID).First(&updated).Error) + assert.Equal(t, "orthrus", updated.Type) + assert.Equal(t, agentUUID, updated.URL) +} + +func TestSyncMonitors_NonOrthrusRemoteServer_StillUsesHTTP(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + server := models.RemoteServer{ + UUID: "remote-direct-1", + Name: "Direct Server", + Host: "192.168.1.100", + Port: 8080, + Scheme: "http", + Enabled: true, + ConnectionType: models.ConnectionTypeDirect, + } + require.NoError(t, db.Create(&server).Error) + + require.NoError(t, us.SyncMonitors()) + + var monitor models.UptimeMonitor + require.NoError(t, db.Where("remote_server_id = ?", server.ID).First(&monitor).Error) + assert.Equal(t, "http", monitor.Type) + assert.Contains(t, monitor.URL, "192.168.1.100") +} + +func TestCheckHost_OrthrusOnlyHost_SkipsTCPDial(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + uptimeHost := models.UptimeHost{ + Host: "100.99.23.57", + Name: "Orthrus Only Host", + Status: "pending", + } + require.NoError(t, db.Create(&uptimeHost).Error) + + hostID := uptimeHost.ID + orthrusMonitor := models.UptimeMonitor{ + ID: "orthrus-only-1", + Name: "Orthrus Monitor", + Type: "orthrus", + URL: "some-agent-uuid", + Enabled: true, + Status: "pending", + UptimeHostID: &hostID, + UpstreamHost: "100.99.23.57", + } + require.NoError(t, db.Create(&orthrusMonitor).Error) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + us.checkHost(ctx, &uptimeHost) + + var refreshed models.UptimeHost + require.NoError(t, db.Where("id = ?", uptimeHost.ID).First(&refreshed).Error) + assert.Equal(t, "pending", refreshed.Status, "Orthrus-only host should not have TCP pre-check run") +} + +func TestCheckHost_MixedHost_OrthrusAndTCP_DialsTCPOnly(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.config.FailureThreshold = 1 + + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + port := ln.Addr().(*net.TCPAddr).Port + go func() { + conn, _ := ln.Accept() + if conn != nil { + _ = conn.Close() + } + }() + t.Cleanup(func() { _ = ln.Close() }) + + uptimeHost := models.UptimeHost{ + Host: "127.0.0.1", + Name: "Mixed Host", + Status: "pending", + } + require.NoError(t, db.Create(&uptimeHost).Error) + + hostID := uptimeHost.ID + orthrusMonitor := models.UptimeMonitor{ + ID: "mixed-orthrus-1", + Name: "Orthrus Monitor", + Type: "orthrus", + URL: "agent-uuid-xyz", + Enabled: true, + Status: "pending", + UptimeHostID: &hostID, + UpstreamHost: "127.0.0.1", + } + tcpMonitor := models.UptimeMonitor{ + ID: "mixed-tcp-1", + Name: "TCP Monitor", + Type: "tcp", + URL: fmt.Sprintf("127.0.0.1:%d", port), + Enabled: true, + Status: "pending", + UptimeHostID: &hostID, + UpstreamHost: "127.0.0.1", + } + require.NoError(t, db.Create(&orthrusMonitor).Error) + require.NoError(t, db.Create(&tcpMonitor).Error) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + us.checkHost(ctx, &uptimeHost) + + var refreshed models.UptimeHost + require.NoError(t, db.Where("id = ?", uptimeHost.ID).First(&refreshed).Error) + assert.Equal(t, "up", refreshed.Status, "Mixed host TCP port should succeed") +} + +func TestCheckMonitor_OrthrusType_AgentConnected_ReturnsUp(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "127.0.0.1:54321", ok: true}) + + monitor := models.UptimeMonitor{ + ID: "orthrus-up-1", + Name: "Orthrus Connected", + Type: "orthrus", + URL: "connected-agent-uuid", + Enabled: true, + Status: "pending", + MaxRetries: 1, + } + require.NoError(t, db.Create(&monitor).Error) + + us.checkMonitor(monitor) + + var refreshed models.UptimeMonitor + require.NoError(t, db.Where("id = ?", monitor.ID).First(&refreshed).Error) + assert.Equal(t, "up", refreshed.Status) + + var heartbeat models.UptimeHeartbeat + require.NoError(t, db.Where("monitor_id = ?", monitor.ID).Order("created_at desc").First(&heartbeat).Error) + assert.Equal(t, "up", heartbeat.Status) + assert.Equal(t, "Orthrus session active", heartbeat.Message) +} + +func TestCheckMonitor_OrthrusType_AgentDisconnected_ReturnsDown(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "", ok: false}) + + monitor := models.UptimeMonitor{ + ID: "orthrus-down-1", + Name: "Orthrus Disconnected", + Type: "orthrus", + URL: "disconnected-agent-uuid", + Enabled: true, + Status: "pending", + MaxRetries: 1, + FailureCount: 0, + } + require.NoError(t, db.Create(&monitor).Error) + + us.checkMonitor(monitor) + + var refreshed models.UptimeMonitor + require.NoError(t, db.Where("id = ?", monitor.ID).First(&refreshed).Error) + assert.Equal(t, "down", refreshed.Status) + + var heartbeat models.UptimeHeartbeat + require.NoError(t, db.Where("monitor_id = ?", monitor.ID).Order("created_at desc").First(&heartbeat).Error) + assert.Equal(t, "down", heartbeat.Status) + assert.Equal(t, "Orthrus agent not connected", heartbeat.Message) +} + +func TestCheckMonitor_OrthrusType_NilResolver_ReturnsDown(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + monitor := models.UptimeMonitor{ + ID: "orthrus-nil-resolver", + Name: "Orthrus Nil Resolver", + Type: "orthrus", + URL: "some-agent-uuid", + Enabled: true, + Status: "pending", + MaxRetries: 1, + FailureCount: 0, + } + require.NoError(t, db.Create(&monitor).Error) + + us.checkMonitor(monitor) + + var heartbeat models.UptimeHeartbeat + require.NoError(t, db.Where("monitor_id = ?", monitor.ID).Order("created_at desc").First(&heartbeat).Error) + assert.Equal(t, "Orthrus subsystem unavailable", heartbeat.Message) +} + +func TestCheckAll_OrthrusMonitor_NotShortCircuitedWhenHostDown(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.config.FailureThreshold = 1 + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "", ok: false}) + + uptimeHost := models.UptimeHost{ + Host: "100.99.23.57", + Name: "Down Orthrus Host", + Status: "down", + } + require.NoError(t, db.Create(&uptimeHost).Error) + + hostID := uptimeHost.ID + orthrusMonitor := models.UptimeMonitor{ + ID: "checkall-orthrus-1", + Name: "Orthrus Not Short-Circuited", + Type: "orthrus", + URL: "agent-uuid", + Enabled: true, + Status: "pending", + UptimeHostID: &hostID, + UpstreamHost: "100.99.23.57", + MaxRetries: 1, + } + require.NoError(t, db.Create(&orthrusMonitor).Error) + + us.CheckAll() + + assert.Eventually(t, func() bool { + var refreshed models.UptimeMonitor + if db.Where("id = ?", orthrusMonitor.ID).First(&refreshed).Error != nil { + return false + } + return refreshed.Status == "down" + }, 3*time.Second, 25*time.Millisecond) + + var heartbeat models.UptimeHeartbeat + err := db.Where("monitor_id = ?", orthrusMonitor.ID).Order("created_at desc").First(&heartbeat).Error + require.NoError(t, err) + assert.Equal(t, "Orthrus agent not connected", heartbeat.Message, + "Orthrus monitor should be checked via checkMonitor, not short-circuited") + assert.NotEqual(t, "Host unreachable", heartbeat.Message) +} + +// TestSyncMonitors_OrthrusRemoteServer_NilAgentUUID_SkipsMonitorCreation verifies +// that an Orthrus-type server with a nil OrthrusAgentUUID is skipped without +// creating a monitor (the continue guard inside SyncMonitors). +func TestSyncMonitors_OrthrusRemoteServer_NilAgentUUID_SkipsMonitorCreation(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + server := models.RemoteServer{ + UUID: "remote-nil-agent", + Name: "Nil Agent Server", + Host: "100.99.23.60", + Port: 2375, + Scheme: "http", + Enabled: true, + ConnectionType: models.ConnectionTypeOrthrus, + OrthrusAgentUUID: nil, + } + require.NoError(t, db.Create(&server).Error) + + require.NoError(t, us.SyncMonitors()) + + var count int64 + db.Model(&models.UptimeMonitor{}).Where("upstream_host = ?", server.Host).Count(&count) + assert.Equal(t, int64(0), count, "no monitor should be created when OrthrusAgentUUID is nil") +} + +// TestCheckMonitor_OrthrusType_EmptyURL_ReturnsDown covers the empty agent UUID +// guard inside the "orthrus" case of checkMonitor (msg = "Monitor missing agent UUID"). +func TestCheckMonitor_OrthrusType_EmptyURL_ReturnsDown(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "127.0.0.1:1234", ok: true}) + + monitor := models.UptimeMonitor{ + ID: "orthrus-empty-url", + Name: "Orthrus Empty URL", + Type: "orthrus", + URL: "", + Enabled: true, + Status: "pending", + MaxRetries: 1, + } + require.NoError(t, db.Create(&monitor).Error) + + us.checkMonitor(monitor) + + var heartbeat models.UptimeHeartbeat + require.NoError(t, db.Where("monitor_id = ?", monitor.ID). + Order("created_at desc").First(&heartbeat).Error) + assert.Equal(t, "Monitor missing agent UUID", heartbeat.Message) +} + +// TestCheckHost_NonOrthrusMonitorNoPort_SkipsTCPDial covers the !attempted early +// return: hasDialable is true (there is a non-orthrus monitor) but the monitor's +// URL yields no parseable port, so attempted stays false and checkHost returns +// without updating the host status. +func TestCheckHost_NonOrthrusMonitorNoPort_SkipsTCPDial(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + uptimeHost := models.UptimeHost{ + Host: "10.0.0.1", + Name: "No Port Host", + Status: "pending", + } + require.NoError(t, db.Create(&uptimeHost).Error) + + hostID := uptimeHost.ID + noPortMonitor := models.UptimeMonitor{ + ID: "no-port-monitor-1", + Name: "No Port Monitor", + Type: "http", + URL: "just-a-hostname", + Enabled: true, + Status: "pending", + UptimeHostID: &hostID, + UpstreamHost: "10.0.0.1", + MaxRetries: 1, + } + require.NoError(t, db.Create(&noPortMonitor).Error) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + us.checkHost(ctx, &uptimeHost) + + var refreshed models.UptimeHost + require.NoError(t, db.Where("id = ?", uptimeHost.ID).First(&refreshed).Error) + assert.Equal(t, "pending", refreshed.Status, + "host status must not change when no TCP dial was attempted") +} diff --git a/docs/plans/current_spec.md b/docs/plans/current_spec.md index 34c7d9189..6ad01c784 100644 --- a/docs/plans/current_spec.md +++ b/docs/plans/current_spec.md @@ -1,682 +1,250 @@ ---- -post_title: Caddy Version Drift in GitHub Security Root-Cause Plan -categories: - - plans -tags: - - security - - trivy - - docker - - github-actions - - supply-chain -summary: Investigation and durable remediation plan to identify why GitHub Security still reports Caddy 2.11.2 after Dockerfile was updated to 2.11.3, with concrete verification commands, failure-mode matrix, and commit slicing. -post_date: 2026-05-24 ---- - ## Introduction -### Overview - -Dockerfile now pins Caddy to 2.11.3, but GitHub Security still shows Trivy -alerts indicating 2.11.2. This plan defines how to identify the exact reporting -source (workflow, category, artifact, branch, platform, or cache path), prove -root cause with evidence, and implement durable fixes so alert state converges -with actual shipped images. - -### Objectives - -- Identify the precise workflow/category/artifact still emitting Caddy 2.11.2. -- Determine why that path bypassed or outlived the Dockerfile update. -- Define durable remediation over one-off closures. -- Validate remediation with repeatable commands and expected outcomes. - -### Scope - -In scope: +This plan targets the latest CI failure in job Security Scan PR Image and is +scoped to workflow-only changes in +[.github/workflows/docker-build.yml](.github/workflows/docker-build.yml). -- GitHub Actions workflows that build images and upload SARIF. -- Trivy/Grype/SBOM/provenance pipeline wiring. -- PR/push/nightly/weekly branch and category behavior. -- Local task/skill wiring that influences operator assumptions. -- Minimal config updates needed for long-term correctness. - -Out of scope: - -- Unrelated frontend/backend product behavior. -- Dependency policy changes not related to the stale Caddy signal. - -## Requirements - -### EARS Requirements - -- WHEN Dockerfile pins Caddy to 2.11.3, THE SYSTEM SHALL report Caddy 2.11.3 - in all image-based security scans for the same source revision. -- WHEN SARIF is uploaded from multiple workflows, THE SYSTEM SHALL use - unambiguous categories that map to active workflows only. -- IF a workflow scans a partial artifact (for example only one binary), THEN THE - SYSTEM SHALL not be treated as authoritative for full container component - status. -- WHEN builds are skipped or use stale tags/artifacts, THE SYSTEM SHALL emit an - explicit signal that scan freshness is unknown. -- WHEN multi-arch images are published, THE SYSTEM SHALL verify Caddy version - per relevant platform digest, not only by floating tag. +Objectives: +- Explain why diagnostics still reports fallback with reason parser command + failure. +- Verify whether `.trivyignore` is honored by both PR Trivy steps. +- Keep step 15 (Enforce PR Trivy security gate) as the authoritative blocker. +- Add deterministic diagnostics that can still list unsuppressed IDs when + possible. +- Define minimal edits, exact step names, and validation commands. ## Research Findings -### Confirmed Version Pin - -- Dockerfile sets CADDY_VERSION=2.11.3 and CADDY_CANDIDATE_VERSION=2.11.3. -- Caddy is built from source in caddy-builder and copied into runtime. - -### Exact Files and Workflows to Inspect - -#### Primary CI and Security Sources - +### Evidence Pointers + +1. Gate failure is real and is driven by `trivy-scan` outcome: + - log shows enforce step evaluating failure and exiting 1: + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1551) + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1579) + - workflow gate logic is explicitly tied to `steps.trivy-scan.outcome`: + [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml#L1020) + [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml#L1023) + +2. Diagnostics fallback is specifically parser-command related (not missing + SARIF): + - fallback reason assignment: + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1505) + - fallback summary path in workflow: + [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml#L1005) + +3. `.trivyignore` is passed and discovered in both PR scan steps: + - table scan input and ignore discovery: + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L711) + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L716) + - SARIF scan input and ignore discovery: + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1172) + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1177) + - corresponding workflow step configuration: + [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml#L852) + [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml#L862) + - IDs repeatedly seen in log are present in ignore file (example set): + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1208) + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1218) + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1256) + [.trivyignore](.trivyignore#L39) + [.trivyignore](.trivyignore#L49) + [.trivyignore](.trivyignore#L88) + +4. Signal mismatch is present: + - table output reports 0 vulnerabilities and indicates suppression occurred: + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L832) + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L833) + - SARIF step still exits with code 1: + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1292) + [.github/logs/ci_failure.log](.github/logs/ci_failure.log#L1294) + +5. Current diagnostics parser is brittle and hides useful failure detail: + - large coupled jq filter with package regex extraction: + [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml#L933) + [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml#L960) + - stderr is redirected away in the command path used for findings JSON, + making root parser error opaque in logs. + +### Root Cause Assessment + +1. Why parser command failure still happens: +- The diagnostics step couples ID extraction and package text parsing in one jq + pipeline. Any jq runtime/shape issue in optional fields can fail the whole + command and force fallback. +- The current implementation suppresses parser stderr (`2>/dev/null`) for the + core parser command, so CI only records generic fallback reason without + actionable parser detail. + +2. Are table and SARIF scans honoring ignore file: +- Evidence shows both steps load `.trivyignore`. +- Therefore ignore-file loading is not the primary failure. +- Most likely mismatch is due to scan surface differences when SARIF step exits + 1 while table shows 0 vulnerabilities. A minimal deterministic fix is to + pin both PR Trivy steps to vulnerability scanner scope (`vuln`) so the gate + aligns exactly with unsuppressed vulnerability IDs governed by + `.trivyignore`. + +## Technical Specification (Workflow-Only) + +File in scope: - [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml) -- [.github/workflows/security-pr.yml](.github/workflows/security-pr.yml) -- [.github/workflows/security-weekly-rebuild.yml](.github/workflows/security-weekly-rebuild.yml) -- [.github/workflows/nightly-build.yml](.github/workflows/nightly-build.yml) -- [.github/workflows/supply-chain-pr.yml](.github/workflows/supply-chain-pr.yml) -- [.github/workflows/supply-chain-verify.yml](.github/workflows/supply-chain-verify.yml) - -#### Scan Configuration and Ignore Policy - -- [trivy.yaml](trivy.yaml) -- [.trivyignore](.trivyignore) -- [.grype.yaml](.grype.yaml) - -#### Task Wiring and Local Operator Paths - -- [.vscode/tasks.json](.vscode/tasks.json) -- [.github/skills/scripts/skill-runner.sh](.github/skills/scripts/skill-runner.sh) -- [.github/skills/security-scan-trivy-scripts/run.sh](.github/skills/security-scan-trivy-scripts/run.sh) -- [.github/skills/security-scan-docker-image-scripts/run.sh](.github/skills/security-scan-docker-image-scripts/run.sh) - -#### Documentation that Influences Scan Interpretation - -- [SECURITY.md](SECURITY.md) -- [docs/guides/supply-chain-security-developer-guide.md](docs/guides/supply-chain-security-developer-guide.md) -- [ARCHITECTURE.md](ARCHITECTURE.md) - -### High-Signal Observations - -- docker-build.yml uploads Trivy SARIF under multiple categories, including a - legacy compatibility alias: - .github/workflows/docker-publish.yml:build-and-push. -- security-pr.yml does filesystem scan of extracted /app/charon binary, - not full image and not /usr/bin/caddy; this path cannot be authoritative for - Caddy image component status. -- nightly-build.yml scans category trivy-nightly and uses separate branch - flow; stale findings can persist if nightly diverges from main or is not - rebuilt after merge. -- docker-build.yml has skip logic for chore/Renovate patterns; if build is - skipped on a critical version-bump commit, canonical scan categories may not - refresh. -- supply-chain-verify.yml has pull-request tag logic that still references - pr- style in some paths while docker-build.yml emits immutable - pr-- tags; this can cause wrong-artifact lookups in some modes. -- Local Trivy skill scans repository filesystem (trivy fs /app), which can - disagree with image scans and should not be used to infer GitHub Security - image alert closure behavior. - -## Probable Failure Modes -### FM-1 Stale SARIF Category Track (Legacy Alias) +Exact step names in scope: +1. Run Trivy scan on PR image (table output) +2. Run Trivy scan on PR image (SARIF - blocking) +3. Diagnose unsuppressed PR Trivy blockers +4. Enforce PR Trivy security gate (no logic change) -- Source: docker-build.yml compatibility upload to - .github/workflows/docker-publish.yml:build-and-push. -- Effect: old category can keep/open alerts even after active workflow updates. +### Minimal Edit Set -### FM-2 Scanner Targets Wrong Artifact +### Edit A: Align scanner surface for both PR Trivy steps -- Source: security-pr.yml scans only extracted /app/charon. -- Effect: misses Caddy binary (/usr/bin/caddy) status and gives false closure - confidence. +Steps: +- Run Trivy scan on PR image (table output) +- Run Trivy scan on PR image (SARIF - blocking) -### FM-3 Build Skipped, Scan Never Refreshed +Change: +- Add explicit `scanners: 'vuln'` to both steps. -- Source: skip logic on chore/bot commits in docker-build.yml. -- Effect: post-merge categories not refreshed, old alerts remain open. +Reason: +- Makes table and SARIF behavior deterministic for vulnerability gating. +- Ensures `.trivyignore` governs the same result set used by both outputs. +- Preserves intended gate semantics: fail on unsuppressed vulnerabilities or + scan execution failure. -### FM-4 Branch Mismatch +### Edit B: Make diagnostics deterministic and ID-first -- Source: nightly/dev/main have separate scan categories and schedules. -- Effect: Security tab still shows 2.11.2 from non-main category/ref. +Step: +- Diagnose unsuppressed PR Trivy blockers -### FM-5 Stale Cached Build Output +Changes: +- Keep prechecks for file missing/unreadable/invalid JSON/unexpected schema. +- Split parsing into two paths: + - authoritative path: extract unsuppressed IDs only, with robust optional + navigation and `try` safeguards. + - optional enrichment path (package parsing) that never controls fallback. +- On parser failure: + - record `parser_exit_code`. + - capture first parser stderr line in summary (sanitized, no secrets). +- Summary always includes deterministic fields: + - Diagnostics status + - Fallback reason (if any) + - Count + - Blocker IDs (or unknown when parser truly cannot produce IDs) -- Source: BuildKit cache behavior in certain workflows/stages. -- Effect: rebuilt image may still embed older Caddy artifact in one path. +Reason: +- Guarantees best-effort blocker-ID reporting even when enrichment fails. +- Removes ambiguity from current generic parser fallback. -### FM-6 PR Tag and Artifact Resolution Drift +### Edit C: Preserve authoritative gate semantics -- Source: mutable/legacy tag expectations (pr-) vs immutable tags - (pr--). -- Effect: scanner pulls unexpected image. +Step: +- Enforce PR Trivy security gate -### FM-7 Trivy DB/Cache Timing and Feed Drift +Change: +- No behavioral change. -- Source: DB cache freshness differences between runs. -- Effect: inconsistent vulnerability metadata across runs. +Invariant: +- Keep `if: always()`. +- Keep pass/fail decision tied only to `steps.trivy-scan.outcome`. +- Diagnostics output remains informational and non-authoritative. -### FM-8 Multi-Arch Manifest Mismatch - -- Source: one architecture updated while another remains old. -- Effect: alerts persist for platform-specific digest even if amd64 looks fixed. - -### FM-9 Old SARIF Still Open on Default Branch - -- Source: no subsequent successful upload for same category/ref to close prior - result set. -- Effect: stale findings remain visible. - -## Technical Specifications - -### Investigation Data Model - -For each open Caddy finding, collect this tuple: - -- alert number -- tool name (Trivy) -- state -- most recent instance ref -- SARIF category -- workflow/run URL -- scanned target type (image or fs) -- image ref or digest (if applicable) -- observed Caddy version evidence - -### Verification Commands and Expected Outcomes +## Implementation Plan -#### 1. Enumerate Open Trivy Alerts and Categories (Paginated + Ref-Aware) +### Phase 1: Scanner Alignment +- Add `scanners: 'vuln'` to both PR Trivy scan steps. +- Do not change `severity`, `trivyignores`, `continue-on-error`, or + `exit-code` behavior. -```bash -# Optional: set REF_PREFIX to constrain results to a branch namespace. -# Examples: refs/heads/main, refs/heads/nightly -REF_PREFIX="refs/heads/main" - -gh api --paginate \ - -H "Accept: application/vnd.github+json" \ - "/repos///code-scanning/alerts?tool_name=Trivy&state=open&per_page=100" \ - | jq -r --arg refPrefix "$REF_PREFIX" ' - .[] - | select(.most_recent_instance.ref | startswith($refPrefix)) - | [ - .number, - .most_recent_instance.ref, - .most_recent_instance.commit_sha, - .most_recent_instance.category, - .most_recent_instance.analysis_key, - .most_recent_instance.location.path, - .most_recent_instance.message.text - ] | @tsv' -``` +### Phase 2: Diagnostics Refactor +- Refactor diagnostics step to ID-first parsing and deterministic fallback + taxonomy. +- Add parser exit-code and first-error-line reporting when parsing fails. -Expected outcome: +### Phase 3: Gate Invariant Verification +- Confirm enforce step remains unchanged in semantics and remains step-15 + authority. -- Full open-alert inventory is complete (no missed pages). -- Each open alert is mapped to ref + category + analysis key for correlation. -- Explicit pass condition: every alert containing Caddy 2.11.2 has a known - category/ref owner row in the investigation table. +## Validation Commands -#### 2. Map Alert Category/Ref to Workflow Runs +Run from repository root: ```bash -gh run list --workflow docker-build.yml --branch main --limit 50 \ - --json databaseId,headSha,createdAt,conclusion,url - -gh run list --workflow security-weekly-rebuild.yml --branch main --limit 20 \ - --json databaseId,headSha,createdAt,conclusion,url - -gh run list --workflow nightly-build.yml --branch nightly --limit 20 \ - --json databaseId,headSha,createdAt,conclusion,url +actionlint .github/workflows/docker-build.yml ``` -Expected outcome: - -- For each alert tuple, at least one candidate run is identified by matching - workflow + ref + headSha window. -- Explicit pass condition: no unresolved alert tuple remains without a candidate - producing run. - -#### 3. Correlate SARIF to Alerts Using Structured Fields (Not String-Only) - ```bash -# After downloading SARIF from candidate runs, extract structured tuples. -jq -r ' - .runs[] as $run - | $run.results[]? - | [ - ($run.automationDetails.id // ""), - (.ruleId // ""), - (.level // ""), - (.locations[0].physicalLocation.artifactLocation.uri // ""), - (.partialFingerprints.primaryLocationLineHash // ""), - (.properties.image_ref // .properties.imageRef // ""), - (.message.text // "") - ] | @tsv -' trivy-results.sarif - -# Optional diagnostic grep is allowed only as a secondary check. -grep -En "2\.11\.(2|3)|caddyserver/caddy" trivy-results.sarif || true +rg -n "Run Trivy scan on PR image \(table output\)|Run Trivy scan on PR image \(SARIF - blocking\)|Diagnose unsuppressed PR Trivy blockers|Enforce PR Trivy security gate|scanners:" .github/workflows/docker-build.yml ``` -Expected outcome: - -- Structured SARIF fields map back to alert category/ref/analysis key lineage. -- Explicit pass condition: each open 2.11.2 alert is backed by one matched SARIF - result tuple from the owning workflow/category. - -#### 4. Verify Built Image Caddy Version by Digest +If local image and Trivy are available: ```bash -IMAGE="ghcr.io//charon@sha256:" -docker pull "$IMAGE" -docker run --rm --entrypoint caddy "$IMAGE" version +trivy image --format table --severity CRITICAL,HIGH --scanners vuln --ignorefile .trivyignore ``` -Expected outcome: - -- Returns v2.11.3 for corrected artifacts. -- If v2.11.2, confirms build-path/cache/tag issue. -- Explicit pass condition: all active-category digests under investigation report - v2.11.3. - -#### 5. Verify Multi-Arch Digests - ```bash -docker buildx imagetools inspect "ghcr.io//charon@sha256:" +trivy image --format sarif --severity CRITICAL,HIGH --scanners vuln --ignorefile .trivyignore -o trivy-pr-results.sarif ``` -Expected outcome: - -- Lists per-platform manifests; follow with per-platform checks where possible. -- Confirms whether one architecture still carries 2.11.2. -- Explicit pass condition: no platform digest in active categories reports - Caddy 2.11.2. - -#### 6. Validate Workflow Skip Behavior on Merge Commit - ```bash -gh run view --log | grep -n "skip_build\|Determine skip condition" +jq -e '.' trivy-pr-results.sarif && jq -e '(.runs | type) == "array"' trivy-pr-results.sarif ``` -Expected outcome: - -- Confirms whether key post-merge build/scan was skipped due to commit title or - actor rules. -- Explicit pass condition: security-relevant commits are not skipped, or skip is - accompanied by an accepted/manual refresh path. - -#### 7. Validate Scan Target Type - -```bash -# Read workflow logic directly for scan target -grep -n "scan-type\|scan-ref\|image-ref\|/app/charon\|/usr/bin/caddy" .github/workflows/security-pr.yml .github/workflows/docker-build.yml -``` - -Expected outcome: - -- Evidence of fs-only binary scan vs full image scan path. -- Explicit pass condition: authoritative image-scan workflow is clearly - identified and documented for closure decisions. - -## Durable Remediation Options (Prioritized) - -### Priority 0 Legacy SARIF Category Closure Backfill Before Retirement - -- Build a closure inventory of all open legacy-category alerts keyed by: - category + ref + workflow + alert number. -- Run an explicit backfill cycle that refreshes those legacy tracks with current - results before retiring compatibility categories. -- Retire compatibility categories only after the backfill shows no unresolved - Caddy 2.11.2 lineage in those tracks. - -Why: - -- Prevents orphaned legacy alert tracks from surviving category retirement. -- Ensures closure state is auditable before compatibility removal. - -### Priority 1 Remove Legacy SARIF Alias Categories - -- Remove compatibility uploads in docker-build.yml for: - - .github/workflows/docker-publish.yml:build-and-push - - any duplicate compatibility category not actively used. -- Keep one canonical category per scan job. - -Why: - -- Prevents long-tail stale alert tracks and ambiguous ownership. - -### Priority 2 Enforce Canonical Image Scan as Source of Truth - -- Treat image-based Trivy scan in docker-build.yml and - security-weekly-rebuild.yml as authoritative for Caddy status. -- Keep security-pr.yml binary scan as supplemental, or extend it to extract - and scan /usr/bin/caddy explicitly if retained for Caddy signal. -- Add explicit category ownership and normalization for - security-weekly-rebuild.yml (set stable category naming and owner cadence). - -Why: - -- Eliminates partial-artifact blind spots. -- Removes implicit category drift in weekly rebuild uploads. - -### Priority 3 Tighten Post-Merge Refresh Guarantees - -- Ensure Dockerfile/security-significant changes cannot bypass build+scan via - skip logic. -- Add a condition override: if Dockerfile or workflow security files changed, - force build/scan. - -Why: - -- Guarantees scan freshness after security-relevant merges. - -### Priority 4 Normalize PR Tag and Artifact Resolution - -- Align all workflows on immutable pr-- semantics. -- Remove or gate legacy pr- fallback paths. - -Why: - -- Avoids scanning wrong images. - -### Priority 5 Add Freshness Telemetry and Closure Proof - -- Emit build digest, Caddy version, and scan category into step summary and - retained artifact for each security upload. -- Optional: add a post-scan assertion that SARIF run metadata references same - digest scanned. - -Why: - -- Makes stale-path diagnosis immediate. - -### Priority 6 Multi-Arch Explicit Verification - -- Add per-arch Caddy version check in CI before SARIF upload, at least for - linux/amd64 and linux/arm64 manifests where available. - -Why: - -- Prevents hidden platform drift. - -## Review of .gitignore, .dockerignore, codecov.yml, Dockerfile - -### .gitignore - -- No change required for this issue. -- Existing SARIF/artifact ignores are appropriate. - -### .dockerignore - -- No change required for this issue. -- Current exclusions do not explain stale Caddy finding source. - -### codecov.yml - -- No change required for this issue. -- Coverage settings are unrelated to code scanning freshness. - -### Dockerfile - -- Functional version pin is already correct at 2.11.3. -- Optional hardening (recommended if issue persists): - - add explicit build-time assertion that built Caddy reports expected major/minor/patch; - - emit OCI label with built Caddy version for traceability. - -## Implementation Plan - -### Phase 1 Playwright/UI Validation - -Objective: - -- Confirm this problem is CI/security-pipeline only, not a UI regression. -- Run UI/Playwright checks conditionally when product-surface files changed. - -Tasks: - -1. Compute change scope: - - workflow/security-pipeline-only change (for example .github/workflows, - scripts security tooling, scan configs), or - - product-surface change (frontend/backend/runtime behavior). -2. If product-surface change is present, run targeted smoke UI task(s) per DoD - policy. -3. If workflow/security-pipeline-only, record conditional waiver and skip UI - execution. -4. Record that no UI behavior change is expected from this remediation. - -Validation gate: - -- For product-surface changes: no UI regressions. -- For workflow/security-only changes: waiver recorded with scope evidence. - -### Phase 2 Root-Cause Evidence Collection - -Objective: - -- Prove exact stale reporting source. - -Tasks: - -1. Enumerate open Trivy alerts with category/ref. -2. Correlate each with workflow run and SARIF artifact. -3. Confirm scanned image digest and runtime Caddy version. -4. Determine whether issue is category-stale, artifact mismatch, skip-build, - branch mismatch, or multi-arch divergence. - -Validation gate: - -- One or more root causes selected, each mapped to category + ref + workflow - with evidence artifact links. - -### Phase 3 Workflow Remediation - -Objective: - -- Remove ambiguity and enforce canonical scan path. - -Tasks: - -1. Execute legacy-category closure backfill for affected category/ref/workflow - tuples. -2. Remove legacy category uploads in docker-build.yml after verified backfill. -3. Align category naming to one active track per workflow. -4. Add explicit weekly category ownership/normalization for - security-weekly-rebuild.yml. -5. Add force-scan behavior for Dockerfile/security-file changes. -6. Align PR image tag/artifact resolution where drift exists. - -Validation gate: - -- Backfill completed and documented before compatibility retirement. -- New run uploads SARIF to canonical category only; no legacy category updates. - -### Phase 4 Verification and Backfill - -Objective: - -- Demonstrate closure behavior and prevent recurrence. - -Tasks: - -1. Re-run relevant workflows on latest main commit. -2. Verify SARIF reflects Caddy 2.11.3. -3. Verify GitHub Security open Trivy alerts for Caddy 2.11.2 close or are - superseded by resolved state. -4. If needed, run weekly/nightly paths to refresh branch-specific categories. - -Validation gate: - -- No open Caddy 2.11.2 findings remain in active categories for current refs. -- Legacy categories either show verified closure after backfill or are - formally tracked with owner + retirement checkpoint. - -### Phase 5 Documentation and Governance - -Objective: - -- Keep future investigations short and deterministic. - -Tasks: - -1. Update SECURITY.md and supply-chain docs with category/branch ownership. -2. Document canonical workflow for image-based closure verification. -3. Note deprecated categories and retirement date. - -Validation gate: - -- Docs clearly map alert -> category -> workflow -> digest -> component version. +## Expected Outcomes + +1. Diagnostics parser failure path becomes explicit and actionable: + - fallback includes parser exit code and parser error hint. + +2. Ignore behavior is deterministic across PR table and SARIF scans: + - both use the same scanner surface (`vuln`) and same ignore file. + +3. Step 15 remains authoritative: + - Enforce PR Trivy security gate fails only when `trivy-scan` outcome is not + success. + - diagnostics content cannot change gate decision. + +4. When SARIF is parseable: + - summary lists unsuppressed blocker IDs (or `none`), not generic unknown. + +## Acceptance Criteria (EARS) + +1. WHEN Diagnose unsuppressed PR Trivy blockers runs and SARIF is missing, + THE SYSTEM SHALL emit diagnostics status fallback with reason file missing. +2. WHEN SARIF is unreadable, THE SYSTEM SHALL emit fallback with reason file + unreadable. +3. WHEN SARIF is invalid JSON, THE SYSTEM SHALL emit fallback with reason + invalid JSON. +4. WHEN SARIF schema is unexpected, THE SYSTEM SHALL emit fallback with reason + unexpected schema. +5. WHEN parser execution fails, THE SYSTEM SHALL emit fallback with reason + parser command failure and include parser exit code. +6. WHEN parser enrichment fails but ID extraction succeeds, + THE SYSTEM SHALL still report blocker IDs and parsed status. +7. WHEN both PR Trivy steps execute, + THE SYSTEM SHALL use `.trivyignore` and scanner scope `vuln` consistently. +8. WHEN Enforce PR Trivy security gate executes, + THE SYSTEM SHALL keep `if: always()` and SHALL fail only when + `steps.trivy-scan.outcome` is not success. ## Commit Slicing Strategy -### Decision - -Single PR with ordered logical commits (security pipeline coherence issue, tightly -related files, low product-surface risk). - -### Trigger Reasons - -- Cross-domain but same concern: workflow wiring + security signal integrity. -- High reviewer value from isolated commits (evidence, wiring, docs). -- Fast rollback needed if any category/reporting side effect appears. - -### Ordered Commits - -#### Commit 1 Evidence and Traceability Baseline - -Scope: - -- Add/adjust investigation notes and CI summaries (non-behavioral) to capture - category/digest/Caddy version mapping. - -Files (expected): - -- [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml) -- [docs/plans/current_spec.md](docs/plans/current_spec.md) - -Dependencies: - -- None. - -Validation gate: - -- Workflow summary includes category + digest + caddy version evidence. - -#### Commit 2 Legacy Category Closure Backfill - -Scope: - -- Build and execute backfill for legacy category/ref/workflow tuples. -- Capture closure evidence and retirement readiness criteria. - -Files (expected): - -- [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml) -- [docs/plans/current_spec.md](docs/plans/current_spec.md) - -Dependencies: - -- Commit 1 merged or present in branch. - -Validation gate: - -- Legacy category closure evidence exists for each tracked tuple. - -#### Commit 3 Canonical Category and Weekly Ownership Normalization - -Scope: - -- Remove legacy SARIF alias uploads and keep canonical categories. -- Add explicit weekly category ownership/normalization for - security-weekly-rebuild. -- Optionally refine security-pr.yml target semantics for clarity. - -Files (expected): - -- [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml) -- [.github/workflows/security-weekly-rebuild.yml](.github/workflows/security-weekly-rebuild.yml) -- [.github/workflows/security-pr.yml](.github/workflows/security-pr.yml) - -Dependencies: - -- Commit 2. - -Validation gate: - -- SARIF uploaded to canonical categories only with explicit weekly ownership. - -#### Commit 4 Freshness and Skip-Guard Enforcement - -Scope: - -- Ensure Dockerfile/security-file changes force build+scan refresh. -- Normalize PR tag/artifact resolution where needed. - -Files (expected): - -- [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml) -- [.github/workflows/supply-chain-verify.yml](.github/workflows/supply-chain-verify.yml) -- [.github/workflows/supply-chain-pr.yml](.github/workflows/supply-chain-pr.yml) - -Dependencies: - -- Commit 3. - -Validation gate: - -- Security-relevant change cannot be skipped; image ref resolution deterministic. - -#### Commit 5 Docs and Operational Runbook - -Scope: - -- Document canonical verification path and stale-alert troubleshooting. - -Files (expected): - -- [SECURITY.md](SECURITY.md) -- [docs/guides/supply-chain-security-developer-guide.md](docs/guides/supply-chain-security-developer-guide.md) - -Dependencies: - -- Commit 4. - -Validation gate: - -- Operators can map alert -> category -> workflow -> digest -> component version. - -### Rollback and Contingency - -- If category cleanup causes missing visibility, revert Commit 3 only while - retaining backfill evidence from Commit 2. -- If force-scan logic creates excessive CI load, revert Commit 4 and keep - category cleanup. -- Maintain a one-cycle overlap where canonical and old category results are - compared internally (not uploaded as parallel public categories). - -## Risks and Mitigations - -- Risk: closing legacy category may hide historical trend continuity. - - Mitigation: keep artifact retention and migration note in docs. -- Risk: force-scan increases CI time. - - Mitigation: scope force condition to Dockerfile/security workflow paths only. -- Risk: multi-arch checks add flakiness on runners. - - Mitigation: keep per-arch checks lightweight and deterministic. - -## Acceptance Criteria - -- Root cause identified with evidence from open alerts, SARIF category, and - workflow run mapping. -- Root cause set allows one or more concurrent causes, each mapped to category, - ref, and workflow with evidence. -- Canonical workflows show scans for current digest with Caddy 2.11.3. -- Legacy category closure backfill completed before retirement of compatibility - categories. -- Legacy/ambiguous category uploads removed or explicitly deprecated. -- security-weekly-rebuild has explicit category ownership and normalization. -- Security tab no longer shows active Caddy 2.11.2 findings for current active - categories/refs. -- Durable guardrails in place for skip, tag resolution, and scan target clarity. -- DoD checks pass; any residual findings are documented with explicit owner and - next action. +Decision: +- Single PR with ordered logical commits. + +Commit 1: +- Scope: scanner alignment on PR Trivy steps (`scanners: 'vuln'`). +- Files: [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml) +- Dependencies: none. +- Validation gate: actionlint and step-name/field presence verification. + +Commit 2: +- Scope: diagnostics parser hardening and deterministic fallback reporting. +- Files: [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml) +- Dependencies: Commit 1 preferred (for stable scanner surface). +- Validation gate: actionlint and SARIF parse smoke checks. + +Rollback and contingency: +- Revert Commit 2 first if diagnostics formatting regresses. +- Revert Commit 1 only if scanner-scope change is not desired by policy. +- Keep enforce-gate semantics unchanged in all rollback paths. diff --git a/docs/reports/qa_report.md b/docs/reports/qa_report.md index d550f3cf8..85341dc42 100644 --- a/docs/reports/qa_report.md +++ b/docs/reports/qa_report.md @@ -1,350 +1,158 @@ -# QA & Security Audit โ€” feature/proxy_groups - -| Field | Value | -|-------------|----------------------------------------------------------------------------------------------| -| Date | 2026-05-21 | -| Branch | `feature/proxy_groups` | -| HEAD commit | `1cfe4f36` โ€” Merge branch 'development' into feature/proxy_groups | -| Auditor | QA Security Agent | -| Verdict | โœ… **APPROVED FOR MERGE** | - -## Scope - -Branch introduces the Proxy Groups feature: a new `proxy_group_handler.go` backend -handler, updated `proxy_host_handler.go`, new frontend components and hooks for -drag-and-drop group management, associated tests, locales, and E2E specs. - -**Change summary:** 58 files changed, 4612 insertions, 799 deletions. - -**Key additions:** `BulkUpdateGroup` backend endpoint, `GroupDropZone` and -`ProxyHostDragHandle` components, `useProxyGroupDnD` and `useProxyGroups` hooks, -`bulkUpdateGroup` API, `@dnd-kit/core` and `@dnd-kit/utilities` dependencies, -and i18n keys in 5 locales. - ---- - -## Check Results - -| # | Check | Result | Notes | -|---|------------------------------|-----------|------------------------------------------------------| -| 1 | Backend Tests + Coverage | โœ… PASS | 87.1% coverage, 69s, 0 failures | -| 2 | Frontend Vitest Coverage | โœ… PASS | lcov.info valid (277 KB) | -| 3 | Local Patch Report | โœ… PASS | 93.9% overall, 94.3% backend, 93.4% frontend | -| 4 | TypeScript Type Check | โœ… PASS | 0 errors from `tsc --noEmit` | -| 5 | Pre-commit Hooks (lefthook) | โœ… CLEAN | All tools clean; staged check passed | -| 6 | GORM Security Scan | โœ… PASS | 0 CRITICAL/HIGH; 2 INFO (non-blocking) | -| 7 | Go Linting (golangci-lint) | โœ… PASS | Clean on all branch-modified Go files | -| 8 | ESLint | โœ… PASS | 0 new errors; 1 pre-existing error (not in branch) | -| 9 | Vulnerability Scanning | โœ… PASS | 0 HIGH/CRITICAL (npm audit + Trivy FS) | ---- - -## Step 1 โ€” Backend Unit Tests - -| Item | Value | -|----------|---------------------------------------------------| -| Command | `go test ./internal/api/handlers/... -coverprofile=coverage.txt` | -| Coverage | 87.1% | -| Duration | 69.241s | -| Failures | 0 | - -Coverage exceeds the 85% CI gate. All tests pass. - ---- - -## Step 2 โ€” Frontend Vitest Coverage - -| Item | Value | -|------------------|--------------------------------------| -| Command | `npm run test:coverage` | -| Coverage artifact | `frontend/coverage/lcov.info` (valid, 277 KB) | -| Failures | 0 | - ---- - -## Step 3 โ€” Local Patch Coverage Report - -Generated: 2026-05-21T11:56:43Z, baseline: `origin/main...HEAD` - -| Scope | Changed Lines | Covered Lines | Patch Coverage | Status | -|----------|---:|---:|---:|---| -| Overall | 461 | 433 | 93.9% | pass | -| Backend | 264 | 249 | 94.3% | pass | -| Frontend | 197 | 184 | 93.4% | pass | - -### Files below 95% threshold - -| File | Patch Coverage | Uncovered Lines | Ranges | -|---|---:|---:|---| -| `frontend/src/pages/ProxyHosts.tsx` | 80.6% | 13 | 480-481, 489-490, 766, 768, 770, 773, 1434, 1441, 1465, 1480, 1492 | -| `backend/internal/api/handlers/proxy_host_handler.go` | 91.5% | 8 | 257-258, 848-853 | -| `backend/internal/api/handlers/proxy_group_handler.go` | 93.2% | 7 | 121-122, 124-125, 128-130 | - -All three files exceed the 85% overall gate. The `ProxyHosts.tsx` uncovered lines -are in drag-and-drop event handlers and bulk-operation branches. The Go handler -uncovered lines are error-handling paths (bind failures, DB errors). Both are -candidates for targeted follow-up tests. - ---- - -## Step 4 โ€” TypeScript Compilation - -``` -tsc --noEmit -0 errors -``` - ---- - -## Step 5 โ€” Pre-commit Hooks (lefthook v2.1.8) - -Staged check: all 15 hooks skipped (no staged files) โ€” 0.04s. Clean. - -Individual tool runs (Steps 6โ€“8) confirm all file-scoped hooks pass for every -branch-modified file. No hook failures expected. - ---- - -## Step 6 โ€” GORM Security Scan - -``` -Script: scripts/scan-gorm-security.sh --check -Exit code: 0 โ€” PASS -CRITICAL: 0 -HIGH: 0 -MEDIUM: 0 -INFO: 2 -``` - -The 2 INFO findings are missing-index suggestions on `UserPermittedHost.ProxyHostID` -โ€” informational only, no remediation required for this branch. - ---- - -## Step 7 โ€” Go Linting (golangci-lint) - -Clean on all branch-modified Go files: - -- `backend/internal/api/handlers/proxy_group_handler.go` -- `backend/internal/api/handlers/proxy_host_handler.go` -- `backend/internal/api/routes/routes.go` -- `backend/internal/models/proxy_group.go` -- `backend/internal/models/proxy_host.go` -- `backend/internal/models/uptime.go` -- `backend/internal/services/crowdsec_whitelist_service.go` -- `backend/internal/services/proxy_group_service.go` -- `backend/internal/services/proxyhost_service.go` -- `backend/internal/services/uptime_service.go` - ---- - -## Step 8 โ€” ESLint - -``` -npm run lint -Errors: 1 (pre-existing, not in branch diff) -Warnings: 1064 (all pre-existing) -New issues introduced by branch: 0 -``` - -### Pre-existing error (not introduced by this branch) - -| File | Line | Rule | -|---|---|---| -| `frontend/src/components/__tests__/GroupDropZone.test.tsx` | 29 | `testing-library/prefer-screen-queries` | - -Confirmed pre-existing: `git log origin/main..HEAD -- ` returns empty. -New `@dnd-kit` components lint clean. - ---- - -## Step 9 โ€” Vulnerability Scanning - -### npm audit - -``` -npm audit --audit-level=high (frontend/) -Result: found 0 vulnerabilities -``` - -New branch dependencies `@dnd-kit/core ^6.3.1` and `@dnd-kit/utilities ^3.2.2` -have no known HIGH or CRITICAL vulnerabilities. - -### Trivy filesystem scan - -``` -Scan target: . (Trivy v0.52, fresh DB 2026-05-21) -Scanners: vuln, secret -HIGH/CRITICAL: 0 -``` - -### Backend go.mod - -No changes to `backend/go.mod` in this branch. All existing backend Go -dependencies are unchanged. - -### Agent go.mod additions (test-only, no security impact) - -- `gopkg.in/check.v1` -- `github.com/kr/pretty` -- `github.com/rogpeppe/go-internal` - -### Pre-existing image scan findings (reference only) - -`trivy-image-report.json` artifact (2026-03-24 image scan) documents two -pre-existing HIGH vulnerabilities in the embedded CrowdSec binaries. Not -introduced by this branch; no upstream fix is available. - -| GHSA ID | Package | Fixed? | -|---|---|---| -| `GHSA-6g7g-w4f8-9c9x` | `github.com/buger/jsonparser v1.1.1` | โŒ None | -| `GHSA-jqcq-xjh3-6g23` | `github.com/jackc/pgproto3/v2 v2.3.3` | โŒ None | - -Exploitability is low โ€” both packages are internal to the CrowdSec binary and -not reachable via the Charon API surface. - ---- - -## Open Items - -| # | Severity | Description | Action | -|---|---|---|---| -| 1 | Low | `ProxyHosts.tsx` patch coverage 80.6% (13 uncovered changed lines in DnD/bulk handlers) | Add targeted tests in follow-up | -| 2 | Low | Pre-existing ESLint error `GroupDropZone.test.tsx:29` | Fix in separate cleanup PR | -| 3 | Informational | GORM INFO: missing index on `UserPermittedHost.ProxyHostID` | Add DB migration index in follow-up | -| 4 | Informational | Pre-existing container image: 2 HIGH vulns with no upstream fix | Monitor upstream; no action on this branch | - ---- - -## Verdict - -**โœ… APPROVED FOR MERGE** - -This branch is safe to merge. Unit test coverage for the new DnD components -(`GroupDropZone`, `ProxyHostDragHandle`, `useProxyGroupDnD`) is recommended as -a follow-up task before the next release. - ---- - -# QA & Security Audit โ€” feature/bug_report (FeedbackWidget) +## QA Report - Scoped Workflow Audit | Field | Value | |---|---| -| **Date** | 2026-05-18 | -| **Branch** | `feature/bug_report` | -| **HEAD Commit** | `e157b820` โ€” fix: align ProxyGroupForm test i18n mock with component translation keys | -| **Audit Scope** | Uncommitted working-tree changes: `FeedbackWidget.tsx` (new), `FeedbackWidget.test.tsx` (new), `Layout.tsx` (modified), 5 locale `translation.json` files (modified) | -| **Auditor** | GitHub Copilot (QA Security Mode) | -| **Verdict** | โœ… **PASS** | +| Date | 2026-05-25 | +| Primary target | `.github/workflows/docker-build.yml` | +| Current changed files (`git diff --name-only`) | `.github/workflows/docker-build.yml`, `docs/plans/current_spec.md` | +| Scoped release lens | Workflow behavior and workflow-policy compliance only | ---- +## Required Checks -## Summary +### 1) actionlint `.github/workflows/docker-build.yml` -Frontend-only feature adding a floating feedback FAB button (`FeedbackWidget.tsx`) fixed at -the bottom-right of the layout. When activated, a panel opens with links to the GitHub bug -report and feature request issue templates. The component is integrated into `Layout.tsx` and -translation keys were added to all 5 supported locales. No backend changes, no new -dependencies, no new attack surface. +- Command: `actionlint .github/workflows/docker-build.yml` +- Result: **PASS** +- Evidence: Command returned no findings. -**Feature scope:** `FeedbackWidget.tsx`, `FeedbackWidget.test.tsx`, `Layout.tsx` (one-line -integration), i18n keys in `en`, `de`, `es`, `fr`, `zh` locales. +### 2) Confirm change scope remains workflow-only ---- +- Commands: `git status --short`, `git diff --name-only` +- Result: **PASS (workflow-only for releasability scope)** +- Evidence: + - Workflow file changed: `.github/workflows/docker-build.yml` + - Non-workflow change is documentation/planning only: `docs/plans/current_spec.md` + - No backend/frontend/runtime application paths are modified. -## Check Results +### 3) Confirm step-15 authority semantics preserved in file -| # | Check | Result | Notes | -|---|---|---|---| -| 1 | Targeted Unit Tests (FeedbackWidget + Layout) | โœ… PASS | 31/31 tests pass (15 FeedbackWidget, 16 Layout), ~5.76 s | -| 2 | Full Frontend Test Suite (209 suites, Vitest) | โœ… PASS | 197/209 suites confirmed passing before 600 s timeout; re-run initiated โ€” 0 additional failures found. 1 pre-existing failure (see Pre-existing Failures). | -| 3 | TypeScript Type Check | โœ… PASS | 0 errors โ€” `npx tsc --noEmit` | -| 4 | Frontend ESLint | โœ… PASS (after fix) | 1 MEDIUM error fixed during audit (F1); 8 warnings remain (all LOW / pre-existing, exit 0) | -| 5 | Pre-commit Hooks (lefthook v2.1.6) | โœ… PASS | `frontend-type-check` and `frontend-lint` pass | -| 6 | Semgrep SAST | โœ… PASS | 0 findings โ€” 351 rules across 174 files (configs: `p/react`, `p/typescript`, `p/secrets`) | -| 7 | Security Code Review (manual โ€” OWASP Top 10) | โœ… PASS | See Security Review section | -| 8 | GORM Security Scanner | N/A | Frontend-only change; no backend model changes | +- Location evidence: `.github/workflows/docker-build.yml` lines 1020-1026 +- Check evidence: + - Step `Enforce PR Trivy security gate` still exists. + - It still evaluates `steps.trivy-scan.outcome` and explicitly `exit 1` when scan outcome is not `success`. + - `git diff` contains no edits for this step header/guard/exit logic. +- Result: **PASS (semantics preserved in this patch)** ---- +### 4) Classify non-applicable gates explicitly -## Security Review +- **Conditionally applicable, not triggered by this scope** + - GORM security scan gate: not triggered (no `backend/internal/models/**`, GORM service, or migration path changes). + - E2E container rebuild gate: not triggered for workflow/docs-only changes. +- **Non-applicable for this scoped workflow verdict** + - Backend coverage gate. + - Frontend coverage gate. + - Frontend type-check gate. + - App build gates (backend/frontend). + - App E2E functional gate. -**Scope:** Manual OWASP A01โ€“A10 review of `FeedbackWidget.tsx` and `Layout.tsx`. +### 5) Final scoped releasability verdict -| Risk | Finding | Status | -|---|---|---| -| A03 โ€” Injection (XSS) | No `dangerouslySetInnerHTML`, no user-controlled HTML rendered. All visible text sourced from `useTranslation()` i18n keys only. | โœ… PASS | -| A03 โ€” Open Redirect | Both `` tags use hardcoded GitHub URLs (`GITHUB_BUG_URL`, `GITHUB_FEATURE_URL`). No user input influences the `href`. | โœ… PASS | -| Clickjacking / Tab-napping | Both external links carry `target="_blank"` **and** `rel="noopener noreferrer"`, preventing opener access and referrer leakage. | โœ… PASS | -| Sensitive Data Exposure | No credentials, tokens, or PII in component, test, or locale files. | โœ… PASS | -| Semgrep (p/secrets) | 0 findings. | โœ… PASS | +- **RELEASABLE (WORKFLOW SCOPE)** +- Basis: + - Mandatory workflow lint (`actionlint`) passed. + - Scope contains workflow plus non-runtime documentation only. + - Step-15 gate authority semantics are preserved in-file. ---- +## Concise Evidence Log -## Static Analysis +1. `actionlint .github/workflows/docker-build.yml` -> pass (no output). +2. `git diff --name-only` -> `.github/workflows/docker-build.yml`, `docs/plans/current_spec.md`. +3. `rg -n "Enforce PR Trivy security gate|steps.trivy-scan.outcome" .github/workflows/docker-build.yml` -> step present. +4. `git diff -- .github/workflows/docker-build.yml | rg -n "Enforce PR Trivy security gate|continue-on-error|steps.trivy-scan.outcome"` -> no matching diff lines for gate semantics. -| Tool | Result | Detail | -|---|---|---| -| ESLint | โœ… Exit 0 | 1 MEDIUM error fixed (F1); 8 warnings (all LOW or pre-existing, listed in Findings) | -| TypeScript (`tsc --noEmit`) | โœ… 0 errors | โ€” | -| Semgrep | โœ… 0 findings | 351 rules, 174 files | +## Phase 7 Closure - SARIF Diagnostics Taxonomy Patch ---- +| Field | Value | +|---|---| +| Date | 2026-05-25 | +| Patch scope | Workflow-only diagnostics hardening in `.github/workflows/docker-build.yml` | +| Verdict | **CLOSED - RELEASABLE (WORKFLOW SCOPE)** | -## Pre-commit Hooks (lefthook v2.1.6) +### Scoped Verdict -| Hook | Result | Notes | -|---|---|---| -| `frontend-type-check` | โœ… PASS | | -| `frontend-lint` | โœ… PASS | | -| `check-version-match` | โš ๏ธ PRE-EXISTING | `.version` (v0.21.0) โ‰  latest git tag โ€” pre-existing project-wide issue; unrelated to this feature | +- The diagnostics fallback taxonomy is now explicit and scoped. +- The workflow remains releasable for this patch scope. +- The gate authority model is preserved: step `Enforce PR Trivy security gate` remains the sole blocking authority. ---- +### Key Evidence -## Coverage +1. Baseline failure evidence captured in `.github/logs/ci_failure.log` shows previous generic fallback: + - `FALLBACK_REASON="Unable to parse SARIF results"` +2. Updated workflow evidence in `.github/workflows/docker-build.yml` shows explicit fallback taxonomy: + - `file missing` + - `file unreadable` + - `invalid JSON` + - `unexpected schema` + - `parser command failure` +3. Updated workflow parse behavior for valid SARIF with empty results is non-fallback: + - `.runs` schema check is explicit. + - `FINDINGS_COUNT` defaults to `0` and summary emits parsed state with count `0`. +4. Step-15 authority invariant remains intact in `.github/workflows/docker-build.yml`: + - `Enforce PR Trivy security gate` keeps `if: always()`. + - Gate decision still depends only on `${{ steps.trivy-scan.outcome }}`. + - Diagnostics output does not affect pass/fail. -`FeedbackWidget.tsx` is fully covered by its 15 dedicated unit tests. No coverage regression -introduced to existing suites. No backend coverage impact. +### Closure Notes ---- +- This closure is scoped to workflow behavior and diagnostics classification. +- Runtime application behavior is unchanged. -## E2E Tests (Playwright) +## Scoped Workflow QA Re-Verification -Not run in this audit session. This is a frontend-only cosmetic feature (a floating button and -panel with static links). The full Playwright suite runs in CI and will validate the component -in context. Manual spot-check confirmed the component renders correctly in the layout. +| Field | Value | +|---|---| +| Date | 2026-05-25 | +| Scope under test | `.github/workflows/docker-build.yml` and `docs/plans/current_spec.md` | +| Requested checks | actionlint, scope verification, step-15 authority invariance, releasability verdict | ---- +### Evidence -## Pre-existing Test Failures +1. `actionlint .github/workflows/docker-build.yml` + - Result: **PASS** (no findings) +2. `git diff --name-only` + - Result: `.github/workflows/docker-build.yml`, `docs/plans/current_spec.md` + - Scope assessment: workflow + plan doc only; no runtime app paths changed +3. `rg -n "Enforce PR Trivy security gate|if: always\(\)|steps\.trivy-scan\.outcome" .github/workflows/docker-build.yml` + - Result: gate step and guard logic present +4. `git diff -- .github/workflows/docker-build.yml` + - Result: no changes to step `Enforce PR Trivy security gate`, `if: always()`, or `steps.trivy-scan.outcome` gate decision logic -| Suite | Failures | Attribution | -|---|---|---| -| `ProxyHostForm-dropdown-changes.test.tsx` | 1 test failed (19 959 ms) | Pre-existing โ€” not caused by FeedbackWidget | +### Scoped Releasability Verdict ---- +- **RELEASABLE (WORKFLOW SCOPE)** +- Rationale: + - Mandatory workflow lint passed. + - Change scope remains limited to workflow + planning documentation. + - Step-15 authority semantics remain unchanged and continue to be the sole blocking gate based on `steps.trivy-scan.outcome`. -## Findings Summary +## Final Closure - Workflow-Only Remediation (Scanner Alignment) -| ID | Severity | Category | Finding | Status | -|---|---|---|---|---| -| F1 | ๐ŸŸก MEDIUM โ€” **FIXED** | ESLint / a11y | `jsx-a11y/no-noninteractive-element-interactions` on `