diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 8619215b2..b8d2e558c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -32,7 +32,11 @@ jobs: steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: - ref: ${{ github.event.workflow_run.head_sha || github.sha }} + # For pull_request events, use the branch HEAD SHA (not the ephemeral merge + # commit that github.sha resolves to), which is directly fetchable by SHA. + # For workflow_run events fall back to the triggering HEAD SHA. + # For push/workflow_dispatch fall back to github.sha. + ref: ${{ github.event.pull_request.head.sha || github.event.workflow_run.head_sha || github.sha }} - name: Set up Go uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index e3682d08e..711c3d3b8 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -7,9 +7,10 @@ name: Docker Build, Publish & Test # - Enhanced PR handling with dedicated scanning # - Improved workflow orchestration with supply-chain-verify.yml # -# PHASE 1 OPTIMIZATION (February 2026): + + # PHASE 1 OPTIMIZATION (February 2026): # - PR images now pushed to GHCR registry (enables downstream workflow consumption) -# - Immutable PR tagging: pr-{number}-{short-sha} (prevents race conditions) +# - Stable PR tagging: pr-{number} (freshness gate prevents stale scans) # - Feature branch tagging: {sanitized-branch-name}-{short-sha} (enables unique testing) # - Tag sanitization per spec Section 3.2 (handles special chars, slashes, etc.) 
# - Mandatory security scanning for PR images (blocks on CRITICAL/HIGH vulnerabilities) @@ -30,7 +31,7 @@ on: types: [completed] concurrency: - group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event_name == 'workflow_run' && github.event.workflow_run.head_branch || github.head_ref || github.ref_name }} + group: ${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.ref_name }} cancel-in-progress: true permissions: @@ -46,7 +47,7 @@ env: TRIGGER_HEAD_SHA: ${{ github.event_name == 'workflow_run' && github.event.workflow_run.head_sha || github.sha }} TRIGGER_REF: ${{ github.event_name == 'workflow_run' && format('refs/heads/{0}', github.event.workflow_run.head_branch) || github.ref }} TRIGGER_HEAD_REF: ${{ github.event_name == 'workflow_run' && github.event.workflow_run.head_branch || github.head_ref }} - TRIGGER_PR_NUMBER: ${{ github.event_name == 'workflow_run' && join(github.event.workflow_run.pull_requests.*.number, '') || format('{0}', github.event.pull_request.number) }} + TRIGGER_PR_NUMBER: ${{ github.event_name == 'pull_request' && format('{0}', github.event.pull_request.number) || '' }} TRIGGER_ACTOR: ${{ github.event_name == 'workflow_run' && github.event.workflow_run.actor.login || github.actor }} TRIGGER_BASE_REF: ${{ github.event_name == 'workflow_run' && '' || github.base_ref }} @@ -67,7 +68,6 @@ jobs: outputs: skip_build: ${{ steps.skip.outputs.skip_build }} digest: ${{ steps.build-and-push.outputs.digest }} - pr_image_ref: ${{ steps.pr-image-ref.outputs.image_ref }} steps: - name: Checkout repository @@ -160,9 +160,11 @@ jobs: echo "Force building on feature branch (PR)" fi - echo "skip_build=$should_skip" >> "$GITHUB_OUTPUT" - echo "is_feature_push=$is_feature_push" >> "$GITHUB_OUTPUT" - echo "force_refresh=$force_refresh" >> "$GITHUB_OUTPUT" + { + echo "skip_build=$should_skip" + echo "is_feature_push=$is_feature_push" + echo "force_refresh=$force_refresh" + } >> "$GITHUB_OUTPUT" - name: Set up 
QEMU if: steps.skip.outputs.skip_build != 'true' @@ -249,6 +251,77 @@ jobs: fi fi + - name: Resolve security scan image reference (contract) + if: steps.skip.outputs.skip_build != 'true' + id: pr_image_ref_output + run: | + set -euo pipefail + + sanitize_branch() { + local raw_branch="$1" + local sanitized_branch + + sanitized_branch=$(echo "${raw_branch}" | tr '[:upper:]' '[:lower:]') + sanitized_branch=${sanitized_branch//[^a-z0-9-]/-} + while [[ "${sanitized_branch}" == *"--"* ]]; do + sanitized_branch=${sanitized_branch//--/-} + done + sanitized_branch=${sanitized_branch#-} # drop the single leading dash left after collapsing + sanitized_branch=${sanitized_branch%-} # drop the single trailing dash + + if [[ -z "${sanitized_branch}" ]]; then + sanitized_branch="branch" + fi + + echo "${sanitized_branch}" + } + + IMAGE_TAG="" + + if [[ "${TRIGGER_EVENT}" == "pull_request" ]]; then + PR_NUMBER="${TRIGGER_PR_NUMBER}" + + if [[ "${GITHUB_EVENT_NAME}" == "workflow_run" ]]; then + mapfile -t WORKFLOW_RUN_PR_NUMBERS < <(jq -r '.workflow_run.pull_requests[].number // empty' "$GITHUB_EVENT_PATH") + + if [[ "${#WORKFLOW_RUN_PR_NUMBERS[@]}" != "1" ]]; then + echo "❌ ERROR: Expected exactly one workflow_run pull request number, found ${#WORKFLOW_RUN_PR_NUMBERS[@]}" + exit 1 + fi + + PR_NUMBER="${WORKFLOW_RUN_PR_NUMBERS[0]}" + fi + + if [[ -z "${PR_NUMBER}" || "${PR_NUMBER}" == "null" ]]; then + echo "❌ ERROR: Missing pull request number for stable security scan tag" + exit 1 + fi + + IMAGE_TAG="pr-${PR_NUMBER}" + elif [[ "${TRIGGER_REF}" == refs/heads/nightly ]]; then + IMAGE_TAG="nightly" + elif [[ "${TRIGGER_REF}" == refs/heads/main ]]; then + IMAGE_TAG="main" + elif [[ "${TRIGGER_REF}" == refs/heads/development ]]; then + IMAGE_TAG="development" + else + BRANCH_NAME="${TRIGGER_HEAD_BRANCH:-${TRIGGER_REF#refs/heads/}}" + IMAGE_TAG="branch-$(sanitize_branch "${BRANCH_NAME}")" + fi + + IMAGE_REF="${GHCR_REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}" + + echo "image_tag=${IMAGE_TAG}" >> "$GITHUB_OUTPUT" + echo "image_ref=${IMAGE_REF}" >> 
"$GITHUB_OUTPUT" + echo "Resolved security scan image reference: ${IMAGE_REF}" + + { + echo "## PR Image Reference Diagnostics" + echo "" + echo "- Image tag: ${IMAGE_TAG}" + echo "- Emitted image_ref: ${IMAGE_REF}" + } >> "$GITHUB_STEP_SUMMARY" + - name: Generate Docker metadata id: meta uses: docker/metadata-action@80c7e94dd9b9319bd5eb7a0e0fe9291e23a2a2e9 # v6.1.0 @@ -264,10 +337,9 @@ jobs: type=raw,value=dev,enable=${{ env.TRIGGER_REF == 'refs/heads/development' }} type=raw,value=nightly,enable=${{ env.TRIGGER_REF == 'refs/heads/nightly' }} type=raw,value=beta,enable=${{ env.TRIGGER_EVENT == 'pull_request' && env.TRIGGER_BASE_REF == 'development' }} - type=raw,value=${{ steps.branch-tags.outputs.pr_feature_branch_sha_tag }},enable=${{ env.TRIGGER_EVENT == 'pull_request' && steps.branch-tags.outputs.pr_feature_branch_sha_tag != '' }} + type=raw,value=${{ steps.pr_image_ref_output.outputs.image_tag }},enable=${{ steps.pr_image_ref_output.outputs.image_tag != '' }} type=raw,value=${{ steps.branch-tags.outputs.feature_branch_tag }},enable=${{ env.TRIGGER_EVENT != 'pull_request' && startsWith(env.TRIGGER_REF, 'refs/heads/feature/') && steps.branch-tags.outputs.feature_branch_tag != '' }} type=raw,value=${{ steps.branch-tags.outputs.branch_sha_tag }},enable=${{ env.TRIGGER_EVENT != 'pull_request' && steps.branch-tags.outputs.branch_sha_tag != '' }} - type=raw,value=pr-${{ env.TRIGGER_PR_NUMBER }}-{{sha}},enable=${{ env.TRIGGER_EVENT == 'pull_request' }},prefix=,suffix= type=sha,format=short,prefix=,suffix=,enable=${{ env.TRIGGER_EVENT != 'pull_request' && (env.TRIGGER_REF == 'refs/heads/main' || env.TRIGGER_REF == 'refs/heads/development' || env.TRIGGER_REF == 'refs/heads/nightly') }} flavor: | latest=false @@ -277,21 +349,18 @@ jobs: io.charon.build.timestamp=${{ github.event.repository.updated_at }} io.charon.feature.branch=${{ steps.branch-tags.outputs.feature_branch_tag }} - - name: Resolve PR image reference + - name: Assert PR output contract if: 
steps.skip.outputs.skip_build != 'true' && env.TRIGGER_EVENT == 'pull_request' - id: pr-image-ref run: | - IMAGE_REF=$(echo "${{ steps.meta.outputs.tags }}" | head -n 1) - if [[ -z "$IMAGE_REF" ]]; then - echo "❌ ERROR: No PR image reference found in metadata tags" + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" + if [[ -z "${IMAGE_REF}" ]]; then + echo "❌ ERROR: image_ref output contract violated (empty output)" exit 1 fi - echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT" - echo "Resolved PR image reference: ${IMAGE_REF}" # Phase 1 Optimization: Build once, test many - # - For PRs: Multi-platform (amd64, arm64) + immutable tags (pr-{number}-{short-sha}) + # - For PRs: Multi-platform (amd64, arm64) + stable security-scan tag (pr-{number}) # - For feature branches: Multi-platform (amd64, arm64) + sanitized tags ({branch}-{short-sha}) - # - For main/dev: Multi-platform (amd64, arm64) for production + # - For main/dev/nightly: Multi-platform (amd64, arm64) for production # - Always push to registry (enables downstream workflow consumption) # - Retry logic handles transient registry failures (3 attempts, 10s wait) # See: docs/plans/current_spec.md Section 4.1 @@ -355,52 +424,40 @@ jobs: # This enables backward compatibility with workflows that use artifacts if [[ "${{ env.TRIGGER_EVENT }}" == "pull_request" ]]; then echo "📥 Pulling image back for artifact creation..." - FIRST_TAG=$(echo "${{ steps.meta.outputs.tags }}" | head -n1) - docker pull "${FIRST_TAG}" - echo "✅ Image pulled: ${FIRST_TAG}" + PR_IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" + if [[ -z "${PR_IMAGE_REF}" ]]; then + echo "❌ ERROR: image_ref output contract violated during pull-back" + exit 1 + fi + docker pull "${PR_IMAGE_REF}" + echo "✅ Image pulled: ${PR_IMAGE_REF}" fi - # Critical Fix: Use exact tag from metadata instead of manual reconstruction - # WHY: docker/build-push-action with load:true applies the exact tags from - # docker/metadata-action. 
Manual reconstruction can cause mismatches due to: - # - Case sensitivity variations (owner name normalization) - # - Tag format differences in Buildx internal behavior - # - Registry prefix inconsistencies - # - # SOLUTION: Extract the first tag from metadata output (which is the PR tag) - # and use it directly with docker save. This guarantees we reference the - # exact image that was loaded into the local Docker daemon. - # - # VALIDATION: Added defensive checks to fail fast with diagnostics if: - # 1. No tag found in metadata output - # 2. Image doesn't exist locally after build - # 3. Artifact creation fails + # Use the stable security scan reference so the saved artifact matches the + # image that the PR scan job will pull and validate. - name: Save Docker Image as Artifact if: success() && steps.skip.outputs.skip_build != 'true' && env.TRIGGER_EVENT == 'pull_request' run: | - # Extract the first tag from metadata action (PR tag) - IMAGE_TAG=$(echo "${{ steps.meta.outputs.tags }}" | head -n 1) + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" - if [[ -z "${IMAGE_TAG}" ]]; then - echo "❌ ERROR: No image tag found in metadata output" - echo "Metadata tags output:" - echo "${{ steps.meta.outputs.tags }}" + if [[ -z "${IMAGE_REF}" ]]; then + echo "❌ ERROR: image_ref output contract violated (empty image ref)" exit 1 fi - echo "🔍 Detected image tag: ${IMAGE_TAG}" + echo "🔍 Detected image ref: ${IMAGE_REF}" # Verify the image exists locally - if ! docker image inspect "${IMAGE_TAG}" >/dev/null 2>&1; then - echo "❌ ERROR: Image ${IMAGE_TAG} not found locally" + if ! 
docker image inspect "${IMAGE_REF}" >/dev/null 2>&1; then + echo "❌ ERROR: Image ${IMAGE_REF} not found locally" echo "📋 Available images:" docker images exit 1 fi # Save the image using the exact tag from metadata - echo "💾 Saving image: ${IMAGE_TAG}" - docker save "${IMAGE_TAG}" -o /tmp/charon-pr-image.tar + echo "💾 Saving image: ${IMAGE_REF}" + docker save "${IMAGE_REF}" -o /tmp/charon-pr-image.tar # Verify the artifact was created echo "✅ Artifact created:" @@ -424,9 +481,9 @@ jobs: # Determine the image reference based on event type if [ "${{ env.TRIGGER_EVENT }}" = "pull_request" ]; then - IMAGE_REF="${{ steps.pr-image-ref.outputs.image_ref }}" + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" if [ -z "${IMAGE_REF}" ]; then - echo "❌ ERROR: Failed to load PR image reference from pr-image-ref output" + echo "❌ ERROR: Failed to load PR image reference from image_ref output" exit 1 fi echo "Using PR image: $IMAGE_REF" @@ -451,9 +508,9 @@ jobs: # Determine the image reference based on event type if [ "${{ env.TRIGGER_EVENT }}" = "pull_request" ]; then - IMAGE_REF="${{ steps.pr-image-ref.outputs.image_ref }}" + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" if [ -z "${IMAGE_REF}" ]; then - echo "❌ ERROR: Failed to load PR image reference from pr-image-ref output" + echo "❌ ERROR: Failed to load PR image reference from image_ref output" exit 1 fi echo "Using PR image: $IMAGE_REF" @@ -518,9 +575,9 @@ jobs: # Determine the image reference based on event type if [ "${{ env.TRIGGER_EVENT }}" = "pull_request" ]; then - IMAGE_REF="${{ steps.pr-image-ref.outputs.image_ref }}" + IMAGE_REF="${{ steps.pr_image_ref_output.outputs.image_ref }}" if [ -z "${IMAGE_REF}" ]; then - echo "❌ ERROR: Failed to load PR image reference from pr-image-ref output" + echo "❌ ERROR: Failed to load PR image reference from image_ref output" exit 1 fi echo "Using PR image: $IMAGE_REF" @@ -717,12 +774,15 @@ jobs: needs: build-and-push if: 
needs.build-and-push.outputs.skip_build != 'true' && needs.build-and-push.result == 'success' && github.event_name == 'pull_request' runs-on: ubuntu-latest - timeout-minutes: 10 + timeout-minutes: 20 permissions: contents: read packages: read security-events: write steps: + - name: Checkout repository for Trivy ignore rules + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - name: Normalize image name run: | IMAGE_NAME=$(echo "${{ env.IMAGE_NAME }}" | tr '[:upper:]' '[:lower:]') @@ -731,12 +791,22 @@ jobs: - name: Load PR image reference id: pr-image run: | - IMAGE_REF="${{ needs.build-and-push.outputs.pr_image_ref }}" - if [[ -z "$IMAGE_REF" ]]; then - echo "❌ ERROR: Missing PR image reference from build-and-push outputs" + DIGEST="${{ needs.build-and-push.outputs.digest }}" + + if [[ -z "$DIGEST" ]]; then + echo "❌ ERROR: build-and-push digest output is empty; cannot proceed without immutable reference" + exit 1 + fi + + if ! [[ "$DIGEST" =~ ^sha256:[0-9a-f]{64}$ ]]; then + echo "❌ ERROR: Invalid digest format (expected sha256:<64 hex chars>): ${DIGEST}" exit 1 fi + + IMAGE_REF="ghcr.io/${IMAGE_NAME}@${DIGEST}" + echo "digest=${DIGEST}" >> "$GITHUB_OUTPUT" echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT" + echo "Using immutable digest reference: ${IMAGE_REF}" - name: Log in to GitHub Container Registry uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 @@ -745,27 +815,36 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Validate image freshness + - name: Inspect PR image metadata + id: image-metadata run: | - echo "🔍 Validating image freshness for PR #${{ env.TRIGGER_PR_NUMBER }}..." - echo "Expected SHA: ${{ env.TRIGGER_HEAD_SHA }}" + echo "🔍 Inspecting image metadata for PR #${{ env.TRIGGER_PR_NUMBER }}..." echo "Image: ${{ steps.pr-image.outputs.image_ref }}" - # Pull image to inspect + # Pull the immutable digest reference to inspect labels. 
docker pull "${{ steps.pr-image.outputs.image_ref }}" - # Extract commit SHA from image label LABEL_SHA=$(docker inspect "${{ steps.pr-image.outputs.image_ref }}" \ --format '{{index .Config.Labels "org.opencontainers.image.revision"}}') echo "Image label SHA: ${LABEL_SHA}" + echo "label_sha=${LABEL_SHA}" >> "$GITHUB_OUTPUT" + + - name: Validate image freshness + run: | + LABEL_SHA="${{ steps.image-metadata.outputs.label_sha }}" + + if [[ -z "${LABEL_SHA}" ]]; then + echo "❌ ERROR: Missing image revision label" + exit 1 + fi if [[ "${LABEL_SHA}" != "${{ env.TRIGGER_HEAD_SHA }}" ]]; then - echo "⚠️ WARNING: Image SHA mismatch!" + echo "❌ ERROR: Image SHA mismatch!" echo " Expected: ${{ env.TRIGGER_HEAD_SHA }}" echo " Got: ${LABEL_SHA}" - echo "Image may be stale. Resuming for triage (Bypassing failure)." - # exit 1 + echo "Image may be stale. Failing closed." + exit 1 fi echo "✅ Image freshness validated" @@ -775,6 +854,8 @@ jobs: with: image-ref: ${{ steps.pr-image.outputs.image_ref }} format: 'table' + scanners: 'vuln' + trivyignores: '.trivyignore' severity: 'CRITICAL,HIGH' exit-code: '0' version: 'v0.70.0' @@ -786,8 +867,12 @@ jobs: image-ref: ${{ steps.pr-image.outputs.image_ref }} format: 'sarif' output: 'trivy-pr-results.sarif' + scanners: 'vuln' + trivyignores: '.trivyignore' severity: 'CRITICAL,HIGH' - exit-code: '1' # Intended to block, but continued on error for now + # Keep scanning strict for CRITICAL/HIGH; fail is enforced explicitly + # at the end so SARIF upload and summaries still run. + exit-code: '1' version: 'v0.70.0' continue-on-error: true @@ -813,14 +898,24 @@ jobs: env: IMAGE_REF: ${{ steps.pr-image.outputs.image_ref }} run: | - CADDY_VERSION=$(docker run --rm --pull=never "${IMAGE_REF}" caddy version 2>/dev/null || echo "unknown") + CADDY_VERSION="unknown" + CADDY_VERSION_SOURCE="fallback-not-available" + + # Prefer absolute entrypoint to avoid shell/command resolution differences. 
+ if CADDY_VERSION=$(timeout 30s docker run --rm --pull=never --entrypoint /usr/bin/caddy "${IMAGE_REF}" version 2>/dev/null); then + CADDY_VERSION_SOURCE="--entrypoint /usr/bin/caddy" + elif CADDY_VERSION=$(timeout 30s docker run --rm --pull=never --entrypoint caddy "${IMAGE_REF}" version 2>/dev/null); then + CADDY_VERSION_SOURCE="--entrypoint caddy" + fi + { echo "## PR Trivy SARIF Traceability" echo "" echo "- Category: ${{ env.TRIVY_SARIF_CATEGORY }}" echo "- Image: ${IMAGE_REF}" - echo "- Digest: n/a (tag-based PR image)" + echo "- Digest: ${IMAGE_REF#*@}" echo "- Caddy version: ${CADDY_VERSION}" + echo "- Caddy version source: ${CADDY_VERSION_SOURCE}" } >> "$GITHUB_STEP_SUMMARY" - name: Create scan summary @@ -830,7 +925,127 @@ jobs: echo "## 🔒 PR Image Security Scan" echo "" echo "- **Image**: ${{ steps.pr-image.outputs.image_ref }}" + echo "- **Image Digest**: ${{ steps.pr-image.outputs.digest }}" + echo "- **Image Revision Label SHA**: ${{ steps.image-metadata.outputs.label_sha || 'missing' }}" echo "- **PR**: #${{ env.TRIGGER_PR_NUMBER }}" echo "- **Commit**: ${{ env.TRIGGER_HEAD_SHA }}" echo "- **Scan Status**: ${{ steps.trivy-scan.outcome == 'success' && '✅ No critical vulnerabilities' || '❌ Vulnerabilities detected' }}" } >> "$GITHUB_STEP_SUMMARY" + + - name: Diagnose unsuppressed PR Trivy blockers + if: always() + continue-on-error: true + run: | + SARIF_PATH="trivy-pr-results.sarif" + FALLBACK_REASON="" + FINDINGS_COUNT="0" + UNIQUE_IDS_CSV="none" + PARSER_EXIT_CODE="" + PARSER_HINT="" + + echo "Unsuppressed HIGH/CRITICAL findings (from ${SARIF_PATH}):" + + if [[ ! -f "${SARIF_PATH}" ]]; then + FALLBACK_REASON="file missing" + elif [[ ! -r "${SARIF_PATH}" ]]; then + FALLBACK_REASON="file unreadable" + elif ! jq -e '.' "${SARIF_PATH}" >/dev/null 2>&1; then + FALLBACK_REASON="invalid JSON" + elif ! 
jq -e '(.runs | type) == "array"' "${SARIF_PATH}" >/dev/null 2>&1; then + FALLBACK_REASON="unexpected schema" + else + PARSER_ERR_FILE=$(mktemp) + if IDS_JSON=$(jq -c ' + [ + .runs[]? as $run + | (($run.results // [])[]?) as $result + | select((($result.suppressions // []) | length) == 0) + | { + id: ( + $result.ruleId + // ($result.rule // {} | .id) + // ( + if ($result.ruleIndex != null and (($run.tool.driver.rules? // null) | type) == "array") then + ($run.tool.driver.rules[$result.ruleIndex].id // empty) + else + empty + end + ) + // "unknown" + ) + } + ] + ' "${SARIF_PATH}" 2>"${PARSER_ERR_FILE}"); then + FINDINGS_COUNT=$(echo "${IDS_JSON}" | jq 'length' 2>/dev/null || echo "0") + UNIQUE_IDS_CSV=$(echo "${IDS_JSON}" | jq -r '[.[].id | select(. != "" and . != null)] | unique | join(", ")' 2>/dev/null || echo "none") + if [[ -z "${UNIQUE_IDS_CSV}" ]]; then + UNIQUE_IDS_CSV="none" + fi + + if [[ "${FINDINGS_COUNT}" -gt 0 ]]; then + if ! jq -r ' + .runs[]? as $run + | (($run.results // [])[]?) as $result + | select((($result.suppressions // []) | length) == 0) + | "- \(( + $result.ruleId + // ($result.rule // {} | .id) + // ( + if ($result.ruleIndex != null and (($run.tool.driver.rules? // null) | type) == "array") then + ($run.tool.driver.rules[$result.ruleIndex].id // "unknown") + else + "unknown" + end + ) + )) | package: \(( + ($result.message.text // "") + | ((try capture("(?i)(?:Package|PkgName|Pkg|Library)\\s*[:=]\\s*`?(?<pkg>[A-Za-z0-9._+:+-]+)`?").pkg catch null) // "n/a") + ))" + ' "${SARIF_PATH}"; then + echo "- unable to render parsed findings" + fi + else + echo "- none" + fi + else + PARSER_EXIT_CODE="$?" # read jq's status first; any later command overwrites it + FALLBACK_REASON="parser command failure" 
+ PARSER_HINT=$(head -n 1 "${PARSER_ERR_FILE}" | tr -d '\r' || true) + if [[ -z "${PARSER_HINT}" ]]; then + PARSER_HINT="no parser stderr available" + fi + fi + + rm -f "${PARSER_ERR_FILE}" + fi + + { + echo "## PR Trivy Unsuppressed Blockers" + echo "" + if [[ -n "${FALLBACK_REASON}" ]]; then + echo "- Diagnostics status: fallback" + echo "- Fallback reason: ${FALLBACK_REASON}" + if [[ "${FALLBACK_REASON}" == "parser command failure" ]]; then + echo "- Parser exit code: ${PARSER_EXIT_CODE:-unknown}" + echo "- Parser hint: ${PARSER_HINT}" + fi + echo "- Count: unknown" + echo "- Blocker IDs: unknown" + else + echo "- Diagnostics status: parsed" + echo "- Count: ${FINDINGS_COUNT}" + if [[ "${FINDINGS_COUNT}" -gt 0 ]]; then + echo "- Blocker IDs: ${UNIQUE_IDS_CSV}" + else + echo "- Blocker IDs: none" + fi + fi + } >> "$GITHUB_STEP_SUMMARY" + + - name: Enforce PR Trivy security gate + if: always() + run: | + if [[ "${{ steps.trivy-scan.outcome }}" != "success" ]]; then + echo "❌ Blocking merge: PR image has CRITICAL/HIGH vulnerabilities or scan failed" + exit 1 + fi diff --git a/.gitignore b/.gitignore index 5b3674d79..aff059433 100644 --- a/.gitignore +++ b/.gitignore @@ -326,3 +326,6 @@ backend/test_out.txt backend/cf_coverage.txt backend/***_coverage.txt backend/***_cov.txt +.tmp/caddy-binary-pin-cleanup +.tmp/caddy-binary-pin-cleanup-local.tar +.tmp/*** \ No newline at end of file diff --git a/.trivyignore b/.trivyignore index fdd90a138..f5ede014e 100644 --- a/.trivyignore +++ b/.trivyignore @@ -103,3 +103,13 @@ CVE-2026-33997 # See also: .grype.yaml for full justification # exp: 2026-04-30 GHSA-pxq6-2prw-chj9 + +# CVE-2026-41889 / GHSA-j88v-2chj-qfwx: pgx/v4 panic on crafted PostgreSQL wire payload (DoS) +# Severity: LOW (CVSS 3.7) — Package: github.com/jackc/pgx/v4, embedded in CrowdSec binaries +# Fix path requires upstream migration to pgx/v5; CrowdSec currently vendors pgx/v4 in bundled components. 
+# Charon uses SQLite by default; PostgreSQL wire-protocol path is not reachable in standard deployment. +# Review by: 2026-05-25 +# See also: .grype.yaml for full justification +# exp: 2026-05-25 +CVE-2026-41889 +GHSA-j88v-2chj-qfwx diff --git a/Dockerfile b/Dockerfile index bc26ea794..1714dfd3c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,8 +26,8 @@ ARG CROWDSEC_RELEASE_SHA256=704e37121e7ac215991441cef0d8732e33fa3b1a2b2b88b53a0b ARG EXPR_LANG_VERSION=1.17.8 # renovate: datasource=go depName=golang.org/x/net ARG XNET_VERSION=0.55.0 -# renovate: datasource=go depName=github.com/smallstep/certificates -ARG SMALLSTEP_CERTIFICATES_VERSION=0.30.0 +# renovate: datasource=go depName=golang.org/x/crypto +ARG XCRYPTO_VERSION=0.52.0 # renovate: datasource=npm depName=npm ARG NPM_VERSION=11.11.1 @@ -241,7 +241,7 @@ ARG CORAZA_CADDY_VERSION ARG XCADDY_VERSION=0.4.6 ARG EXPR_LANG_VERSION ARG XNET_VERSION -ARG SMALLSTEP_CERTIFICATES_VERSION +ARG XCRYPTO_VERSION # hadolint ignore=DL3018 RUN apk add --no-cache bash git @@ -289,6 +289,7 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ go get github.com/expr-lang/expr@v${EXPR_LANG_VERSION}; \ # renovate: datasource=go depName=github.com/hslatman/ipstore go get github.com/hslatman/ipstore@v0.4.0; \ + go get golang.org/x/crypto@v${XCRYPTO_VERSION}; \ go get golang.org/x/net@v${XNET_VERSION}; \ # CVE-2026-33186: gRPC-Go auth bypass (fixed in v1.79.3) # CVE-2026-34986: go-jose/v4 transitive fix (requires grpc >= v1.80.0) @@ -316,10 +317,6 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ # remove once caddy-security ships a release built with goxmldsig >= v1.6.0. # renovate: datasource=go depName=github.com/russellhaering/goxmldsig go get github.com/russellhaering/goxmldsig@v1.6.0; \ - # CVE-2026-30836: smallstep/certificates 0.30.0-rc3 vulnerability - # Fix available at v0.30.0. 
Pin here so the Caddy binary is patched immediately; - # remove once caddy-security ships a release built with smallstep/certificates >= v0.30.0. - go get github.com/smallstep/certificates@v${SMALLSTEP_CERTIFICATES_VERSION}; \ # CVE-2026-32952: go-ntlmssp DoS via malicious NTLM challenge response # Affects /usr/bin/caddy (transitive dependency). Fix available at v0.1.1. # renovate: datasource=go depName=github.com/Azure/go-ntlmssp @@ -339,8 +336,17 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ echo "Unsupported CADDY_PATCH_SCENARIO=${CADDY_PATCH_SCENARIO}"; \ exit 1; \ fi; \ + # Final re-pin: enforce requested Caddy core version after plugin/security updates. + go get github.com/caddyserver/caddy/v2@v${CADDY_TARGET_VERSION}; \ # Clean up go.mod and ensure all dependencies are resolved go mod tidy; \ + # Hard assertion: fail if module graph resolves to a different Caddy core version. + ACTUAL_CADDY_VERSION="$(go list -m -f "{{.Version}}" github.com/caddyserver/caddy/v2)"; \ + if [ "$ACTUAL_CADDY_VERSION" != "v${CADDY_TARGET_VERSION}" ]; then \ + echo "ERROR: Resolved Caddy version ${ACTUAL_CADDY_VERSION} does not match target v${CADDY_TARGET_VERSION}"; \ + exit 1; \ + fi; \ + echo "Verified Caddy module version: ${ACTUAL_CADDY_VERSION}"; \ echo "Dependencies patched successfully"; \ # Remove any temporary binaries from initial xcaddy run rm -f /tmp/caddy-initial; \ diff --git a/SECURITY.md b/SECURITY.md index d93510438..e20a57db8 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -27,7 +27,7 @@ public disclosure. ## Known Vulnerabilities -Last reviewed: 2026-05-20 +Last reviewed: 2026-05-25 ### [HIGH] CVE-2026-31790 · OpenSSL Vulnerability in Alpine Base Image @@ -284,6 +284,45 @@ database server is untrusted. EPSS score not yet available. Monitor https://github.com/jackc/pgproto3 for a fix release. Upgrade the indirect dependency once a patched version is available. Pre-existing; not introduced by PR #1031. 
+--- + +### [LOW] CVE-2026-41889 · pgx/v4 Panic via Crafted PostgreSQL Wire Payload + +| Field | Value | +|--------------|-------| +| **ID** | CVE-2026-41889 (GHSA-j88v-2chj-qfwx) | +| **Severity** | Low · 3.7 | +| **Status** | Awaiting Upstream | + +**What** +`github.com/jackc/pgx/v4` may panic when decoding a crafted PostgreSQL wire payload, +which can cause a denial-of-service condition in affected clients. + +**Who** + +- Discovered by: Automated scan (Trivy image scan) +- Reported: 2026-05-25 +- Affects: Deployments using PostgreSQL-backed CrowdSec code paths (non-default) + +**Where** + +- Component: `github.com/jackc/pgx/v4` (transitive dependency in bundled CrowdSec components) +- Versions affected: pgx/v4 prior to upstream remediation + +**When** + +- Discovered: 2026-05-25 +- Disclosed (if public): Public +- Target fix: When upstream dependencies migrate to a patched path (pgx/v5) + +**How** +Exploitation requires a crafted PostgreSQL protocol payload delivered to an affected pgx/v4 +decode path. Charon defaults to SQLite, so standard deployments do not expose this path. + +**Planned Remediation** +Track upstream CrowdSec dependency updates and remove suppression once pgx/v4 is no longer +present in bundled components. + ## Patched Vulnerabilities ### ✅ [HIGH] CVE-2026-34040 · Docker AuthZ Plugin Bypass via Oversized Request Body diff --git a/agent/muzzle/muzzle.go b/agent/muzzle/muzzle.go index 562670578..189a4460d 100644 --- a/agent/muzzle/muzzle.go +++ b/agent/muzzle/muzzle.go @@ -20,13 +20,51 @@ const forbiddenResponse = "HTTP/1.1 403 Forbidden\r\nContent-Length: 0\r\nConnec // allowedPatterns enumerates the Docker API paths that agents may access. // Matching uses path.Match after stripping query parameters; each pattern // uses `*` to match any single path segment (never crosses a slash). +// +// Both versioned (/v*/...) and unversioned (/...) forms are listed because +// Docker clients such as Dockhand send unversioned requests (e.g. 
GET /containers/json) +// while the canonical Docker CLI sends versioned requests (e.g. GET /v1.47/containers/json). var allowedPatterns = []string{ + "/_ping", // no version prefix (Docker < 1.24 / direct health check) + "/v*/_ping", // versioned ping for Docker client health checks + + // Container listing and inspection — unversioned (RC8 fix) + versioned + "/containers/json", "/v*/containers/json", + "/containers/*/json", "/v*/containers/*/json", + "/containers/*/logs", + "/v*/containers/*/logs", + "/containers/*/stats", + "/v*/containers/*/stats", + "/containers/*/top", + "/v*/containers/*/top", + + // Daemon info — unversioned + versioned + "/info", "/v*/info", + "/images/json", "/v*/images/json", + "/version", "/v*/version", + "/events", "/v*/events", + + // Volumes — unversioned + versioned + "/volumes", + "/v*/volumes", + "/volumes/*", + "/v*/volumes/*", + + // Networks — unversioned + versioned + "/networks", + "/v*/networks", + "/networks/*", + "/v*/networks/*", + + // System disk usage — unversioned + versioned + "/system/df", + "/v*/system/df", } // Filter is an HTTP allowlist filter for Docker socket proxy streams. @@ -38,8 +76,20 @@ func New() *Filter { } // Allow returns true if method+reqPath is on the allowlist. -// Only GET is permitted; all other methods are rejected immediately. +// Only GET is permitted, except HEAD which is allowed on /_ping and /v*/_ping +// (Docker SDK connectivity check). func (f *Filter) Allow(method, reqPath string) bool { + // HEAD is permitted only for /_ping (Docker SDK connectivity check). 
+ if strings.EqualFold(method, http.MethodHead) { + cleanPath := path.Clean(reqPath) + for _, p := range []string{"/_ping", "/v*/_ping"} { + if matched, _ := path.Match(p, cleanPath); matched { + return true + } + } + return false + } + if !strings.EqualFold(method, http.MethodGet) { return false } @@ -79,6 +129,10 @@ func (f *Filter) ServeProxy(dst string, r io.Reader, w io.Writer) error { } defer conn.Close() + // Ensure Docker closes the socket after the response so ServeProxy can + // terminate cleanly instead of waiting on an idle keep-alive connection. + req.Close = true + // Forward the full request (headers + body) to the Docker socket. if err := req.Write(conn); err != nil { return fmt.Errorf("muzzle: forward request to docker: %w", err) diff --git a/agent/muzzle/muzzle_test.go b/agent/muzzle/muzzle_test.go index cd305f88c..391aba031 100644 --- a/agent/muzzle/muzzle_test.go +++ b/agent/muzzle/muzzle_test.go @@ -1,9 +1,18 @@ package muzzle_test import ( + "bufio" "bytes" + "errors" + "fmt" + "io" + "net" + "net/http" + "path/filepath" "strings" + "sync" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -11,6 +20,72 @@ import ( "github.com/Wikid82/charon/agent/muzzle" ) +var errWriterClosed = errors.New("writer closed") + +type closableWriter struct { + mu sync.Mutex + buf bytes.Buffer + closed bool + firstWrite chan struct{} + once sync.Once +} + +func newClosableWriter() *closableWriter { + return &closableWriter{firstWrite: make(chan struct{})} +} + +func (w *closableWriter) Write(p []byte) (int, error) { + w.once.Do(func() { close(w.firstWrite) }) + + w.mu.Lock() + defer w.mu.Unlock() + if w.closed { + return 0, errWriterClosed + } + return w.buf.Write(p) +} + +func (w *closableWriter) Close() { + w.mu.Lock() + defer w.mu.Unlock() + w.closed = true +} + +func (w *closableWriter) String() string { + w.mu.Lock() + defer w.mu.Unlock() + return w.buf.String() +} + +func startUnixHTTPServer(t *testing.T, handler 
func(net.Conn)) (string, func()) { + t.Helper() + + sockPath := filepath.Join(t.TempDir(), "docker.sock") + ln, err := net.Listen("unix", sockPath) + require.NoError(t, err) + + done := make(chan struct{}) + go func() { + defer close(done) + conn, acceptErr := ln.Accept() + if acceptErr != nil { + return + } + handler(conn) + }() + + cleanup := func() { + _ = ln.Close() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("unix server goroutine did not exit") + } + } + + return sockPath, cleanup +} + func TestFilter_Allow(t *testing.T) { f := muzzle.New() @@ -19,26 +94,71 @@ func TestFilter_Allow(t *testing.T) { reqPath string allowed bool }{ - // Allowed: read-only GET endpoints - {"GET", "/v1.41/containers/json", true}, - {"GET", "/v1.41/info", true}, - {"GET", "/v1.41/version", true}, - {"GET", "/v1.41/images/json", true}, - {"GET", "/v1.41/events", true}, - {"GET", "/v1.44/containers/abc123/json", true}, + // --- Allowed: health check --- + {"GET", "/_ping", true}, + {"GET", "/v1.44/_ping", true}, + {"HEAD", "/_ping", true}, + {"HEAD", "/v1.47/_ping", true}, + {"HEAD", "/containers/json", false}, // HEAD blocked for non-ping paths + {"HEAD", "/v1.47/containers/json", false}, // HEAD blocked for non-ping paths + + // --- Allowed: GET on versioned paths --- + {"GET", "/v1.47/containers/json", true}, {"GET", "/v1.24/containers/json", true}, + {"GET", "/v1.47/containers/abc123/json", true}, + {"GET", "/v1.47/containers/abc123/logs", true}, + {"GET", "/v1.47/containers/abc123/stats", true}, + {"GET", "/v1.47/containers/abc123/top", true}, + {"GET", "/v1.47/info", true}, + {"GET", "/v1.47/images/json", true}, + {"GET", "/v1.47/version", true}, + {"GET", "/v1.47/events", true}, + {"GET", "/v1.47/volumes", true}, + {"GET", "/v1.47/volumes/myvolume", true}, + {"GET", "/v1.47/networks", true}, + {"GET", "/v1.47/networks/mynet", true}, + {"GET", "/v1.47/system/df", true}, + {"GET", "/v1.41/volumes/myvol", true}, + {"GET", "/v1.41/networks/mynet", 
true}, + + // --- Allowed: GET on UNVERSIONED paths (RC8/RC9 fix) --- + {"GET", "/containers/json", true}, + {"GET", "/containers/abc123/json", true}, + {"GET", "/containers/abc123/logs", true}, + {"GET", "/containers/abc123/stats", true}, + {"GET", "/containers/abc123/top", true}, + {"GET", "/info", true}, + {"GET", "/images/json", true}, + {"GET", "/version", true}, + {"GET", "/events", true}, + {"GET", "/volumes", true}, + {"GET", "/volumes/myvolume", true}, + {"GET", "/networks", true}, + {"GET", "/networks/mynet", true}, + {"GET", "/system/df", true}, - // Blocked: mutating methods + // --- Blocked: mutating methods --- + {"POST", "/containers/create", false}, + {"DELETE", "/containers/abc123", false}, + {"PUT", "/containers/abc123/start", false}, + {"PATCH", "/v1.47/containers/abc123", false}, {"POST", "/v1.41/containers/create", false}, {"DELETE", "/v1.41/containers/abc", false}, {"PUT", "/v1.41/networks/abc", false}, {"PATCH", "/v1.41/containers/abc/update", false}, - // Blocked: paths not on allowlist + // --- Blocked: paths not on allowlist --- + {"GET", "/containers/abc123/start", false}, + {"GET", "/containers/abc123/stop", false}, + {"GET", "/exec/abc123/start", false}, + {"GET", "/build", false}, + {"GET", "/v1.47/exec/abc123", false}, {"GET", "/v1.41/exec/abc/start", false}, {"GET", "/v1.41/containers/prune", false}, - {"GET", "/v1.41/networks", false}, - {"GET", "/v1.41/volumes", false}, + + // --- Path traversal: path.Clean normalises before matching --- + {"GET", "/v1.47/../containers/json", true}, // resolves to /containers/json — allowed + {"GET", "/containers/../../etc/passwd", false}, // resolves to /etc/passwd — blocked } for _, tt := range tests { @@ -81,3 +201,166 @@ func TestFilter_ServeProxy_Blocked_PUT(t *testing.T) { require.Error(t, err) assert.Contains(t, buf.String(), "403") } + +func TestFilter_ServeProxy_Blocked_UnversionedPost(t *testing.T) { + f := muzzle.New() + + reqStr := "POST /containers/create HTTP/1.1\r\nHost: 
localhost\r\nContent-Length: 0\r\n\r\n" + var buf bytes.Buffer + + err := f.ServeProxy("/tmp/nonexistent.sock", strings.NewReader(reqStr), &buf) + require.Error(t, err) + assert.Contains(t, buf.String(), "403") +} + +func TestServeProxy_ConnectionCloseSetOnRequest(t *testing.T) { + f := muzzle.New() + + reqSeen := make(chan *http.Request, 1) + serverErr := make(chan error, 1) + + sockPath, cleanup := startUnixHTTPServer(t, func(conn net.Conn) { + defer conn.Close() + + req, err := http.ReadRequest(bufio.NewReader(conn)) + if err != nil { + serverErr <- err + return + } + reqSeen <- req + + _, err = io.WriteString(conn, "HTTP/1.1 200 OK\r\nContent-Length: 5\r\nConnection: close\r\n\r\nhello") + if err != nil { + serverErr <- err + } + }) + defer cleanup() + + var out bytes.Buffer + req := "GET /containers/json HTTP/1.1\r\nHost: localhost\r\n\r\n" + err := f.ServeProxy(sockPath, strings.NewReader(req), &out) + require.NoError(t, err) + + seen := <-reqSeen + select { + case err := <-serverErr: + require.NoError(t, err) + default: + } + assert.True(t, seen.Close) + assert.Equal(t, "close", strings.ToLower(seen.Header.Get("Connection"))) + assert.Contains(t, out.String(), "200 OK") + assert.Contains(t, out.String(), "hello") +} + +func TestServeProxy_CompletesAfterDockerResponse(t *testing.T) { + f := muzzle.New() + serverErr := make(chan error, 1) + body := `{"status":"ok"}` + + sockPath, cleanup := startUnixHTTPServer(t, func(conn net.Conn) { + defer conn.Close() + + _, err := http.ReadRequest(bufio.NewReader(conn)) + if err != nil { + serverErr <- err + return + } + + _, err = io.WriteString(conn, fmt.Sprintf("HTTP/1.1 200 OK\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s", len(body), body)) + if err != nil { + serverErr <- err + } + }) + defer cleanup() + + var out bytes.Buffer + req := "GET /containers/json HTTP/1.1\r\nHost: localhost\r\n\r\n" + + done := make(chan error, 1) + go func() { + done <- f.ServeProxy(sockPath, strings.NewReader(req), &out) + }() + 
+ select { + case err := <-done: + require.NoError(t, err) + case <-time.After(2 * time.Second): + t.Fatal("ServeProxy did not return after complete response") + } + + assert.Contains(t, out.String(), body) + select { + case err := <-serverErr: + require.NoError(t, err) + default: + } +} + +func TestServeProxy_StreamingResponseTerminatesOnWriterClose(t *testing.T) { + f := muzzle.New() + + serverWriteErr := make(chan error, 1) + serverErr := make(chan error, 1) + + sockPath, cleanup := startUnixHTTPServer(t, func(conn net.Conn) { + defer conn.Close() + + _, err := http.ReadRequest(bufio.NewReader(conn)) + if err != nil { + serverErr <- err + return + } + + _, err = io.WriteString(conn, "HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\nConnection: close\r\n\r\n") + if err != nil { + serverErr <- err + return + } + + for { + if _, writeErr := io.WriteString(conn, "6\r\nhello!\r\n"); writeErr != nil { + serverWriteErr <- writeErr + return + } + time.Sleep(10 * time.Millisecond) + } + }) + defer cleanup() + + w := newClosableWriter() + req := "GET /events HTTP/1.1\r\nHost: localhost\r\n\r\n" + + done := make(chan error, 1) + go func() { + done <- f.ServeProxy(sockPath, strings.NewReader(req), w) + }() + + select { + case <-w.firstWrite: + case <-time.After(2 * time.Second): + t.Fatal("stream did not start") + } + + w.Close() + + select { + case err := <-done: + require.Error(t, err) + case <-time.After(2 * time.Second): + t.Fatal("ServeProxy did not return after writer close") + } + + select { + case err := <-serverWriteErr: + require.Error(t, err) + case <-time.After(2 * time.Second): + t.Fatal("mock docker stream did not observe closed connection") + } + + select { + case err := <-serverErr: + require.NoError(t, err) + default: + } +} diff --git a/backend/internal/api/routes/routes.go b/backend/internal/api/routes/routes.go index 37415ea1c..ddb2de51d 100644 --- a/backend/internal/api/routes/routes.go +++ b/backend/internal/api/routes/routes.go @@ -510,6 +510,7 @@ 
func RegisterWithDeps(ctx context.Context, router *gin.Engine, db *gorm.DB, cfg dockerHandler := handlers.NewDockerHandler(dockerService, remoteServerService) if orthrusServer != nil { dockerHandler.SetOrthrusResolver(orthrusServer) + uptimeService.SetOrthrusResolver(orthrusServer) } dockerHandler.RegisterRoutes(management) diff --git a/backend/internal/orthrus/muzzle.go b/backend/internal/orthrus/muzzle.go index 227fa149a..6a4ea4b8d 100644 --- a/backend/internal/orthrus/muzzle.go +++ b/backend/internal/orthrus/muzzle.go @@ -2,6 +2,7 @@ package orthrus import ( "net/http" + "path" "regexp" "strings" @@ -24,10 +25,27 @@ var versionPrefixRe = regexp.MustCompile(`^/v\d+\.\d+`) // allowedDockerPaths is the set of Docker API paths that are safe to expose to agents. // Path matching is performed after stripping the version prefix. var allowedDockerPaths = map[string]struct{}{ + "/_ping": {}, "/containers/json": {}, "/images/json": {}, "/info": {}, "/version": {}, + "/events": {}, + "/volumes": {}, + "/networks": {}, + "/system/df": {}, +} + +// allowedDockerPatterns covers dynamic-segment paths such as +// /containers/{id}/json, /volumes/{name}, and /networks/{id}. +// Matching uses path.Match after the version prefix has been stripped. +var allowedDockerPatterns = []string{ + "/containers/*/json", + "/containers/*/logs", + "/containers/*/stats", + "/containers/*/top", + "/volumes/*", + "/networks/*", } // Muzzle is an http.Handler wrapper that restricts Docker socket access @@ -42,8 +60,22 @@ func NewMuzzle(next http.Handler) *Muzzle { } // ServeHTTP implements http.Handler. Only GET requests to allowlisted paths -// are forwarded; all others receive 403 Forbidden. +// are forwarded; HEAD is also permitted for /_ping (Docker client health checks). +// All other methods or paths receive 403 Forbidden. func (m *Muzzle) ServeHTTP(w http.ResponseWriter, r *http.Request) { + rawPath := versionPrefixRe.ReplaceAllString(r.URL.Path, "") + // Normalize away any "." or ".." 
segments before any allowlist check so that + // traversal-style paths such as /containers/../json cannot match patterns like + // /containers/*/json. path.Clean always returns a rooted result when given a + // rooted input; the explicit "/" prefix guards against an empty rawPath value. + stripped := path.Clean("/" + strings.TrimLeft(rawPath, "/")) + + // HEAD /_ping is permitted alongside GET for Docker client health checks. + if r.Method == http.MethodHead && stripped == "/_ping" { + m.next.ServeHTTP(w, r) + return + } + if r.Method != http.MethodGet { logger.Log().WithField("method", util.SanitizeForLog(r.Method)).WithField("path", sanitizePath(r.URL.Path)). Warn("orthrus: muzzle blocked non-GET Docker request") @@ -51,12 +83,20 @@ func (m *Muzzle) ServeHTTP(w http.ResponseWriter, r *http.Request) { return } - stripped := versionPrefixRe.ReplaceAllString(r.URL.Path, "") if _, ok := allowedDockerPaths[stripped]; ok { m.next.ServeHTTP(w, r) return } + // Check dynamic path patterns for container/volume/network inspection. + // stripped is already rooted and cleaned; use it directly. + for _, pat := range allowedDockerPatterns { + if matched, err := path.Match(pat, stripped); err == nil && matched { + m.next.ServeHTTP(w, r) + return + } + } + logger.Log().WithField("method", util.SanitizeForLog(r.Method)).WithField("path", sanitizePath(r.URL.Path)). 
Warn("orthrus: muzzle blocked disallowed Docker path") http.Error(w, "Forbidden", http.StatusForbidden) diff --git a/backend/internal/orthrus/muzzle_test.go b/backend/internal/orthrus/muzzle_test.go index 894d5f8d5..92ac4e83f 100644 --- a/backend/internal/orthrus/muzzle_test.go +++ b/backend/internal/orthrus/muzzle_test.go @@ -16,10 +16,15 @@ func passthroughHandler() http.Handler { func TestMuzzle_AllowlistedGET_Passthrough(t *testing.T) { allowed := []string{ + "/_ping", "/containers/json", "/images/json", "/info", "/version", + "/events", + "/volumes", + "/networks", + "/system/df", } m := NewMuzzle(passthroughHandler()) @@ -40,6 +45,11 @@ func TestMuzzle_VersionPrefixStripped_Passthrough(t *testing.T) { "/v1.40/images/json", "/v1.41/info", "/v1.42/version", + "/v1.47/_ping", + "/v1.44/events", + "/v1.44/volumes", + "/v1.44/networks", + "/v1.47/system/df", } m := NewMuzzle(passthroughHandler()) @@ -81,13 +91,60 @@ func TestMuzzle_DELETE_Blocked(t *testing.T) { assert.Equal(t, http.StatusForbidden, rr.Code) } +func TestMuzzle_HEAD_Ping_Passthrough(t *testing.T) { + m := NewMuzzle(passthroughHandler()) + + for _, path := range []string{"/_ping", "/v1.44/_ping"} { + t.Run(path, func(t *testing.T) { + req := httptest.NewRequest(http.MethodHead, path, http.NoBody) + rr := httptest.NewRecorder() + m.ServeHTTP(rr, req) + assert.Equal(t, http.StatusOK, rr.Code) + }) + } +} + +func TestMuzzle_HEAD_NonPing_Blocked(t *testing.T) { + m := NewMuzzle(passthroughHandler()) + req := httptest.NewRequest(http.MethodHead, "/containers/json", http.NoBody) + rr := httptest.NewRecorder() + m.ServeHTTP(rr, req) + assert.Equal(t, http.StatusForbidden, rr.Code) +} + +func TestMuzzle_DynamicPaths_Passthrough(t *testing.T) { + paths := []string{ + "/containers/abc123/json", + "/v1.44/containers/abc123/json", + "/containers/abc123/logs", + "/v1.47/containers/abc123/logs", + "/containers/abc123/stats", + "/v1.47/containers/abc123/stats", + "/containers/abc123/top", + 
"/v1.47/containers/abc123/top", + "/volumes/myvolume", + "/v1.44/volumes/myvolume", + "/networks/mynet", + "/v1.44/networks/mynet", + } + + m := NewMuzzle(passthroughHandler()) + + for _, p := range paths { + t.Run(p, func(t *testing.T) { + req := httptest.NewRequest(http.MethodGet, p, http.NoBody) + rr := httptest.NewRecorder() + m.ServeHTTP(rr, req) + assert.Equal(t, http.StatusOK, rr.Code) + }) + } +} + func TestMuzzle_UnknownPath_Blocked(t *testing.T) { paths := []string{ "/containers/create", "/exec/abc/start", "/containers/abc/kill", - "/networks/create", - "/_ping", } m := NewMuzzle(passthroughHandler()) diff --git a/backend/internal/orthrus/server.go b/backend/internal/orthrus/server.go index 024331c2e..d1a063c5c 100644 --- a/backend/internal/orthrus/server.go +++ b/backend/internal/orthrus/server.go @@ -91,6 +91,15 @@ func (s *OrthrusServer) HandleWebSocket(c *gin.Context) { return } + // Displace the prior session for this UUID BEFORE binding the new proxy + // listeners so the old session's extListener is closed and its port is freed + // before StartDockerProxy / StartExternalProxy attempt to bind. + if old, loaded := s.sessions.LoadAndDelete(agent.UUID); loaded { + if oldSess, ok := old.(*AgentSession); ok { + _ = oldSess.Close() + } + } + if err := session.StartDockerProxy(); err != nil { logger.Log().WithField("uuid", util.SanitizeForLog(agent.UUID)). WithError(err).Warn("orthrus: failed to start docker proxy listener") @@ -188,8 +197,13 @@ func (s *OrthrusServer) watchHeartbeat(agentUUID string, sess *AgentSession) { case <-ticker.C: if !sess.IsAlive() { _ = sess.Close() // stops runProxyListener goroutine; idempotent - s.markOffline(agentUUID) - s.sessions.Delete(agentUUID) + // Only remove from the map and mark offline when this goroutine's + // session pointer is still the current one. A stale goroutine + // holding an old pointer will find CompareAndDelete returns false + // and exits without corrupting the new session's state. 
+ if s.sessions.CompareAndDelete(agentUUID, sess) { + s.markOffline(agentUUID) + } return } } diff --git a/backend/internal/orthrus/server_coverage_test.go b/backend/internal/orthrus/server_coverage_test.go index 6135e7cf5..f940981e0 100644 --- a/backend/internal/orthrus/server_coverage_test.go +++ b/backend/internal/orthrus/server_coverage_test.go @@ -368,3 +368,58 @@ func TestOrthrusServer_HandleWebSocket_ExternalProxyFails(t *testing.T) { assert.False(t, status.Active) assert.NotEmpty(t, status.Error) } +// TestHandleWebSocket_DisplacesExistingSession covers server.go:98-100 — +// the displacement block that closes the old session when a new connection +// arrives for an agent UUID that already has an active session in the map. +func TestHandleWebSocket_DisplacesExistingSession(t *testing.T) { + db := setupServerTestDB(t) + srv, err := NewOrthrusServer(db, setupTestCA(t)) + require.NoError(t, err) + srv.heartbeatTimeout = 200 * time.Millisecond + + token := "ch_orthrus_displace01" //nolint:gosec // G101: test credential + hash, err := bcrypt.GenerateFromPassword([]byte(token), bcrypt.MinCost) + require.NoError(t, err) + + agent := &models.OrthrusAgent{ + UUID: "displace-uuid", + Name: "displace-agent", + AuthKeyHash: string(hash), + Status: models.OrthrusStatusPending, + } + require.NoError(t, db.Create(agent).Error) + + // Create an "old" session and store it in the sessions map to simulate a + // prior connection that has not yet been cleaned up. 
+ oldConn, oldCleanup := testWSPair(t) + defer oldCleanup() + oldSess, err := NewAgentSession("displace-uuid", "displace-agent", oldConn) + require.NoError(t, err) + srv.sessions.Store("displace-uuid", oldSess) + + gin.SetMode(gin.TestMode) + router := gin.New() + router.GET("/ws", srv.HandleWebSocket) + ts := httptest.NewServer(router) + t.Cleanup(srv.Stop) + t.Cleanup(ts.Close) + + wsURL := "ws" + strings.TrimPrefix(ts.URL, "http") + "/ws" + header := http.Header{"Authorization": []string{"Bearer " + token}} + conn, dialResp, err := gorillaws.DefaultDialer.Dial(wsURL, header) + require.NoError(t, err) + if dialResp != nil { + _ = dialResp.Body.Close() + } + defer func() { _ = conn.Close() }() + + // Wait for the new session to be stored, which means HandleWebSocket has run + // past the displacement block and stored the replacement session. + assert.Eventually(t, func() bool { + raw, ok := srv.GetSession("displace-uuid") + return ok && raw != oldSess + }, 2*time.Second, 20*time.Millisecond, "new session should replace old session") + + // The old session must have been closed by the displacement block (lines 98-100). 
+ assert.False(t, oldSess.IsAlive(), "old session must be closed by displacement") +} diff --git a/backend/internal/orthrus/server_test.go b/backend/internal/orthrus/server_test.go index 1898ad263..d6de28b5a 100644 --- a/backend/internal/orthrus/server_test.go +++ b/backend/internal/orthrus/server_test.go @@ -5,6 +5,7 @@ import ( "net/http/httptest" "path/filepath" "testing" + "time" "github.com/gin-gonic/gin" "golang.org/x/crypto/bcrypt" @@ -189,3 +190,118 @@ func TestOrthrusServer_HandleWebSocket_InvalidToken(t *testing.T) { assert.Equal(t, http.StatusUnauthorized, w.Code) } + +// TestWatchHeartbeat_StaleGoroutine_DoesNotEvictNewSession is a regression test +// for the session race condition: a stale watchHeartbeat goroutine (holding a +// reference to an old, dead session) must not evict or mark offline a newer +// session that has already been stored for the same agent UUID. +func TestWatchHeartbeat_StaleGoroutine_DoesNotEvictNewSession(t *testing.T) { + db := setupServerTestDB(t) + srv, err := NewOrthrusServer(db, setupTestCA(t)) + require.NoError(t, err) + // Short timeout so the ticker fires immediately in the test. + srv.heartbeatTimeout = time.Millisecond + + const agentUUID = "race-regression-uuid" + + // Insert the agent in the DB with status online so we can verify markOffline + // is not called. + agent := &models.OrthrusAgent{ + UUID: agentUUID, + Name: "race-agent", + Status: models.OrthrusStatusOnline, + } + require.NoError(t, db.Create(agent).Error) + + // sess1: already closed — represents a stale session whose watchHeartbeat + // goroutine is still running after a newer session has replaced it. + conn1, done1 := testWSPair(t) + defer done1() + sess1, err := NewAgentSession(agentUUID, "race-agent", conn1) + require.NoError(t, err) + require.NoError(t, sess1.Close()) + require.False(t, sess1.IsAlive()) + + // sess2: alive — represents the current (newer) reconnect stored in the map. 
+ conn2, done2 := testWSPair(t) + defer done2() + sess2, err := NewAgentSession(agentUUID, "race-agent", conn2) + require.NoError(t, err) + t.Cleanup(func() { _ = sess2.Close() }) + srv.sessions.Store(agentUUID, sess2) + + // Run the stale watchHeartbeat (for sess1) and wait for it to exit. + // With CompareAndDelete, it finds sess1 ≠ sess2 in the map, so it returns + // false and skips markOffline — sess2 stays in the map. + done := make(chan struct{}) + go func() { + defer close(done) + srv.watchHeartbeat(agentUUID, sess1) + }() + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("watchHeartbeat did not exit within deadline") + } + + // sess2 must still be present in the map. + raw, ok := srv.sessions.Load(agentUUID) + require.True(t, ok, "sess2 should still be in the sessions map") + assert.Same(t, sess2, raw.(*AgentSession), "sess2 pointer must be unchanged") + + // The agent must NOT have been marked offline. + var stored models.OrthrusAgent + require.NoError(t, db.Where("uuid = ?", agentUUID).First(&stored).Error) + assert.Equal(t, models.OrthrusStatusOnline, stored.Status, + "stale goroutine must not flip agent status to offline") +} + +// TestWatchHeartbeat_CurrentSession_MarksOfflineAndEvictsFromMap exercises the +// CompareAndDelete true-branch: when the session pointer in the map matches the +// goroutine's pointer, the agent is marked offline and the map entry is removed. 
+func TestWatchHeartbeat_CurrentSession_MarksOfflineAndEvictsFromMap(t *testing.T) { + db := setupServerTestDB(t) + srv, err := NewOrthrusServer(db, setupTestCA(t)) + require.NoError(t, err) + srv.heartbeatTimeout = time.Millisecond + + const agentUUID = "current-session-uuid" + agent := &models.OrthrusAgent{ + UUID: agentUUID, + Name: "current-agent", + Status: models.OrthrusStatusOnline, + } + require.NoError(t, db.Create(agent).Error) + + conn, wsCleanup := testWSPair(t) + defer wsCleanup() + + sess, err := NewAgentSession(agentUUID, "current-agent", conn) + require.NoError(t, err) + require.NoError(t, sess.Close()) + require.False(t, sess.IsAlive()) + + // Store the SAME pointer so CompareAndDelete returns true. + srv.sessions.Store(agentUUID, sess) + + doneCh := make(chan struct{}) + go func() { + defer close(doneCh) + srv.watchHeartbeat(agentUUID, sess) + }() + + select { + case <-doneCh: + case <-time.After(2 * time.Second): + t.Fatal("watchHeartbeat did not exit within deadline") + } + + _, ok := srv.sessions.Load(agentUUID) + assert.False(t, ok, "session must be evicted when CompareAndDelete succeeds") + + var stored models.OrthrusAgent + require.NoError(t, db.Where("uuid = ?", agentUUID).First(&stored).Error) + assert.Equal(t, models.OrthrusStatusOffline, stored.Status, + "agent must be marked offline when CompareAndDelete succeeds") +} diff --git a/backend/internal/orthrus/session.go b/backend/internal/orthrus/session.go index 0727dfa59..8d3eb6320 100644 --- a/backend/internal/orthrus/session.go +++ b/backend/internal/orthrus/session.go @@ -283,12 +283,20 @@ func (s *AgentSession) StartExternalProxy(port int) error { loopbackTarget := fmt.Sprintf("127.0.0.1:%d", loopbackPort) targetURL := &url.URL{Scheme: "http", Host: loopbackTarget} + + // Clone DefaultTransport to preserve its sane dial/TLS/idle-conn + // defaults, then disable keep-alives so the external proxy never holds + // open connections to the loopback target longer than a single request. 
+ baseTransport := http.DefaultTransport.(*http.Transport).Clone() + baseTransport.DisableKeepAlives = true + rp := &httputil.ReverseProxy{ Rewrite: func(pr *httputil.ProxyRequest) { pr.SetURL(targetURL) pr.Out.Host = "" }, FlushInterval: -1, + Transport: baseTransport, } srv := &http.Server{ diff --git a/backend/internal/orthrus/session_test.go b/backend/internal/orthrus/session_test.go index 0174fb6f9..1e87bb458 100644 --- a/backend/internal/orthrus/session_test.go +++ b/backend/internal/orthrus/session_test.go @@ -1,10 +1,15 @@ package orthrus import ( + "fmt" + "io" + "net" "net/http" "net/http/httptest" "strings" + "sync/atomic" "testing" + "time" "github.com/gorilla/websocket" "github.com/stretchr/testify/assert" @@ -73,3 +78,48 @@ func TestAgentSession_Close_SetsNotAlive(t *testing.T) { require.NoError(t, sess.Close()) assert.False(t, sess.IsAlive()) } + +func TestStartExternalProxy_TransportDisablesKeepAlives(t *testing.T) { + serverConn, done := testWSPair(t) + defer done() + + sess, err := NewAgentSession("keepalive-uuid", "keepalive-agent", serverConn) + require.NoError(t, err) + defer func() { _ = sess.Close() }() + + var connCount atomic.Int32 + mock := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = io.WriteString(w, `{"ok":true}`) + })) + mock.Config.ConnState = func(_ net.Conn, state http.ConnState) { + if state == http.StateNew { + connCount.Add(1) + } + } + mock.Start() + defer mock.Close() + + mockPort := mock.Listener.Addr().(*net.TCPAddr).Port + + sess.mu.Lock() + sess.proxyPort = mockPort + sess.mu.Unlock() + + extPort := findFreePort(t) + require.NoError(t, sess.StartExternalProxy(extPort)) + + assert.Eventually(t, func() bool { + return sess.GetExternalProxyStatus().Active + }, 2*time.Second, 10*time.Millisecond) + + client := &http.Client{Timeout: 2 * time.Second} + for i := 0; i < 2; i++ { + resp, reqErr := client.Get(fmt.Sprintf("http://127.0.0.1:%d/containers/json", extPort)) + 
require.NoError(t, reqErr) + require.Equal(t, http.StatusOK, resp.StatusCode) + _, _ = io.Copy(io.Discard, resp.Body) + require.NoError(t, resp.Body.Close()) + } + + assert.Equal(t, int32(2), connCount.Load()) +} diff --git a/backend/internal/services/uptime_service.go b/backend/internal/services/uptime_service.go index e0bbc2c08..02946fba4 100644 --- a/backend/internal/services/uptime_service.go +++ b/backend/internal/services/uptime_service.go @@ -8,6 +8,7 @@ import ( "net" "net/http" "net/url" + "reflect" "strconv" "strings" "sync" @@ -21,9 +22,16 @@ import ( "gorm.io/gorm" ) +// orthrusStatusChecker allows UptimeService to query Orthrus session liveness +// without a direct dependency on the orthrus package. +type orthrusStatusChecker interface { + GetProxyAddr(agentUUID string) (string, bool) +} + type UptimeService struct { DB *gorm.DB NotificationService *NotificationService + orthrusResolver orthrusStatusChecker // nil when Orthrus feature is disabled // Batching: track pending notifications pendingNotifications map[string]*pendingHostNotification notificationMutex sync.Mutex @@ -77,6 +85,22 @@ func NewUptimeService(db *gorm.DB, ns *NotificationService) *UptimeService { } } +// SetOrthrusResolver injects the Orthrus session resolver. +// Uses the typed-nil guard pattern established in DockerHandler. +func (s *UptimeService) SetOrthrusResolver(r orthrusStatusChecker) { + if r == nil { + s.orthrusResolver = nil + return + } + + rv := reflect.ValueOf(r) + if (rv.Kind() == reflect.Ptr || rv.Kind() == reflect.Interface) && rv.IsNil() { + s.orthrusResolver = nil + return + } + s.orthrusResolver = r +} + // extractPort extracts the port from a URL or host:port string func extractPort(urlStr string) string { // Try parsing as URL first @@ -270,6 +294,16 @@ func (s *UptimeService) SyncMonitors() error { // The upstream host for grouping upstreamHost := server.Host + // Orthrus-managed servers: connectivity is measured by session liveness, not TCP. 
+ if server.ConnectionType == models.ConnectionTypeOrthrus { + if server.OrthrusAgentUUID == nil || *server.OrthrusAgentUUID == "" { + continue // No agent linked — cannot create a meaningful monitor + } + targetType = "orthrus" + targetURL = *server.OrthrusAgentUUID // Agent UUID as the monitor identifier + // upstreamHost remains server.Host (Tailscale IP) — correct for grouping/display + } + switch err { case gorm.ErrRecordNotFound: // Find or create UptimeHost @@ -382,6 +416,8 @@ func (s *UptimeService) CheckAll() { tcpMonitors := make([]models.UptimeMonitor, 0, len(monitors)) nonTCPMonitors := make([]models.UptimeMonitor, 0, len(monitors)) + // "orthrus" type is not "tcp", so it falls into nonTCPMonitors and + // continues to run checkMonitor independently even when the host is down. for _, monitor := range monitors { normalizedType := strings.ToLower(strings.TrimSpace(monitor.Type)) if normalizedType == "tcp" { @@ -484,6 +520,22 @@ func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) return } + // Fast-path: if every monitor for this host is Orthrus-type, skip the + // TCP pre-check entirely — individual checkMonitor calls determine status. + hasDialable := false + for _, m := range monitors { + if strings.ToLower(m.Type) != "orthrus" { + hasDialable = true + break + } + } + if !hasDialable { + return + } + + // Track whether any non-Orthrus monitor with a valid port was attempted. + attempted := false + // Try to connect to any of the monitor ports with retry logic success := false var msg string @@ -508,6 +560,11 @@ func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) } for _, monitor := range monitors { + // Orthrus liveness is checked per-monitor via session state, not TCP pre-check. 
+ if strings.ToLower(monitor.Type) == "orthrus" { + continue + } + var port string // Use actual backend port from ProxyHost if available @@ -522,6 +579,7 @@ func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) continue } + attempted = true logger.Log().WithFields(map[string]any{ "monitor": monitor.Name, "extracted_port": extractPort(monitor.URL), @@ -554,6 +612,12 @@ func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) } } + // If every monitor for this host is Orthrus-type, there are no dialable ports. + // Skip the TCP pre-check; individual checkMonitor() calls determine status. + if !attempted { + return + } + latency := time.Since(start).Milliseconds() oldStatus := host.Status var newStatus string @@ -807,6 +871,23 @@ func (s *UptimeService) checkMonitor(monitor models.UptimeMonitor) { } else { msg = err.Error() } + case "orthrus": + agentUUID := monitor.URL + if s.orthrusResolver == nil { + msg = "Orthrus subsystem unavailable" + break + } + if agentUUID == "" { + msg = "Monitor missing agent UUID" + break + } + _, ok := s.orthrusResolver.GetProxyAddr(agentUUID) + if ok { + success = true + msg = "Orthrus session active" + } else { + msg = "Orthrus agent not connected" + } default: msg = "Unknown monitor type" } diff --git a/backend/internal/services/uptime_service_test.go b/backend/internal/services/uptime_service_test.go index e3e5c2aa5..93cb0159e 100644 --- a/backend/internal/services/uptime_service_test.go +++ b/backend/internal/services/uptime_service_test.go @@ -1,6 +1,7 @@ package services import ( + "context" "fmt" "net" "net/http" @@ -1890,3 +1891,461 @@ func TestCheckMonitor_TCP_AcceptsRFC1918Address(t *testing.T) { db.First(&result, "id = ?", monitor.ID) assert.Equal(t, "up", result.Status, "TCP monitor to loopback should report up") } + +// --- Orthrus uptime monitoring tests --- + +type mockOrthrusResolver struct { + addr string + ok bool +} + +func (m *mockOrthrusResolver) GetProxyAddr(_ 
string) (string, bool) { + return m.addr, m.ok +} + +func TestUptimeService_SetOrthrusResolver(t *testing.T) { + t.Run("normal set", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + mock := &mockOrthrusResolver{addr: "127.0.0.1:1234", ok: true} + us.SetOrthrusResolver(mock) + assert.Equal(t, mock, us.orthrusResolver) + }) + + t.Run("nil resolver clears field", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.SetOrthrusResolver(&mockOrthrusResolver{}) + us.SetOrthrusResolver(nil) + assert.Nil(t, us.orthrusResolver) + }) + + t.Run("typed nil resolver clears field", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "127.0.0.1:1234", ok: true}) + + var typedNil *mockOrthrusResolver + us.SetOrthrusResolver(typedNil) + + assert.Nil(t, us.orthrusResolver) + }) + + t.Run("resolver replacement", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + r1 := &mockOrthrusResolver{addr: "a", ok: true} + r2 := &mockOrthrusResolver{addr: "b", ok: false} + us.SetOrthrusResolver(r1) + us.SetOrthrusResolver(r2) + assert.Equal(t, r2, us.orthrusResolver) + }) +} + +func TestSyncMonitors_OrthrusRemoteServer_CreatesOrthrusMonitor(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + agentUUID := "test-agent-uuid-1234" + server := models.RemoteServer{ + UUID: "remote-orthrus-1", + Name: "Orthrus Server", + Host: "100.99.23.57", + Port: 2375, + Scheme: "http", + Enabled: true, + ConnectionType: models.ConnectionTypeOrthrus, + OrthrusAgentUUID: &agentUUID, + } + require.NoError(t, db.Create(&server).Error) + + require.NoError(t, 
us.SyncMonitors()) + + var monitor models.UptimeMonitor + require.NoError(t, db.Where("remote_server_id = ?", server.ID).First(&monitor).Error) + assert.Equal(t, "orthrus", monitor.Type) + assert.Equal(t, agentUUID, monitor.URL) + assert.Equal(t, "100.99.23.57", monitor.UpstreamHost) +} + +func TestSyncMonitors_OrthrusRemoteServer_MigratesExistingTCPMonitor(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + agentUUID := "migrate-agent-uuid" + server := models.RemoteServer{ + UUID: "remote-migrate-1", + Name: "Migrate Server", + Host: "100.99.23.58", + Port: 2375, + Scheme: "http", + Enabled: true, + ConnectionType: models.ConnectionTypeOrthrus, + OrthrusAgentUUID: &agentUUID, + } + require.NoError(t, db.Create(&server).Error) + + legacyMonitor := models.UptimeMonitor{ + ID: "legacy-tcp-monitor", + RemoteServerID: &server.ID, + Name: server.Name, + Type: "tcp", + URL: fmt.Sprintf("%s:%d", server.Host, server.Port), + UpstreamHost: server.Host, + Enabled: true, + Status: "up", + } + require.NoError(t, db.Create(&legacyMonitor).Error) + + require.NoError(t, us.SyncMonitors()) + + var updated models.UptimeMonitor + require.NoError(t, db.Where("remote_server_id = ?", server.ID).First(&updated).Error) + assert.Equal(t, "orthrus", updated.Type) + assert.Equal(t, agentUUID, updated.URL) +} + +func TestSyncMonitors_NonOrthrusRemoteServer_StillUsesHTTP(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + server := models.RemoteServer{ + UUID: "remote-direct-1", + Name: "Direct Server", + Host: "192.168.1.100", + Port: 8080, + Scheme: "http", + Enabled: true, + ConnectionType: models.ConnectionTypeDirect, + } + require.NoError(t, db.Create(&server).Error) + + require.NoError(t, us.SyncMonitors()) + + var monitor models.UptimeMonitor + require.NoError(t, db.Where("remote_server_id = ?", server.ID).First(&monitor).Error) + 
assert.Equal(t, "http", monitor.Type) + assert.Contains(t, monitor.URL, "192.168.1.100") +} + +func TestCheckHost_OrthrusOnlyHost_SkipsTCPDial(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + uptimeHost := models.UptimeHost{ + Host: "100.99.23.57", + Name: "Orthrus Only Host", + Status: "pending", + } + require.NoError(t, db.Create(&uptimeHost).Error) + + hostID := uptimeHost.ID + orthrusMonitor := models.UptimeMonitor{ + ID: "orthrus-only-1", + Name: "Orthrus Monitor", + Type: "orthrus", + URL: "some-agent-uuid", + Enabled: true, + Status: "pending", + UptimeHostID: &hostID, + UpstreamHost: "100.99.23.57", + } + require.NoError(t, db.Create(&orthrusMonitor).Error) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + us.checkHost(ctx, &uptimeHost) + + var refreshed models.UptimeHost + require.NoError(t, db.Where("id = ?", uptimeHost.ID).First(&refreshed).Error) + assert.Equal(t, "pending", refreshed.Status, "Orthrus-only host should not have TCP pre-check run") +} + +func TestCheckHost_MixedHost_OrthrusAndTCP_DialsTCPOnly(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.config.FailureThreshold = 1 + + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + port := ln.Addr().(*net.TCPAddr).Port + go func() { + conn, _ := ln.Accept() + if conn != nil { + _ = conn.Close() + } + }() + t.Cleanup(func() { _ = ln.Close() }) + + uptimeHost := models.UptimeHost{ + Host: "127.0.0.1", + Name: "Mixed Host", + Status: "pending", + } + require.NoError(t, db.Create(&uptimeHost).Error) + + hostID := uptimeHost.ID + orthrusMonitor := models.UptimeMonitor{ + ID: "mixed-orthrus-1", + Name: "Orthrus Monitor", + Type: "orthrus", + URL: "agent-uuid-xyz", + Enabled: true, + Status: "pending", + UptimeHostID: &hostID, + UpstreamHost: "127.0.0.1", + } + tcpMonitor := 
models.UptimeMonitor{ + ID: "mixed-tcp-1", + Name: "TCP Monitor", + Type: "tcp", + URL: fmt.Sprintf("127.0.0.1:%d", port), + Enabled: true, + Status: "pending", + UptimeHostID: &hostID, + UpstreamHost: "127.0.0.1", + } + require.NoError(t, db.Create(&orthrusMonitor).Error) + require.NoError(t, db.Create(&tcpMonitor).Error) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + us.checkHost(ctx, &uptimeHost) + + var refreshed models.UptimeHost + require.NoError(t, db.Where("id = ?", uptimeHost.ID).First(&refreshed).Error) + assert.Equal(t, "up", refreshed.Status, "Mixed host TCP port should succeed") +} + +func TestCheckMonitor_OrthrusType_AgentConnected_ReturnsUp(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "127.0.0.1:54321", ok: true}) + + monitor := models.UptimeMonitor{ + ID: "orthrus-up-1", + Name: "Orthrus Connected", + Type: "orthrus", + URL: "connected-agent-uuid", + Enabled: true, + Status: "pending", + MaxRetries: 1, + } + require.NoError(t, db.Create(&monitor).Error) + + us.checkMonitor(monitor) + + var refreshed models.UptimeMonitor + require.NoError(t, db.Where("id = ?", monitor.ID).First(&refreshed).Error) + assert.Equal(t, "up", refreshed.Status) + + var heartbeat models.UptimeHeartbeat + require.NoError(t, db.Where("monitor_id = ?", monitor.ID).Order("created_at desc").First(&heartbeat).Error) + assert.Equal(t, "up", heartbeat.Status) + assert.Equal(t, "Orthrus session active", heartbeat.Message) +} + +func TestCheckMonitor_OrthrusType_AgentDisconnected_ReturnsDown(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "", ok: false}) + + monitor := models.UptimeMonitor{ + ID: "orthrus-down-1", + Name: "Orthrus Disconnected", + Type: "orthrus", + URL: 
"disconnected-agent-uuid", + Enabled: true, + Status: "pending", + MaxRetries: 1, + FailureCount: 0, + } + require.NoError(t, db.Create(&monitor).Error) + + us.checkMonitor(monitor) + + var refreshed models.UptimeMonitor + require.NoError(t, db.Where("id = ?", monitor.ID).First(&refreshed).Error) + assert.Equal(t, "down", refreshed.Status) + + var heartbeat models.UptimeHeartbeat + require.NoError(t, db.Where("monitor_id = ?", monitor.ID).Order("created_at desc").First(&heartbeat).Error) + assert.Equal(t, "down", heartbeat.Status) + assert.Equal(t, "Orthrus agent not connected", heartbeat.Message) +} + +func TestCheckMonitor_OrthrusType_NilResolver_ReturnsDown(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + monitor := models.UptimeMonitor{ + ID: "orthrus-nil-resolver", + Name: "Orthrus Nil Resolver", + Type: "orthrus", + URL: "some-agent-uuid", + Enabled: true, + Status: "pending", + MaxRetries: 1, + FailureCount: 0, + } + require.NoError(t, db.Create(&monitor).Error) + + us.checkMonitor(monitor) + + var heartbeat models.UptimeHeartbeat + require.NoError(t, db.Where("monitor_id = ?", monitor.ID).Order("created_at desc").First(&heartbeat).Error) + assert.Equal(t, "Orthrus subsystem unavailable", heartbeat.Message) +} + +func TestCheckAll_OrthrusMonitor_NotShortCircuitedWhenHostDown(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.config.FailureThreshold = 1 + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "", ok: false}) + + uptimeHost := models.UptimeHost{ + Host: "100.99.23.57", + Name: "Down Orthrus Host", + Status: "down", + } + require.NoError(t, db.Create(&uptimeHost).Error) + + hostID := uptimeHost.ID + orthrusMonitor := models.UptimeMonitor{ + ID: "checkall-orthrus-1", + Name: "Orthrus Not Short-Circuited", + Type: "orthrus", + URL: "agent-uuid", + Enabled: true, + Status: "pending", + 
UptimeHostID: &hostID, + UpstreamHost: "100.99.23.57", + MaxRetries: 1, + } + require.NoError(t, db.Create(&orthrusMonitor).Error) + + us.CheckAll() + + assert.Eventually(t, func() bool { + var refreshed models.UptimeMonitor + if db.Where("id = ?", orthrusMonitor.ID).First(&refreshed).Error != nil { + return false + } + return refreshed.Status == "down" + }, 3*time.Second, 25*time.Millisecond) + + var heartbeat models.UptimeHeartbeat + err := db.Where("monitor_id = ?", orthrusMonitor.ID).Order("created_at desc").First(&heartbeat).Error + require.NoError(t, err) + assert.Equal(t, "Orthrus agent not connected", heartbeat.Message, + "Orthrus monitor should be checked via checkMonitor, not short-circuited") + assert.NotEqual(t, "Host unreachable", heartbeat.Message) +} + +// TestSyncMonitors_OrthrusRemoteServer_NilAgentUUID_SkipsMonitorCreation verifies +// that an Orthrus-type server with a nil OrthrusAgentUUID is skipped without +// creating a monitor (the continue guard inside SyncMonitors). +func TestSyncMonitors_OrthrusRemoteServer_NilAgentUUID_SkipsMonitorCreation(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + server := models.RemoteServer{ + UUID: "remote-nil-agent", + Name: "Nil Agent Server", + Host: "100.99.23.60", + Port: 2375, + Scheme: "http", + Enabled: true, + ConnectionType: models.ConnectionTypeOrthrus, + OrthrusAgentUUID: nil, + } + require.NoError(t, db.Create(&server).Error) + + require.NoError(t, us.SyncMonitors()) + + var count int64 + db.Model(&models.UptimeMonitor{}).Where("upstream_host = ?", server.Host).Count(&count) + assert.Equal(t, int64(0), count, "no monitor should be created when OrthrusAgentUUID is nil") +} + +// TestCheckMonitor_OrthrusType_EmptyURL_ReturnsDown covers the empty agent UUID +// guard inside the "orthrus" case of checkMonitor (msg = "Monitor missing agent UUID"). 
+func TestCheckMonitor_OrthrusType_EmptyURL_ReturnsDown(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + us.SetOrthrusResolver(&mockOrthrusResolver{addr: "127.0.0.1:1234", ok: true}) + + monitor := models.UptimeMonitor{ + ID: "orthrus-empty-url", + Name: "Orthrus Empty URL", + Type: "orthrus", + URL: "", + Enabled: true, + Status: "pending", + MaxRetries: 1, + } + require.NoError(t, db.Create(&monitor).Error) + + us.checkMonitor(monitor) + + var heartbeat models.UptimeHeartbeat + require.NoError(t, db.Where("monitor_id = ?", monitor.ID). + Order("created_at desc").First(&heartbeat).Error) + assert.Equal(t, "Monitor missing agent UUID", heartbeat.Message) +} + +// TestCheckHost_NonOrthrusMonitorNoPort_SkipsTCPDial covers the !attempted early +// return: hasDialable is true (there is a non-orthrus monitor) but the monitor's +// URL yields no parseable port, so attempted stays false and checkHost returns +// without updating the host status. 
+func TestCheckHost_NonOrthrusMonitorNoPort_SkipsTCPDial(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db, nil) + us := newTestUptimeService(t, db, ns) + + uptimeHost := models.UptimeHost{ + Host: "10.0.0.1", + Name: "No Port Host", + Status: "pending", + } + require.NoError(t, db.Create(&uptimeHost).Error) + + hostID := uptimeHost.ID + noPortMonitor := models.UptimeMonitor{ + ID: "no-port-monitor-1", + Name: "No Port Monitor", + Type: "http", + URL: "just-a-hostname", + Enabled: true, + Status: "pending", + UptimeHostID: &hostID, + UpstreamHost: "10.0.0.1", + MaxRetries: 1, + } + require.NoError(t, db.Create(&noPortMonitor).Error) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + us.checkHost(ctx, &uptimeHost) + + var refreshed models.UptimeHost + require.NoError(t, db.Where("id = ?", uptimeHost.ID).First(&refreshed).Error) + assert.Equal(t, "pending", refreshed.Status, + "host status must not change when no TCP dial was attempted") +} diff --git a/docs/features.md b/docs/features.md index 2bbb7e77a..d7882dd82 100644 --- a/docs/features.md +++ b/docs/features.md @@ -203,7 +203,15 @@ Supports both local Docker installations and remote Docker servers, perfect for --- -### 🔀 Hecate Tunnel & Pathway Manager +### 🌐 Orthrus — Remote Tunnel Agent + +Is your HomeLab behind a firewall? Orthrus is a small agent you install on any remote machine. It dials outward to Charon over a secure connection — no open inbound ports required. Once connected, Charon can discover and proxy Docker containers on that machine just like local ones. + +→ [Learn More](features/orthrus.md) + +--- + +### 🔀 Hecate Tunnel & Pathway Manager Connect remote servers that sit behind firewalls or NAT routers—no open inbound ports required. Choose how each remote server reaches Charon from three simple connection modes, all managed from the Remote Servers page. 
diff --git a/docs/features/hecate.md b/docs/features/hecate.md new file mode 100644 index 000000000..2e2228ce6 --- /dev/null +++ b/docs/features/hecate.md @@ -0,0 +1,106 @@ +--- +title: Hecate — Tunnel & Pathway Manager +description: Choose how each remote server connects to Charon — direct, via agent, or through a VPN +category: features +--- + +# Hecate — Tunnel & Pathway Manager + +Think of Hecate as a traffic controller standing at a crossroads. When Charon needs to reach a remote server, Hecate decides which road to take: type in the address directly, send a message through the Orthrus agent you installed, or route through your VPN. + +You interact with Hecate every time you add a Remote Server — it's the part that asks "how do you want to connect?" + +--- + +## When Do You Need Hecate? + +**Direct Mode users:** You don't need to configure Hecate at all. Just type in the IP address or hostname and you're done. + +**Agent Mode users (Orthrus):** Register an Orthrus agent first (see the [Orthrus guide](orthrus.md)). Hecate then fills in the connection address automatically — no IP address hunting required. + +**Provider Mode users:** Your server is already on a VPN (like NetBird or Tailscale). Go to **Settings → Tunnel Providers**, add your VPN credentials there first, then Hecate can find your device on that network. + +--- + +## The Three Connection Modes + +When you add a Remote Server, you choose one of these modes: + +| Mode | When To Use | What You Provide | +|---|---|---| +| **Direct** | The machine is on your local network or has a public IP | Hostname or IP address + port | +| **Agent** | You installed Orthrus on the remote machine | Just pick the agent from the list | +| **Provider** | The machine is already on a VPN, no agent needed | Pick the VPN provider + the device | + +### Direct Mode + +You know where the server is — type in its address. This is the simplest option. Use it for machines on your home network or servers with a static public IP. 
+ +### Agent Mode + +You installed the [Orthrus agent](orthrus.md) on the remote machine. Select it from the dropdown, and Charon fills in the connection details automatically from the agent's network information. You never have to know the IP address. + +### Provider Mode + +Your remote machine is already connected to a VPN service like NetBird, Tailscale, Cloudflare Tunnel, or ZeroTier. Add your VPN credentials once in **Settings → Tunnel Providers**, then pick that provider and the specific device when configuring the Remote Server. No agent installation needed. + +--- + +## Viewing Your Agents + +Go to **Remote Agents** in the sidebar to see all your Orthrus agents at a glance. Each one shows a live status badge: + +| Badge | Meaning | +|---|---| +| 🟢 Green | Connected and healthy | +| 🟡 Yellow | Connecting or experiencing delays | +| 🔴 Red | Unreachable | + +Click any agent to see its details, change its name, view install instructions, or assign it to a tunnel provider. + +--- + +## Supported Tunnel Providers + +These VPN and tunnel services work with Provider Mode: + +| Provider | What You Need | +|---|---| +| NetBird | NetBird API key | +| Tailscale | Tailscale API key | +| Cloudflare | Cloudflare Tunnel credentials | +| ZeroTier | ZeroTier network ID + node details | + +To add a provider: **Settings → Tunnel Providers → Add Provider** → choose your type → enter credentials → save. + +--- + +## Assigning a Tunnel to an Orthrus Agent + +If your agent is on a VPN, you can tell Charon exactly where on that network it lives. After you do this, Charon remembers the address — so the next time you add a Remote Server using this agent, everything fills in automatically. + +1. Go to **Remote Agents** and open the agent +2. Under **Network Assignment**, pick your Provider and the Device that represents your remote machine +3. 
Click **Save** + +--- + +## Uptime Monitoring Integration + +Remote servers managed through Orthrus agents work with [Uptime Monitoring](uptime-monitoring.md). Enable monitoring on any proxy host that routes to a remote container, and Charon will alert you if that service goes down — even if it's on a machine behind a firewall on the other side of the world. + +--- + +## Troubleshooting + +| Problem | Likely Cause | Fix | +|---|---|---| +| Provider shows an error state | Bad credentials or expired API key | Re-enter credentials in **Settings → Tunnel Providers** | +| Agent Mode address not filling in | No network assignment set on the agent | Open the agent → assign a Provider + Device → save | +| Tunnel keeps restarting | VPN provider is temporarily unreachable | This is normal — Hecate retries automatically with increasing delays | +| Device not listed in Provider Mode | Provider not yet configured | Add the provider in Settings first | + +--- + +*Need to set up an Orthrus agent first? See the [Orthrus guide](orthrus.md).* +*Ready to connect a remote Docker host? Follow the [Remote Docker Setup Guide](../guides/remote-docker-setup.md).* diff --git a/docs/features/orthrus.md b/docs/features/orthrus.md new file mode 100644 index 000000000..943aa6357 --- /dev/null +++ b/docs/features/orthrus.md @@ -0,0 +1,120 @@ +--- +title: Orthrus — Remote Tunnel Agent +description: Connect to Docker on a remote machine through a secure outbound tunnel — no open ports required +category: features +--- + +# Orthrus — Remote Tunnel Agent + +Imagine your HomeLab server is locked in a basement room with no way in from the outside. Orthrus is a small messenger you install *inside* that room. It reaches out to Charon and says "hey, I'm here — talk to me." Charon can then see what's running on that machine, even though it can never knock on the door itself. + +No port-forwarding. No firewall rules. No public IP address needed on the remote machine. 
+ +--- + +## What Problem Does Orthrus Solve? + +Most home servers sit behind a router (a NAT firewall). From the internet's point of view, the server is invisible — nobody outside can start a conversation with it. + +Charon normally needs to reach your server directly, so this is a problem. + +**Orthrus flips the conversation.** Instead of Charon trying to reach your server, your server reaches out to Charon first. Once that outbound connection is open, Charon can talk back through it — seeing your Docker containers as if they were right next door. + +--- + +## How It Works + +1. **You install the Orthrus agent** on your remote machine (one command). +2. **The agent dials outward** to your Charon instance over a secure, encrypted connection — just like your browser visits a website. +3. **Charon keeps that connection open** and uses it to ask "what containers are running?" +4. **You see those containers in Charon** and can route websites to them, just like local ones. + +**Disconnections are handled automatically** — if the network hiccups, the agent reconnects on its own with no action required from you. + +> **Note:** Orthrus is read-only. It can list containers, images, and networks — but it cannot start, stop, delete, or modify anything on your remote machine. This is by design and cannot be changed. + +--- + +## Setting Up an Orthrus Agent + +### Step 1 — Register the Agent in Charon + +1. In the Charon sidebar, click **Remote Agents** +2. Click **Add Agent** +3. Give it a friendly name (e.g. "HomeLab Server" or "NAS") +4. Click **Create** + +### Step 2 — Save the Auth Key + +> ⚠️ **Save this key now.** It starts with `ch_orthrus_` and is shown **once only**. If you lose it, delete the agent and create a new one. + +Copy the key somewhere safe — a password manager, a note, anything. Once you close this screen, Charon will never show the full key again. + +### Step 3 — Install the Agent on Your Remote Machine + +Charon gives you a ready-made install snippet. 
Pick the method that fits your setup: + +| Method | Best For | +|---|---| +| Docker Compose | Servers already running Docker | +| systemd | Bare-metal Linux servers | +| Kubernetes | K8s clusters — deploys as a DaemonSet | +| Homebrew | macOS machines | +| Tarball | Any Linux without a package manager | + +1. Click the **Install** tab on the agent page +2. Choose your preferred method +3. Copy the snippet +4. On your **remote machine**, paste and run it (replace the auth key placeholder in the snippet with the key you saved) + +### Step 4 — Watch It Go Online + +Back in Charon → **Remote Agents**, your agent should flip to **Online** within 10–30 seconds. + +That's it. You can now use this agent when [adding a Remote Server](../guides/remote-docker-setup.md). + +--- + +## Agent Status Reference + +| Status | Meaning | What To Do | +|---|---|---| +| ✅ Online | Connected and healthy | Nothing — you're good | +| ❌ Offline | Lost connection or not started | Check the agent is running on the remote machine | +| 🟡 Pending | Registered but never connected yet | Run the install snippet on the remote machine | + +--- + +## What Orthrus Can (and Cannot) Do + +Orthrus only ever lets Charon **read** information from your remote Docker. It cannot touch anything. + +**It CAN:** +- List running containers and their details +- List images, networks, and volumes +- Stream container logs (for display in Charon) +- Report Docker system info + +**It CANNOT:** +- Start, stop, restart, or delete containers +- Create or remove networks or volumes +- Pull images +- Run commands inside containers + +This restriction is enforced at every single request — there is no way to turn it off. 
+ +--- + +## Troubleshooting + +| Problem | Likely Cause | Fix | +|---|---|---| +| Agent stays **Pending** | Snippet not run yet | Run it on the remote machine | +| Agent shows **Offline** | Agent process stopped | Restart the agent service or container | +| Agent goes **Offline** after reboot | Not set to start automatically | Use the systemd snippet, or add `restart: always` to Docker Compose | +| Auth key lost | Page closed before saving | Delete the agent and create a new one — the key cannot be recovered | +| Agent connects but no containers appear | Docker socket not mounted | Add `/var/run/docker.sock:/var/run/docker.sock:ro` to the agent's volume list | + +--- + +*Ready to connect your first remote server? Follow the [Remote Docker Setup Guide](../guides/remote-docker-setup.md).* diff --git a/docs/features/uptime-monitoring.md b/docs/features/uptime-monitoring.md index 1159b02b4..d729c6862 100644 --- a/docs/features/uptime-monitoring.md +++ b/docs/features/uptime-monitoring.md @@ -518,6 +518,10 @@ Use this API to integrate Charon's uptime data with: - [Notification Configuration Guide](notifications.md) - [Proxy Host Setup](../getting-started.md) + +--- + +*Monitoring a service on a remote Docker host? See [Connecting a Remote Docker Host](../guides/remote-docker-setup.md).* - [Troubleshooting Guide](../troubleshooting/) - [Security Best Practices](../security.md) diff --git a/docs/guides/remote-docker-setup.md b/docs/guides/remote-docker-setup.md new file mode 100644 index 000000000..341951ed7 --- /dev/null +++ b/docs/guides/remote-docker-setup.md @@ -0,0 +1,157 @@ +--- +title: Connecting a Remote Docker Host +description: Step-by-step guide to managing Docker on a remote machine through Charon using the Orthrus agent +category: guides +--- + +# Connecting a Remote Docker Host + +Your HomeLab is in the basement. Charon is running somewhere else — maybe on a cloud server, maybe in your office. 
This guide connects them safely, without opening any ports on your HomeLab or touching your router. + +By the end, Charon will list your remote machine's Docker containers just like local ones, and you can route websites to them with a single click. + +--- + +## Before You Start + +Make sure that: + +- [ ] Charon is running and you can log in +- [ ] The remote machine (HomeLab) has Docker installed +- [ ] The remote machine can reach the internet (standard outbound HTTPS on port 443) +- [ ] You can run commands on the remote machine — via SSH, a keyboard, or a terminal app + +--- + +## Step 1 — Register an Agent in Charon + + + +1. In the Charon sidebar, click **Remote Agents** +2. Click **Add Agent** +3. Type a name for this machine — something like "HomeLab" or "NAS Box" +4. Click **Create** + +> ⚠️ **Copy the auth key before closing this screen.** It starts with `ch_orthrus_` and is shown **one time only**. Paste it into a note or your password manager immediately. If you lose it, you'll need to delete the agent and start over. + + + +--- + +## Step 2 — Install the Agent on Your Remote Machine + +Still in Charon, click the **Install** tab on the agent page. You'll see several ready-made snippets. **Docker Compose is the easiest if your remote machine already runs Docker.** + + + +### Using Docker Compose (recommended) + +On your **remote machine**, open a terminal and: + +1. Create a new folder for the agent: `mkdir orthrus-agent && cd orthrus-agent` +2. Copy the Docker Compose snippet from the Charon UI +3. Create a file called `docker-compose.yml` and paste the snippet into it +4. Replace the auth key placeholder in the file with the key you saved in Step 1 +5. Run: + +```bash +docker compose up -d +``` + +The agent starts in the background and immediately tries to connect to Charon. 
+ +> **Other install options:** If you prefer not to use Docker, the **Install** tab also provides a systemd unit file (for Linux servers), a Homebrew formula (for macOS), a Kubernetes DaemonSet, and a plain tarball. All work the same way — paste in your auth key and run it. + +--- + +## Step 3 — Verify the Agent Is Online + + + +Switch back to your **Charon browser tab** and look at **Remote Agents**. Your agent should show a green **Online** badge within 10–30 seconds. + +**Still showing "Pending"?** That means the agent hasn't connected yet. Double-check: +- The `docker-compose.yml` is saved correctly and `docker compose up -d` ran without errors +- The auth key in the file matches what Charon gave you (no extra spaces or missing characters) +- The remote machine has outbound internet access + +--- + +## Step 4 — (Optional) Assign a VPN Tunnel + +> **Skip this step** if your remote machine has a direct IP address that Charon can see — for example it's on the same local network, or it has a public IP. Go straight to Step 5. + +If your remote machine is behind a NAT (no public IP) **and** you also use a VPN like NetBird or Tailscale, you can tell Hecate which VPN device is your remote machine. This lets Charon resolve the address automatically. + +1. First, add your VPN provider credentials: **Settings → Tunnel Providers → Add Provider** +2. Then open your agent in **Remote Agents** +3. Under **Network Assignment**, pick your provider and the device that represents the remote machine +4. Click **Save** + +Not sure which option applies to you? The [Hecate guide](../features/hecate.md) explains each connection mode in plain English. + +--- + +## Step 5 — Add the Remote Machine as a Docker Host + + + +1. Go to **Settings → Docker** (you may also find this under **Remote Servers**) +2. Click **Add Remote Host** +3. Set **Connection Mode** to **Agent** +4. In the **Agent** dropdown, choose the agent you just registered +5. 
Click **Test Connection** + +If you see a green success message, Charon can reach your remote Docker. Click **Save**. + +If the test fails, check that the agent still shows **Online** in Remote Agents. + +--- + +## Step 6 — Use Your Remote Containers + + + +Your remote machine now appears as a Docker source anywhere Charon asks "which container?". + +1. Go to **Hosts → Add Host** +2. Click **Select from Docker** +3. In the host dropdown, select your remote machine (it'll show the name you gave it) +4. Browse the containers running there and pick one +5. Fill in your domain name and click **Save** + +Charon now proxies traffic through the Orthrus tunnel to reach that container. From the visitor's point of view, it's just a normal website with HTTPS. + +--- + +## (Optional) Add Uptime Monitoring + +Want to know immediately if a remote service goes down? Enable uptime monitoring on any proxy host that points to a remote container: + +1. Open the proxy host in **Hosts** +2. Click **Edit** +3. Scroll to **Uptime Monitoring** and toggle it on +4. Click **Save** + +Charon will check the service regularly and alert you through your configured notification channels if it becomes unreachable. + +→ Learn more: [Uptime Monitoring](../features/uptime-monitoring.md) + +--- + +## Troubleshooting + +| Problem | Likely Cause | Fix | +|---|---|---| +| Agent stays **Pending** | Install snippet not run yet | Run `docker compose up -d` on the remote machine | +| **Test Connection** fails | Agent is offline | Check the agent container is running: `docker ps` | +| No containers listed | Docker socket not mounted in agent | Add `/var/run/docker.sock:/var/run/docker.sock:ro` to the `volumes:` list in the agent's `docker-compose.yml` | +| Auth key rejected | Key copied incorrectly | Delete the agent, create a new one, copy the key carefully | +| Agent disconnects repeatedly | Network instability | Normal — the agent reconnects automatically, no action needed | + +--- + +## What's Next? 
+ +- **[Orthrus guide](../features/orthrus.md)** — More detail on the agent, install methods, and the read-only safety filter +- **[Hecate guide](../features/hecate.md)** — All three connection modes explained, plus VPN provider setup diff --git a/docs/index.md b/docs/index.md index 26071e01c..3798c00f2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -33,6 +33,14 @@ description: Charon documentation home. A modern, user-friendly reverse proxy ma --- +## 🌐 Remote Access + +**[Orthrus Tunnel Agent](features/orthrus.md)** — Connect a home server behind a firewall — no port forwarding needed +**[Hecate Tunnel & Pathway Manager](features/hecate.md)** — Choose how each remote server connects +**[Connecting a Remote Docker Host](guides/remote-docker-setup.md)** — Step-by-step guide + +--- + ## ❓ Need Help? **[💬 Ask a Question](https://github.com/Wikid82/charon/discussions)** — No question is too basic diff --git a/docs/plans/current_spec.md b/docs/plans/current_spec.md index 34c7d9189..73ed04c7c 100644 --- a/docs/plans/current_spec.md +++ b/docs/plans/current_spec.md @@ -1,682 +1,711 @@ ---- -post_title: Caddy Version Drift in GitHub Security Root-Cause Plan -categories: - - plans -tags: - - security - - trivy - - docker - - github-actions - - supply-chain -summary: Investigation and durable remediation plan to identify why GitHub Security still reports Caddy 2.11.2 after Dockerfile was updated to 2.11.3, with concrete verification commands, failure-mode matrix, and commit slicing. -post_date: 2026-05-24 ---- - -## Introduction - -### Overview - -Dockerfile now pins Caddy to 2.11.3, but GitHub Security still shows Trivy -alerts indicating 2.11.2. This plan defines how to identify the exact reporting -source (workflow, category, artifact, branch, platform, or cache path), prove -root cause with evidence, and implement durable fixes so alert state converges -with actual shipped images. 
- -### Objectives - -- Identify the precise workflow/category/artifact still emitting Caddy 2.11.2. -- Determine why that path bypassed or outlived the Dockerfile update. -- Define durable remediation over one-off closures. -- Validate remediation with repeatable commands and expected outcomes. - -### Scope - -In scope: - -- GitHub Actions workflows that build images and upload SARIF. -- Trivy/Grype/SBOM/provenance pipeline wiring. -- PR/push/nightly/weekly branch and category behavior. -- Local task/skill wiring that influences operator assumptions. -- Minimal config updates needed for long-term correctness. - -Out of scope: - -- Unrelated frontend/backend product behavior. -- Dependency policy changes not related to the stale Caddy signal. - -## Requirements - -### EARS Requirements - -- WHEN Dockerfile pins Caddy to 2.11.3, THE SYSTEM SHALL report Caddy 2.11.3 - in all image-based security scans for the same source revision. -- WHEN SARIF is uploaded from multiple workflows, THE SYSTEM SHALL use - unambiguous categories that map to active workflows only. -- IF a workflow scans a partial artifact (for example only one binary), THEN THE - SYSTEM SHALL not be treated as authoritative for full container component - status. -- WHEN builds are skipped or use stale tags/artifacts, THE SYSTEM SHALL emit an - explicit signal that scan freshness is unknown. -- WHEN multi-arch images are published, THE SYSTEM SHALL verify Caddy version - per relevant platform digest, not only by floating tag. - -## Research Findings - -### Confirmed Version Pin - -- Dockerfile sets CADDY_VERSION=2.11.3 and CADDY_CANDIDATE_VERSION=2.11.3. -- Caddy is built from source in caddy-builder and copied into runtime. 
- -### Exact Files and Workflows to Inspect - -#### Primary CI and Security Sources - -- [.github/workflows/docker-build.yml](.github/workflows/docker-build.yml) -- [.github/workflows/security-pr.yml](.github/workflows/security-pr.yml) -- [.github/workflows/security-weekly-rebuild.yml](.github/workflows/security-weekly-rebuild.yml) -- [.github/workflows/nightly-build.yml](.github/workflows/nightly-build.yml) -- [.github/workflows/supply-chain-pr.yml](.github/workflows/supply-chain-pr.yml) -- [.github/workflows/supply-chain-verify.yml](.github/workflows/supply-chain-verify.yml) - -#### Scan Configuration and Ignore Policy - -- [trivy.yaml](trivy.yaml) -- [.trivyignore](.trivyignore) -- [.grype.yaml](.grype.yaml) - -#### Task Wiring and Local Operator Paths - -- [.vscode/tasks.json](.vscode/tasks.json) -- [.github/skills/scripts/skill-runner.sh](.github/skills/scripts/skill-runner.sh) -- [.github/skills/security-scan-trivy-scripts/run.sh](.github/skills/security-scan-trivy-scripts/run.sh) -- [.github/skills/security-scan-docker-image-scripts/run.sh](.github/skills/security-scan-docker-image-scripts/run.sh) - -#### Documentation that Influences Scan Interpretation - -- [SECURITY.md](SECURITY.md) -- [docs/guides/supply-chain-security-developer-guide.md](docs/guides/supply-chain-security-developer-guide.md) -- [ARCHITECTURE.md](ARCHITECTURE.md) - -### High-Signal Observations +# Plan: Orthrus/Hecate Docs + FeedbackWidget Docs Link -- docker-build.yml uploads Trivy SARIF under multiple categories, including a - legacy compatibility alias: - .github/workflows/docker-publish.yml:build-and-push. -- security-pr.yml does filesystem scan of extracted /app/charon binary, - not full image and not /usr/bin/caddy; this path cannot be authoritative for - Caddy image component status. -- nightly-build.yml scans category trivy-nightly and uses separate branch - flow; stale findings can persist if nightly diverges from main or is not - rebuilt after merge. 
-- docker-build.yml has skip logic for chore/Renovate patterns; if build is - skipped on a critical version-bump commit, canonical scan categories may not - refresh. -- supply-chain-verify.yml has pull-request tag logic that still references - pr-{number} style in some paths while docker-build.yml emits immutable - pr-{number}-{short-sha} tags; this can cause wrong-artifact lookups in some modes. -- Local Trivy skill scans repository filesystem (trivy fs /app), which can - disagree with image scans and should not be used to infer GitHub Security - image alert closure behavior. +**Status:** Draft — Pending Review +**Date:** 2025-06 +**Scope:** Documentation authoring (ELI5 feature pages) + Frontend widget enhancement -## Probable Failure Modes - -### FM-1 Stale SARIF Category Track (Legacy Alias) - -- Source: docker-build.yml compatibility upload to - .github/workflows/docker-publish.yml:build-and-push. -- Effect: old category can keep/open alerts even after active workflow updates. - -### FM-2 Scanner Targets Wrong Artifact - -- Source: security-pr.yml scans only extracted /app/charon. -- Effect: misses Caddy binary (/usr/bin/caddy) status and gives false closure - confidence. - -### FM-3 Build Skipped, Scan Never Refreshed - -- Source: skip logic on chore/bot commits in docker-build.yml. -- Effect: post-merge categories not refreshed, old alerts remain open. - -### FM-4 Branch Mismatch - -- Source: nightly/dev/main have separate scan categories and schedules. -- Effect: Security tab still shows 2.11.2 from non-main category/ref. - -### FM-5 Stale Cached Build Output - -- Source: BuildKit cache behavior in certain workflows/stages. -- Effect: rebuilt image may still embed older Caddy artifact in one path. - -### FM-6 PR Tag and Artifact Resolution Drift - -- Source: mutable/legacy tag expectations (pr-{number}) vs immutable tags - (pr-{number}-{short-sha}). -- Effect: scanner pulls unexpected image. +--- -### FM-7 Trivy DB/Cache Timing and Feed Drift +## 1. 
Introduction -- Source: DB cache freshness differences between runs. -- Effect: inconsistent vulnerability metadata across runs. +### Overview -### FM-8 Multi-Arch Manifest Mismatch +This plan covers two independent, non-blocking deliverables: -- Source: one architecture updated while another remains old. -- Effect: alerts persist for platform-specific digest even if amd64 looks fixed. +1. **Docs Deliverable** — Write ELI5-level documentation files for two undocumented features (Orthrus and Hecate) and a remote Docker setup guide, fix a broken link, and update index/nav entries to surface these pages. +2. **Widget Deliverable** — Add a "View Documentation" third link to the floating `FeedbackWidget` React component so users can navigate to the docs site directly from anywhere in the UI. -### FM-9 Old SARIF Still Open on Default Branch +### Objectives -- Source: no subsequent successful upload for same category/ref to close prior - result set. -- Effect: stale findings remain visible. +- Close the broken `features/hecate.md` reference in `docs/features.md` line 226. +- Create `docs/features/orthrus.md` — dedicated ELI5 explainer for the Orthrus tunnel agent. +- Create `docs/features/hecate.md` — ELI5 explainer for the Hecate Tunnel & Pathway Manager. +- Create `docs/guides/remote-docker-setup.md` — step-by-step guide for connecting a remote HomeLab/server via Orthrus. +- Update `docs/index.md` to surface these three new pages. +- Update `docs/features.md` to add an Orthrus entry and fix the Hecate link. +- Add a third "View Docs" link to `FeedbackWidget.tsx` with full i18n and accessibility support. +- Update `frontend/src/components/__tests__/FeedbackWidget.test.tsx` to cover the new link. +- Update `frontend/src/locales/en/translation.json` with the new i18n keys. -## Technical Specifications +--- -### Investigation Data Model +## 2. Research Findings + +### 2.1 Docs Site Architecture + +- **Framework:** No `mkdocs.yml` was found anywhere in the repository. 
The docs site is authored as raw Markdown under `docs/` and served via GitHub Pages. +- **Base URL:** `https://wikid82.github.io/Charon/` (from `README.md` line 134). +- **Navigation:** Purely file-system-based relative links; there is no central nav config file to update. +- **Existing docs gaps:** + - `docs/features/hecate.md` — **does not exist** but is linked from `docs/features.md` line 226 — this is an active broken link (bug fix). + - `docs/features/orthrus.md` — does not exist, no link yet. + - `docs/guides/remote-docker-setup.md` — does not exist, no link yet. + +### 2.2 Orthrus System + +- **Package:** `backend/internal/orthrus/` +- **What it is:** A reverse-WebSocket tunnel agent system. An `OrthrusAgent` binary runs on the remote machine, connects outbound via WebSocket to Charon's management interface, and multiplexes streams over yamux. Charon uses these multiplexed streams to talk to Docker on the remote machine. +- **Why it exists:** Remote Docker hosts behind NAT/firewalls cannot accept inbound TCP connections. Orthrus flips the direction — the remote agent dials outward to Charon. +- **Muzzle filter (`muzzle.go`):** Restricts Docker API access to a read-only allowlist (`/containers/json`, `/images/json`, `/_ping`, `/info`, `/version`, `/events`, `/volumes`, `/networks`, `/system/df`). Dynamic read-only patterns: `/containers/*/json`, `/containers/*/logs`, `/containers/*/stats`, `/containers/*/top`. All non-GET methods blocked (except HEAD `/_ping`). HTTP 403 for disallowed paths. +- **Key model fields (`models/orthrus_agent.go`):** + - `UUID`, `Name`, `Status` — `OrthrusStatus`: "online" / "offline" / "pending" + - `AuthKeyHash` — bcrypt hash; `json:"-"` (never exposed); plain key shown once at provisioning, prefixed `ch_orthrus_` + - `Capabilities` — JSON array, e.g. 
`["docker", "tcp:5432"]` + - `AgentCertPEM` — mTLS cert from Charon's internal CA + - `HecateTunnelUUID` — links agent to a Hecate tunnel provider + - `ResolvedAddress` — cached connectivity address + - `ExternalProxyPort` — TCP port for inter-container Docker API access (0 = disabled) + - `LastHeartbeat`, `LastSeen` +- **Install surfaces (`snippets.go`):** Docker Compose, systemd, tarball, Homebrew, Kubernetes DaemonSet — delivered via `GET /management/orthrus/agents/:uuid/snippets`. +- **REST API (`orthrus_handler.go`):** + - `GET /management/orthrus/agents` — list agents + - `POST /management/orthrus/agents` — provision (returns one-time auth key) + - `GET /management/orthrus/agents/:uuid` — get one agent + - `PATCH /management/orthrus/agents/:uuid` — update + - `DELETE /management/orthrus/agents/:uuid` — delete + - `POST /management/orthrus/agents/:uuid/revoke` — revoke auth key + - `GET /management/orthrus/agents/:uuid/snippets` — install instructions + - `GET /management/orthrus/agents/:uuid/proxy-status` — live external proxy state +- **WebSocket endpoint:** `GET /api/v1/ws/orthrus/connect` — Bearer token auth (bcrypt), HeartbeatTimeout 10 seconds. +- **`RemoteServer` linkage:** `ConnectionTypeOrthrus = "orthrus"` in `models/remote_server.go`; `OrthrusAgentUUID *string` field links a host config to its agent. + +### 2.3 Hecate System + +- **Package:** `backend/internal/hecate/` +- **What it is:** The Tunnel & Pathway Manager. Manages third-party tunneling providers (Cloudflare, Tailscale, ZeroTier, NetBird) and integrates the Orthrus agent protocol. `TunnelManager` supervises the lifecycle of all active tunnel providers with escalating backoff restart (5s → 10s → 30s → 60s). +- **Currently registered provider:** `netbird` (`NewHecateService` in `services/hecate_service.go`). Architecture supports cloudflare, tailscale, zerotier via `RegisterFactory()`. +- **`HecateService`:** CRUD for `TunnelConfig` records; delegates start/stop to `TunnelManager`. 
Credentials encrypted AES-GCM before DB storage. If `IsActive=true` at creation, tunnel starts immediately. +- **Connection modes (from `docs/features.md`):** + - **Direct** — manual hostname/IP + - **Agent** — pick an Orthrus agent; address resolved from `OrthrusAgent.ResolvedAddress` + - **Provider** — pick a VPN tunnel device directly (no agent required) +- **Relationship to Orthrus:** Each Orthrus agent can be assigned a `HecateTunnelUUID` pointing to a provider tunnel, giving it a `ResolvedAddress`. Remote Servers then use `ConnectionTypeOrthrus`. + +### 2.4 FeedbackWidget + +- **File:** `frontend/src/components/FeedbackWidget.tsx` +- **Current links:** 2 — "Report a Bug" (`GITHUB_BUG_URL`) and "Request a Feature" (`GITHUB_FEATURE_URL`) +- **Structure:** `