diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 9d046c3..f3d19ed 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -91,16 +91,26 @@ jobs: if: github.event_name == 'workflow_run' run: | CONCLUSION="${{ github.event.workflow_run.conclusion }}" + BRANCH="${{ github.event.workflow_run.head_branch }}" SHA="${{ github.event.workflow_run.head_sha }}" echo "CodeQL deep scan conclusion : $CONCLUSION" echo "Scanned commit SHA : $SHA" + echo "Head branch : $BRANCH" + # Branch guard: only master commits deploy to production. + # The workflow_run trigger already filters branches: [master], but this + # explicit check makes the policy visible in the job log and provides a + # hard error if the filter is ever widened accidentally. + if [ "$BRANCH" != "master" ]; then + echo "::error::Deploy blocked — head_branch=$BRANCH (only master is allowed to deploy to production)." + exit 1 + fi if [ "$CONCLUSION" != "success" ]; then echo "::error::CodeQL deep scan did not pass (conclusion=$CONCLUSION)." echo " Deployment is blocked. Review findings before retrying:" echo " https://github.com/${{ github.repository }}/security/code-scanning" exit 1 fi - echo "✓ CodeQL gate passed — safe to deploy SHA $SHA" + echo "✓ CodeQL gate passed — safe to deploy SHA $SHA (branch=$BRANCH)" # --------------------------------------------------------------------------- # JOB: validate diff --git a/infra/nginx/api.conf b/infra/nginx/api.conf index 7ba06e3..571089d 100644 --- a/infra/nginx/api.conf +++ b/infra/nginx/api.conf @@ -112,9 +112,12 @@ server { # Enables runtime DNS resolution for Docker service names (e.g., grafana:3000). # Without this, Nginx fails at config-load with: "host not found in upstream". # Docker's embedded resolver is at 127.0.0.11:53. - # valid=10s caches DNS lookups for 10 seconds. + # valid=5s caches DNS lookups for 5 seconds — short enough that after a + # blue-green switch nginx re-resolves the new container within one health + # check cycle. ipv6=off stops AAAA queries that Docker bridge networks do + # not answer, which can add latency or cause spurious resolution failures. # ───────────────────────────────────────────────────────────────────────────── - resolver 127.0.0.11 valid=10s; + resolver 127.0.0.11 valid=5s ipv6=off; resolver_timeout 5s; # Variable-based backend URL — resolved at request time via Docker DNS (127.0.0.11). diff --git a/scripts/deploy-bluegreen.sh b/scripts/deploy-bluegreen.sh index 2e6a7c4..718fd5e 100644 --- a/scripts/deploy-bluegreen.sh +++ b/scripts/deploy-bluegreen.sh @@ -157,6 +157,42 @@ BLUE_NAME="api-blue" GREEN_NAME="api-green" APP_PORT=3000 NETWORK="api_network" +# Pinned curl container for in-network health probes. +# Running on api_network exercises Docker DNS + bridge routing — the same +# path that nginx uses — catching connectivity issues that docker exec +# localhost bypasses (docker exec goes direct to the container loopback). +_FT_CURL_IMG="curlimages/curl:8.7.1" +# In-network curl helper with local fallback. +# +# Primary: short-lived curlimages/curl container on api_network. +# Exercises Docker DNS + bridge routing (same path nginx uses). +# Fallback: docker exec curl when the curl image cannot be pulled +# or Docker Hub is unreachable. Covers cold-VPS / egress-blocked cases. +# +# Usage: _ft_net_curl +# The first argument is the container name — used ONLY for the fallback. +# Remaining arguments are passed verbatim to curl. +_ft_net_curl() { + local _target_container="$1"; shift + # Primary: in-network (Docker DNS + bridge routing) + if docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" >/dev/null 2>&1; then + return 0 + fi + # Fallback: exec into target container's loopback (skips Docker DNS but + # confirms HTTP server is alive inside the container) + docker exec "$_target_container" curl -sf --max-time 3 "$@" >/dev/null 2>&1 +} +# Variant that captures the response body or HTTP status code instead of +# just testing. Used where we need the response text for status checks. +# Usage: _ft_net_curl_out +_ft_net_curl_out() { + local _target_container="$1"; shift + local _out + _out=$(docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" 2>/dev/null) \ + || _out=$(docker exec "$_target_container" curl --max-time 3 "$@" 2>/dev/null) \ + || _out="" + printf '%s' "$_out" +} SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" @@ -431,6 +467,20 @@ docker network create --driver bridge "$NETWORK" 2>/dev/null \ && _ft_log "msg='api_network created'" \ || _ft_log "msg='api_network already exists'" +# GLOBAL PORT-LEAK GUARD -- api-blue/api-green MUST NOT bind host ports. +# All API traffic flows: Cloudflare → nginx (binds 80/443) → api_network. +# nginx is exempt; api containers with host ports bypass the nginx layer +# and would expose the API without TLS or rate-limiting. +_API_PORT_LEAKS=$(docker ps --format '{{.Names}} {{.Ports}}' 2>/dev/null \ + | grep -E '^api-(blue|green)' \ + | grep -E '(0\.0\.0\.0:|127\.0\.0\.1:)[0-9]+->') || true +if [ -n "${_API_PORT_LEAKS:-}" ]; then + _ft_log "level=ERROR msg='API container has host port bindings — forbidden. Remove and recreate without -p.' leaks=${_API_PORT_LEAKS}" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=api_port_leak_detected" +fi +unset _API_PORT_LEAKS +_ft_log "msg='port-leak guard passed — no API containers with host port bindings'" + # NGINX CONTAINER GUARD -- nginx MUST run as a Docker container on api_network. # With container-name upstreams (server api-blue:3000), Docker's embedded DNS # (127.0.0.11) is required for name resolution. This only works from WITHIN @@ -445,11 +495,17 @@ if ! docker inspect nginx >/dev/null 2>&1; then # Write a bootstrap config pointing at api-blue (default first-deploy slot) # so nginx can start without waiting for an API container. if [ ! -f "$NGINX_CONF" ]; then + # Permission check: ensure deploy user can write to nginx live dir + if [ ! -w "$(dirname "$NGINX_CONF")" ]; then + sudo chown -R "$(id -un):$(id -gn)" "$(dirname "$NGINX_CONF")" + fi + _NGINX_GUARD_TMP="$(mktemp /tmp/api-nginx-guard.XXXXXX.conf)" sed \ -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ - "$NGINX_TEMPLATE" > "$NGINX_CONF" - _ft_log "msg='bootstrap nginx config written' target=api-blue path=$NGINX_CONF" + "$NGINX_TEMPLATE" > "$_NGINX_GUARD_TMP" + mv "$_NGINX_GUARD_TMP" "$NGINX_CONF" + _ft_log "msg='bootstrap nginx config written (atomic)' target=api-blue path=$NGINX_CONF" fi # Kill any ghost docker-proxy holdind host ports before starting nginx pkill docker-proxy 2>/dev/null || true @@ -520,7 +576,13 @@ _ft_log "msg='deploy metadata' sha=$IMAGE_SHA image=$IMAGE script_dir=$SCRIPT_DI # --------------------------------------------------------------------------- _ft_state "PULL_IMAGE" "msg='pulling container image' sha=$IMAGE_SHA" -timeout 300 docker pull "$IMAGE" +# Explicit pull with hard error. +# Without this guard a missing image would cause docker run to attempt a +# background pull inside a 60-s timeout, racing the readiness loop. +if ! timeout 120 docker pull "$IMAGE"; then + _ft_log "level=ERROR msg='image pull failed' image=$IMAGE" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=image_pull_failed image=$IMAGE" +fi _ft_log "msg='image pulled' image=$IMAGE" # --------------------------------------------------------------------------- @@ -557,23 +619,30 @@ if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then _ft_log "msg='bootstrap: api-blue started' image=$IMAGE" - # Wait for /ready — same polling logic as [4/7] HEALTH_CHECK_INTERNAL + # Grace window: give the process time to bind and initialise workers. + # /ready can lag the HTTP server bind by ~1–3 s while workers start. + sleep 2 + + # Bootstrap readiness: use docker exec (only safe choice when no other + # container is guaranteed to be running yet on api_network). _BOOT_OK=false for _bi in $(seq 1 20); do - if timeout 4 curl -sf "http://api-blue:${APP_PORT}/ready" >/dev/null 2>&1; then + if docker exec api-blue curl -sf --max-time 4 "http://localhost:${APP_PORT}/ready" >/dev/null 2>&1; then _ft_log "msg='bootstrap: api-blue ready' attempt=$_bi" _BOOT_OK=true break fi _ft_log "msg='bootstrap: waiting for api-blue readiness' attempt=$_bi/20" - sleep 3 + sleep 2 done if [ "$_BOOT_OK" != "true" ]; then - _ft_log "level=ERROR msg='bootstrap: api-blue did not become ready after 60s'" + _ft_log "level=ERROR msg='bootstrap: api-blue did not become ready after 60s — container PRESERVED for debugging'" + # DO NOT remove the container on bootstrap failure: + # - Preserves logs and state for post-mortem: docker logs api-blue + # - Removing here loses all debugging visibility + # - Operator can inspect and restart manually docker logs api-blue --tail 50 >&2 || true - docker stop --time 10 api-blue 2>/dev/null || true - docker rm api-blue || true _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=bootstrap_api_ready_timeout" fi unset _bi _BOOT_OK @@ -588,20 +657,34 @@ if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then cp "$NGINX_BOOT_TMP" "$NGINX_CONF" rm -f "$NGINX_BOOT_TMP" - if docker exec nginx nginx -t 2>&1; then - docker exec nginx nginx -s reload - _ft_log "msg='bootstrap: nginx reloaded to api-blue'" - else - _ft_log "level=ERROR msg='bootstrap: nginx config test failed — leaving existing config'" + # Nginx network attachment guard — must be on api_network before reload. + _NGINX_BOOT_NET=$(docker inspect nginx \ + --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") + if ! echo "$_NGINX_BOOT_NET" | grep -q "$NETWORK"; then + _ft_log "level=ERROR msg='bootstrap: nginx not attached to api_network' networks=${_NGINX_BOOT_NET}" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch_bootstrap" fi + unset _NGINX_BOOT_NET - # Persist slot state + # Fail-fast: any nginx test/reload failure is a hard error at bootstrap. + if ! docker exec nginx nginx -t 2>&1; then + _ft_log "level=ERROR msg='bootstrap: nginx config test failed'" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed_bootstrap" + fi + docker exec nginx nginx -s reload \ + || { _ft_log "level=ERROR msg='bootstrap: nginx reload failed'"; _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed_bootstrap"; } + _ft_log "msg='bootstrap: nginx reloaded to api-blue'" + + # Persist slot state (atomic write already in _ft_write_slot) _ft_write_slot "blue" # Snapshot last-known-good - printf 'blue\n%s\n%s\n' "$IMAGE_SHA" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" > "$LAST_GOOD_FILE" + _SNAP_BOOT_TMP=$(mktemp "${SNAP_DIR}/last-good.XXXXXX") + printf 'slot=blue container=api-blue ts=%s\n' "$(date -Iseconds)" > "$_SNAP_BOOT_TMP" + mv "$_SNAP_BOOT_TMP" "$LAST_GOOD_FILE" + unset _SNAP_BOOT_TMP - _ft_exit 0 "DEPLOY_SUCCESS" "reason=bootstrap_success slot=blue image=$IMAGE" + _ft_exit 0 "BOOTSTRAP_SUCCESS" "slot=blue image=$IMAGE" fi # --------------------------------------------------------------------------- @@ -616,6 +699,28 @@ ACTIVE=$(_ft_resolve_slot) || { ACTIVE=$(printf '%s' "$ACTIVE" | tr -d '[:space:]') _ft_validate_slot "$ACTIVE" || exit 1 +# SLOT REPAIR — heal slot file drift from reality. +# If the slot file says "green" but api-green is gone (OOM/manual removal), +# flip the effective slot to whatever container IS actually running. +# This prevents a deploy from treating a missing container as the "active" one. +if [ "$ACTIVE" = "green" ] && ! docker inspect api-green >/dev/null 2>&1; then + _ft_log "msg='slot repair: green missing — switching effective slot to blue' original_slot=green" + ACTIVE="blue" + _ft_write_slot "blue" +elif [ "$ACTIVE" = "blue" ] && ! docker inspect api-blue >/dev/null 2>&1; then + # Both containers may be missing on a clean restart; this is ok — the + # BOOTSTRAP GUARD above will catch it. Here we only switch when the + # opposite slot is actually running. + if docker inspect api-green >/dev/null 2>&1; then + _ft_log "msg='slot repair: blue missing but green running — switching effective slot to green' original_slot=blue" + ACTIVE="green" + _ft_write_slot "green" + else + _ft_log "level=WARN msg='slot repair: neither container running — first deploy or crash; slot kept as blue'" + fi +fi +_ft_validate_slot "$ACTIVE" || exit 1 + if [ "$ACTIVE" = "blue" ]; then ACTIVE_NAME=$BLUE_NAME INACTIVE="green"; INACTIVE_NAME=$GREEN_NAME @@ -648,10 +753,9 @@ _ft_state "IDEMPOTENCY" "msg='checking if target SHA already deployed' sha=$IMAG _RUNNING_IMAGE=$(docker inspect --format '{{.Config.Image}}' "$ACTIVE_NAME" 2>/dev/null || echo "") if [ "$_RUNNING_IMAGE" = "$IMAGE" ]; then - # SHA matches -- only skip if the active container is also healthy. - # If it is unhealthy, proceed so the deploy restarts it cleanly. - _IDEMPOTENT_HEALTH=$(timeout 4 curl -s --max-time 3 \ - "http://$ACTIVE_NAME:$APP_PORT/ready" 2>/dev/null || echo "") + # In-network health check: exercises Docker DNS + bridge routing. + _IDEMPOTENT_HEALTH=$(_ft_net_curl_out "$ACTIVE_NAME" \ + -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/ready") if echo "$_IDEMPOTENT_HEALTH" | grep -q '"status":"ready"' 2>/dev/null; then _ft_log "msg='target SHA already running and healthy -- nothing to do' container=$ACTIVE_NAME image=$IMAGE" _ft_exit 0 "DEPLOY_SUCCESS" "reason=idempotent_noop sha=$IMAGE_SHA container=$ACTIVE_NAME" @@ -670,9 +774,13 @@ if [ "$_RUNNING_IMAGE" = "$IMAGE" ]; then _ft_state "START_INACTIVE" "msg='starting inactive container' name=$INACTIVE_NAME" if docker ps -a --format '{{.Names}}' | grep -Eq "^${INACTIVE_NAME}$"; then - _ft_log "msg='removing stale container' name=$INACTIVE_NAME" + _ft_log "msg='renaming stale container for audit trail' name=$INACTIVE_NAME" docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" + # Rename instead of hard-rm so a post-mortem can inspect the old container + # state. The -old- suffix lets the zombie purge below collect it. + _STALE_TS=$(date +%s) + docker rename "$INACTIVE_NAME" "${INACTIVE_NAME}-old-${_STALE_TS}" 2>/dev/null \ + || docker rm "$INACTIVE_NAME" fi timeout 60 docker run -d \ @@ -708,18 +816,24 @@ _ft_state "HEALTH_CHECK_INTERNAL" "msg='waiting for container readiness'" sleep 5 HEALTH_ENDPOINT="/ready" -# CONNECTIVITY PRE-CHECK -- confirm port is reachable before entering retry loop. -# Fail fast rather than burning all MAX_HEALTH_ATTEMPTS on a misconfigured port. +# CONNECTIVITY PRE-CHECK (in-network) +# Probe /health via a short-lived curl container on api_network to verify: +# - Docker DNS resolution of $INACTIVE_NAME +# - Bridge routing to the container +# - HTTP server is bound and responding +# This exercises the same network path nginx uses, catching issues that +# docker exec localhost would silently skip. _CONN_ATTEMPTS=0 _CONN_OK=false while [ "$_CONN_ATTEMPTS" -lt 5 ]; do _CONN_ATTEMPTS=$((_CONN_ATTEMPTS + 1)) - if timeout 3 curl -sf "http://$INACTIVE_NAME:$APP_PORT/health" >/dev/null 2>&1; then + if _ft_net_curl "$INACTIVE_NAME" \ + -sf --max-time 3 "http://$INACTIVE_NAME:$APP_PORT/health"; then _CONN_OK=true break fi _ft_log "msg='connectivity pre-check waiting' attempt=$_CONN_ATTEMPTS/5 container=$INACTIVE_NAME" - sleep $((RANDOM % 3 + 1)) + sleep 2 done if [ "$_CONN_OK" = "false" ]; then _ft_log "level=ERROR msg='container not reachable after connectivity pre-check' container=$INACTIVE_NAME" @@ -735,7 +849,8 @@ _ft_log "msg='connectivity pre-check passed' container=$INACTIVE_NAME" ATTEMPT=0 until true; do ATTEMPT=$((ATTEMPT + 1)) - STATUS=$(timeout 5 curl --max-time 4 -s -o /dev/null -w "%{http_code}" \ + STATUS=$(_ft_net_curl_out "$INACTIVE_NAME" \ + --max-time 4 -s -o /dev/null -w "%{http_code}" \ "http://$INACTIVE_NAME:$APP_PORT${HEALTH_ENDPOINT}" || echo "000") if [ "$STATUS" = "200" ]; then @@ -782,9 +897,11 @@ mkdir -p "$NGINX_BACKUP_DIR" NGINX_BACKUP="$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" NGINX_TMP="$(mktemp /tmp/api-nginx.XXXXXX.conf)" -# PRE-RELOAD GATE: confirm container is still ready before pointing nginx at it -if ! timeout 4 curl -sf "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1; then - _ft_log "level=ERROR msg='pre-reload gate failed: container not ready, aborting nginx reload' container=$INACTIVE_NAME" +# PRE-RELOAD GATE (in-network with fallback): confirm container is still ready +# before pointing nginx at it. +if ! _ft_net_curl "$INACTIVE_NAME" \ + -sf --max-time 4 "http://$INACTIVE_NAME:$APP_PORT/ready"; then + _ft_log "level=ERROR msg='pre-reload gate failed: container not ready' container=$INACTIVE_NAME" docker logs "$INACTIVE_NAME" --tail 50 >&2 || true docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true docker rm "$INACTIVE_NAME" || true @@ -803,13 +920,25 @@ rm -f "$NGINX_TMP" # Prune old backups (keep last 5) to avoid unbounded growth ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | tail -n +6 | xargs rm -f 2>/dev/null || true +# Nginx network attachment guard: verify nginx is on api_network before every +# reload. If nginx was accidentally disconnected, Docker DNS resolution of +# api-blue/api-green will silently fail inside nginx. +_NGINX_RELOAD_NET=$(docker inspect nginx \ + --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") +if ! echo "$_NGINX_RELOAD_NET" | grep -q "$NETWORK"; then + _ft_log "level=ERROR msg='nginx not attached to api_network at reload time' networks=${_NGINX_RELOAD_NET}" + cp "$NGINX_BACKUP" "$NGINX_CONF" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch" +fi +unset _NGINX_RELOAD_NET + if ! docker exec nginx nginx -t 2>&1; then _ft_log "level=ERROR msg='nginx config test failed -- restoring backup'" cp "$NGINX_BACKUP" "$NGINX_CONF" _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed" fi - -docker exec nginx nginx -s reload +docker exec nginx nginx -s reload \ + || { cp "$NGINX_BACKUP" "$NGINX_CONF"; _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed"; } _ft_log "msg='nginx reloaded' upstream=$INACTIVE_NAME:$APP_PORT" # Upstream sanity check -- confirm nginx config actually points at the new container. @@ -832,12 +961,24 @@ _ft_write_slot "$INACTIVE" # Small settle window to stabilize TLS/keep-alive/edge cases sleep 2 -# POST-SWITCH ROUTING VERIFICATION -# Tests: nginx HTTPS stack is responding + can reach the new container. -# Uses 127.0.0.1 to avoid Cloudflare IP allowlist (same pattern as public check). -_ft_log "msg='post-switch nginx routing verification'" -if ! _ft_retry_curl "https://127.0.0.1/health" 5 --insecure; then - _ft_log "level=ERROR msg='post-switch nginx routing verification failed -- nginx cannot reach new container'" +# POST-SWITCH ROUTING VERIFICATION (in-network) +# Run a short-lived curl container on api_network to probe nginx/health. +# This exercises: Docker DNS resolution of 'nginx', bridge routing nginx→container, +# nginx upstream substitution, and proxy-pass to $INACTIVE_NAME:$APP_PORT. +# Same network path that real client traffic takes after the slot switch. +_ft_log "msg='post-switch nginx routing verification (in-network)'" +_POST_SWITCH_OK=false +for _ps in 1 2 3 4 5; do + if _ft_net_curl "nginx" \ + -sf --max-time 5 "http://nginx/health"; then + _POST_SWITCH_OK=true + break + fi + _ft_log "msg='post-switch in-network check waiting' attempt=$_ps/5" + sleep 2 +done +if [ "$_POST_SWITCH_OK" != "true" ]; then + _ft_log "level=ERROR msg='post-switch routing verification failed — nginx cannot reach new container'" _ft_snapshot cp "$NGINX_BACKUP" "$NGINX_CONF" if docker exec nginx nginx -t 2>&1 && docker exec nginx nginx -s reload; then @@ -850,6 +991,7 @@ if ! _ft_retry_curl "https://127.0.0.1/health" 5 --insecure; then docker rm "$INACTIVE_NAME" || true _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_routing_failed container=$INACTIVE_NAME" fi +unset _POST_SWITCH_OK _ps _ft_log "msg='post-switch routing verification passed'" # --------------------------------------------------------------------------- @@ -915,8 +1057,8 @@ if [ "$_PUB_PASSED" != "true" ]; then unset _PUB_PASSED _attempt _PUB_STATUS _PUB_BODY _NGINX_CONTAINER if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then - _ACTIVE_HEALTH=$(timeout 4 curl -s --max-time 3 \ - "http://$ACTIVE_NAME:$APP_PORT/ready" 2>/dev/null || echo "") + _ACTIVE_HEALTH=$(_ft_net_curl_out "$ACTIVE_NAME" \ + -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/ready") if echo "$_ACTIVE_HEALTH" | grep -q '"status":"ready"' 2>/dev/null; then _ft_log "msg='deploy failed but active container healthy -- skipping rollback' container=$ACTIVE_NAME" unset _ACTIVE_HEALTH @@ -975,8 +1117,8 @@ if [ "$_STABLE" = "false" ]; then docker rm "$INACTIVE_NAME" || true if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then - _ACTIVE_HEALTH=$(timeout 4 curl -s --max-time 3 \ - "http://$ACTIVE_NAME:$APP_PORT/ready" 2>/dev/null || echo "") + _ACTIVE_HEALTH=$(_ft_net_curl_out "$ACTIVE_NAME" \ + -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/ready") if echo "$_ACTIVE_HEALTH" | grep -q '"status":"ready"' 2>/dev/null; then _ft_log "msg='active container healthy after stability failure -- skipping rollback' container=$ACTIVE_NAME" unset _ACTIVE_HEALTH @@ -1017,8 +1159,13 @@ fi # Graceful shutdown: allow in-flight requests to drain before forcing removal. if [ "${SKIP_CLEANUP:-false}" != "true" ]; then docker stop --time 10 "$ACTIVE_NAME" 2>/dev/null || true - docker rm "$ACTIVE_NAME" || true - _ft_log "msg='previous container removed (graceful)' name=$ACTIVE_NAME" + # Rename instead of hard-rm: keeps the previous-active container available + # for 60 s of post-mortem inspection. The -old- suffix is used by + # the zombie purge block below. + _CLEANUP_TS=$(date +%s) + docker rename "$ACTIVE_NAME" "${ACTIVE_NAME}-old-${_CLEANUP_TS}" 2>/dev/null \ + || docker rm "$ACTIVE_NAME" || true + _ft_log "msg='previous container renamed (graceful)' name=$ACTIVE_NAME rename=${ACTIVE_NAME}-old-${_CLEANUP_TS}" else _ft_log "msg='cleanup skipped (first deploy scenario or container already removed)'" fi @@ -1066,8 +1213,9 @@ fi if command -v curl >/dev/null 2>&1; then sleep 2 - # Check internal endpoint (container DNS) - _INT_READY=$(curl -s -m 5 "http://$INACTIVE_NAME:$APP_PORT/ready" 2>/dev/null || echo "") + # Check internal endpoint via in-network curl with fallback. + _INT_READY=$(_ft_net_curl_out "$INACTIVE_NAME" \ + -s --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready") _INT_READY_OK=false if echo "$_INT_READY" | grep -q '"status":"ready"' 2>/dev/null; then _INT_READY_OK=true @@ -1172,4 +1320,14 @@ else _ft_log "msg='monitoring stack restarted'" fi +# --------------------------------------------------------------------------- +# ZOMBIE PURGE: remove any api-(blue|green)-old- containers that have +# accumulated from previous deploys. Runs unconditionally so the Docker engine +# does not fill up with stopped containers across multiple deployments. +# --------------------------------------------------------------------------- +_ft_log "msg='running zombie purge'" +docker ps -a --format '{{.Names}}' \ + | grep -E '^api-(blue|green)-old-[0-9]+$' \ + | xargs -r docker rm -f 2>/dev/null || true + _ft_exit 0 "DEPLOY_SUCCESS" "sha=$IMAGE_SHA container=$INACTIVE_NAME slot=$INACTIVE" diff --git a/scripts/vps-readiness-check.sh b/scripts/vps-readiness-check.sh index cbf3750..1e316f2 100644 --- a/scripts/vps-readiness-check.sh +++ b/scripts/vps-readiness-check.sh @@ -87,7 +87,20 @@ if ! docker network ls --format '{{.Name}}' | grep -Eq "^${NETWORK}$"; then else ok "Network '$NETWORK' exists." fi - +# ── AUTO-FIX: Kill ghost docker-proxy processes that may hold stale ports ────── +# +# docker-proxy processes can linger after container removal and hold ports +# 80/443 as ghosts. They are safe to kill (Docker recreates them as needed). +echo "" +echo "--- AUTO-FIX: ghost docker-proxy cleanup ---" +if pgrep -x docker-proxy >/dev/null 2>&1; then + warn "Ghost docker-proxy processes detected — killing stale port holders." + sudo pkill -x docker-proxy 2>/dev/null || true + sleep 1 + ok "Ghost docker-proxy processes cleared." +else + ok "No ghost docker-proxy processes." +fi # ── CHECK 4: Ports 80 and 443 — no non-docker processes ────────────────────── # # Design: we do NOT auto-kill unknown processes. If port 80 or 443 is held by @@ -134,24 +147,29 @@ _check_port() { _check_port 80 _check_port 443 -# ── CHECK 5: No host port bindings on ANY container ──────────────────────────── +# ── CHECK 5: No host port bindings on API containers ──────────────────────── +# +# Production architecture invariant: api-blue and api-green MUST NOT bind host +# ports. All inter-service communication uses Docker DNS on api_network. +# +# nginx is EXEMPT: it intentionally binds 0.0.0.0:80 and 0.0.0.0:443 to receive +# external traffic from Cloudflare. That is the intended, correct behaviour. # -# Production architecture invariant: NO container may bind host ports. -# All inter-service communication uses Docker DNS on api_network. -# A host port binding on any container indicates a misconfigured container -# that could expose services unintentionally or break Docker DNS routing. +# Only api-blue and api-green are checked. Exposing these containers on host ports +# would bypass the nginx layer and could expose the API without TLS or rate-limiting. echo "" -echo "--- CHECK 5: Global host port binding invariant ---" +echo "--- CHECK 5: API container host port binding invariant ---" BOUND=$(docker ps --format '{{.Names}} {{.Ports}}' 2>/dev/null \ + | grep -E '^api-(blue|green) ' \ | grep -E '(0\.0\.0\.0:|127\.0\.0\.1:)[0-9]+->' || true) if [ -n "$BOUND" ]; then - record_failure "Host port bindings detected — violates production architecture:" + record_failure "API container has host port bindings — violates production architecture:" echo "$BOUND" | sed 's/^/ /' - echo " Production pattern: all containers run --network api_network without -p." + echo " Production pattern: api-blue/api-green run --network api_network without -p." echo " Remove and recreate the offending container(s) without port bindings." else - ok "No host port bindings on any running container." + ok "No host port bindings on API containers (api-blue/api-green)." fi # ── CHECK 6: Required env files ────────────────────────────────────────────────