From 4f6f9dbeef3ae1c8715e175679bbaad7d2b260a0 Mon Sep 17 00:00:00 2001 From: bloodearnest Date: Tue, 17 Feb 2026 14:58:52 +0000 Subject: [PATCH] Add a /robots.txt to all sites This will hopefully reduce the amount of spurious log noise we get from crawlers. Have added it to the disabled changelogs proxy config too, as it was the worst offender, and we may need it again. --- ...logs.opensafely.org.conf.template.disabled | 5 +++++ ci-tests.sh | 19 ++++++++++++++++++- ghcr.io.conf.template | 5 +++++ github.com.conf.template | 5 +++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/changelogs.opensafely.org.conf.template.disabled b/changelogs.opensafely.org.conf.template.disabled index 8a79eba..756920f 100644 --- a/changelogs.opensafely.org.conf.template.disabled +++ b/changelogs.opensafely.org.conf.template.disabled @@ -7,6 +7,11 @@ server { root /var/www/html; listen ${PORT}; + location = /robots.txt { + add_header 'Content-Type' 'text/plain; charset=UTF-8' always; + return 200 "User-agent: *\nDisallow: /\n"; + } + location / { limit_except GET { deny all; } proxy_pass https://changelogs.ubuntu.com; diff --git a/ci-tests.sh b/ci-tests.sh index 8ad6971..41c5a83 100755 --- a/ci-tests.sh +++ b/ci-tests.sh @@ -118,6 +118,12 @@ assert-header() { ### github-proxy.opensafely.org ### +# test robots is disallowed +try github-proxy.opensafely.org/robots.txt 200 +assert-in-body 'User-agent: *' +assert-in-body 'Disallow: /' +assert-header 'Content-Type: text/plain; charset=UTF-8' + # test we can query the clone metadata endpoint try github-proxy.opensafely.org/opensafely/documentation/info/refs?service=git-upload-pack 200 assert-header 'X-GitHub-Request-Id:' @@ -160,6 +166,12 @@ assert-in-body ed25519 ### docker-proxy.opensafely.org ### +# test robots is disallowed +try docker-proxy.opensafely.org/robots.txt 200 +assert-in-body 'User-agent: *' +assert-in-body 'Disallow: /' +assert-header 'Content-Type: text/plain; charset=UTF-8' + # test the initial docker request 
is rewritten correctly try docker-proxy.opensafely.org/v2/ 401 assert-in-body '{"errors":[{"code":"UNAUTHORIZED","message":"authentication required"}]}' @@ -185,9 +197,14 @@ digest=$(jq -r .config.digest < "$body") try "docker-proxy.opensafely.org/v2/opensafely-core/busybox/blobs/$digest?" 200 "$token" ### changelogs.opensafely.org ### - # This allows us to use the do-release-upgrade tool to perform major backend OS upgrades. # Disabled as we don't typically needed unless we are using do-release-upgrade + #try changelogs.opensafely.org/meta-release-lts 200 +# test robots is disallowed +# try changelogs.opensafely.org/robots.txt 200 +# assert-in-body 'User-agent: *' +# assert-in-body 'Disallow: /' +# assert-header 'Content-Type: text/plain; charset=UTF-8' exit $return_code diff --git a/ghcr.io.conf.template b/ghcr.io.conf.template index eb1ca1d..2d50bbe 100644 --- a/ghcr.io.conf.template +++ b/ghcr.io.conf.template @@ -22,6 +22,11 @@ server { root /var/www/html; listen ${PORT}; + location = /robots.txt { + add_header 'Content-Type' 'text/plain; charset=UTF-8' always; + return 200 "User-agent: *\nDisallow: /\n"; + } + # no buffering proxy_buffering off; proxy_request_buffering off; diff --git a/github.com.conf.template b/github.com.conf.template index 94a3add..6cde127 100644 --- a/github.com.conf.template +++ b/github.com.conf.template @@ -8,6 +8,11 @@ server { root /var/www/html/; listen ${PORT}; + location = /robots.txt { + add_header 'Content-Type' 'text/plain; charset=UTF-8' always; + return 200 "User-agent: *\nDisallow: /\n"; + } + # We `git fetch` commits from a persistant bare repo, which over time can # mean sending a lot of local state up to github as part of fetching. So # this needs to be larger than you'd think