Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 36 additions & 9 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ jobs:
# JSON well-formedness + shape check). ``--verify`` re-checks restored
# caches so a tampered cache cannot enter the matrix.
wpt-corpus:
name: Fetch WPT corpus
name: Fetch WPT + polyfill corpora
needs: [plan]
if: needs.plan.outputs.run-tests == 'true'
runs-on: ubuntu-latest
Expand All @@ -151,24 +151,36 @@ jobs:
with:
persist-credentials: false
- name: Cache WPT corpus (key bumps when fetch script / pinned SHA changes)
id: cache
id: wpt-cache
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: reference/wpt
key: wpt-corpus-${{ hashFiles('scripts/fetch_wpt_corpus.sh') }}
- name: Fetch + verify WPT corpus (cache miss)
if: steps.cache.outputs.cache-hit != 'true'
if: steps.wpt-cache.outputs.cache-hit != 'true'
run: scripts/fetch_wpt_corpus.sh
- name: Re-verify restored cache (defense in depth)
if: steps.cache.outputs.cache-hit == 'true'
- name: Re-verify WPT cache (defense in depth)
if: steps.wpt-cache.outputs.cache-hit == 'true'
run: scripts/fetch_wpt_corpus.sh --verify
- name: Upload corpus for matrix jobs
- name: Cache polyfill corpus (key bumps when fetch script / pinned SHA changes)
id: polyfill-cache
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: reference/polyfill
key: polyfill-corpus-${{ hashFiles('scripts/fetch_polyfill_corpus.sh') }}
- name: Fetch + verify polyfill corpus (cache miss)
if: steps.polyfill-cache.outputs.cache-hit != 'true'
run: scripts/fetch_polyfill_corpus.sh
- name: Re-verify polyfill cache (defense in depth)
if: steps.polyfill-cache.outputs.cache-hit == 'true'
run: scripts/fetch_polyfill_corpus.sh --verify
- name: Upload corpora for matrix jobs
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: wpt-corpus
# Sparse-checkout leaves a populated ``.git/`` (HEAD ref, refs/,
# packed objects, occasional symlinks) — used by the SHA-verify
# step above but unused by matrix consumers, and a known source
# steps above but unused by matrix consumers, and a known source
# of digest-mismatch errors under ``download-artifact@v8`` on
# Windows runners (small files + special git entries trip the
# chunked archive digest check, which v8 defaults to ``error``).
Expand All @@ -177,8 +189,11 @@ jobs:
# cross-OS download deterministic.
path: |
reference/wpt
reference/polyfill
!reference/wpt/.git
!reference/wpt/.git/**
!reference/polyfill/.git
!reference/polyfill/.git/**
retention-days: 1
if-no-files-found: error

Expand Down Expand Up @@ -274,7 +289,13 @@ jobs:
- uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: wpt-corpus
path: reference/wpt
# The upload-artifact step strips the common ancestor when given
# multiple paths under one parent (``reference/wpt`` and
# ``reference/polyfill`` → archive root holds ``wpt/...`` and
# ``polyfill/...``). Extract under ``reference/`` so the test
# harness finds the corpora at ``reference/wpt/...`` and
# ``reference/polyfill/...`` where conftest.py looks for them.
path: reference
- name: Set up uv
uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
with:
Expand Down Expand Up @@ -369,7 +390,13 @@ jobs:
- uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: wpt-corpus
path: reference/wpt
# The upload-artifact step strips the common ancestor when given
# multiple paths under one parent (``reference/wpt`` and
# ``reference/polyfill`` → archive root holds ``wpt/...`` and
# ``polyfill/...``). Extract under ``reference/`` so the test
# harness finds the corpora at ``reference/wpt/...`` and
# ``reference/polyfill/...`` where conftest.py looks for them.
path: reference
- name: Set up uv
uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
with:
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,10 @@ addopts = ["--tb=short", "--strict-markers", "--strict-config", "--import-mode=i
xfail_strict = true
filterwarnings = ["once::Warning"]
testpaths = ["tests"]
markers = [
"wpt: marks tests parametrized from the upstream WPT urlpattern corpus",
"polyfill: marks tests parametrized from the WICG urlpattern-polyfill corpus",
]

# ---------------------------------------------------------------------------
# Coverage
Expand Down
118 changes: 118 additions & 0 deletions scripts/fetch_polyfill_corpus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env bash
# scripts/fetch_polyfill_corpus.sh — CI-targeted polyfill corpus fetcher.
#
# Populates ``reference/polyfill/`` with the
# [WICG urlpattern-polyfill](https://github.com/kenchris/urlpattern-polyfill)
# test fixtures the ``tests/test_polyfill*.py`` suites consume.
#
# The polyfill is the reference JavaScript implementation of the WHATWG
# URLPattern Standard. Running its own test corpus against yarlpattern
# is a second cross-implementation conformance vector beyond the
# upstream WPT corpus that ``scripts/fetch_wpt_corpus.sh`` fetches.
#
# Security posture mirrors ``scripts/fetch_wpt_corpus.sh`` point for point:
# pinned SHA, HTTPS-only sparse-checkout, post-fetch SHA verification,
# per-file size cap, JSON well-formedness + shape check, ``--verify``
# mode for re-checking restored caches.

# Strict mode: abort on any failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Files and directories created below are not group- or world-writable.
umask 022

# Replace the inherited PATH with a fixed list of system directories so that
# command lookup does not depend on the caller's environment.
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
# Tell git never to prompt on the terminal (e.g. for credentials).
export GIT_TERMINAL_PROMPT=0

# ┌─────────────────────────────────────────────────────────────────────┐
# │ Pinned upstream commit │
# └─────────────────────────────────────────────────────────────────────┘
# Bump in lockstep with the ``POLYFILL_REF`` in
# ``scripts/fetch_references.sh`` (the dev-side fetcher). Same convention
# as the WPT pin.
# Must be the full commit SHA: ``verify_corpus`` compares it verbatim
# against the output of ``git rev-parse HEAD``.
POLYFILL_REF="f147a0f42a94a29ec1dcd229b218f3a700377f91"  # 2025-05-07

# ┌─────────────────────────────────────────────────────────────────────┐
# │ Size cap on each parsed JSON fixture │
# └─────────────────────────────────────────────────────────────────────┘
# At the pinned SHA the largest fixture is ~85 KB. 10 MiB gives plenty
# of headroom without exposing a parser-DoS surface to a malicious
# upstream.
MAX_JSON_BYTES=$((10 * 1024 * 1024))

# Repository root, resolved as the parent of the directory holding this script.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Checkout destination; EXPECTED_JSON entries are resolved relative to it.
POLY_DIR="$REPO_ROOT/reference/polyfill"

# Fixtures that must exist and pass ``verify_json``, relative to $POLY_DIR.
EXPECTED_JSON=(
    "test/urlpatterntestdata.json"
    "test/urlpattern-compare-test-data.json"
)

# Report an unrecoverable error and stop the script.
#
# All arguments are joined into a single message, written to stderr with a
# "FATAL: " prefix; the shell then exits with status 1.
fatal() {
    local message="$*"
    printf 'FATAL: %s\n' "$message" >&2
    exit 1
}

# Validate one JSON fixture inside the polyfill checkout.
#
# Arguments:
#   $1 - fixture path relative to $POLY_DIR
#
# Reads the globals POLY_DIR and MAX_JSON_BYTES. Aborts through fatal() when
# the fixture is missing, empty, larger than MAX_JSON_BYTES, or rejected by
# the embedded Python check (the document must be a list whose entries are
# all objects or strings). On success prints one "ok" line carrying the
# relative path and the byte count.
verify_json() {
    local rel="$1"
    local fixture="$POLY_DIR/$rel"
    local nbytes

    if [[ ! -f "$fixture" ]]; then
        fatal "missing JSON fixture: $rel"
    fi

    # Size is checked before parsing so an oversized file never reaches the
    # JSON parser. Whitespace is stripped from the ``wc`` output before the
    # value is tested as a number.
    nbytes="$(wc -c < "$fixture" | tr -d '[:space:]')"

    if ! [[ "$nbytes" =~ ^[0-9]+$ ]]; then
        fatal "could not stat size of $rel"
    fi
    if ! (( nbytes > 0 )); then
        fatal "$rel is empty"
    fi
    if ! (( nbytes <= MAX_JSON_BYTES )); then
        fatal "$rel is $nbytes bytes, exceeds cap of $MAX_JSON_BYTES"
    fi

    # The heredoc delimiter is quoted, so the shell passes the Python source
    # through untouched; the fixture path travels as argv[1]. The Python
    # bodies must stay indented under their ``if`` statements or the
    # interpreter rejects the program before any fixture is inspected.
    python3 - "$fixture" <<'PY' || fatal "JSON validation failed: $rel"
import json
import sys
from pathlib import Path

path = Path(sys.argv[1])
data = json.loads(path.read_text(encoding="utf-8"))
if not isinstance(data, list):
    raise SystemExit(f"{path.name}: top-level is {type(data).__name__}, expected list")
non_dict = [i for i, e in enumerate(data) if not isinstance(e, (dict, str))]
if non_dict:
    raise SystemExit(f"{path.name}: entries at {non_dict[:5]} are not objects/strings")
PY

    printf ' ok %-60s %s bytes\n' "$rel" "$nbytes"
}

# Check the whole polyfill checkout at $POLY_DIR.
#
# Runs verify_json on every entry of EXPECTED_JSON, then confirms that the
# checkout's HEAD equals POLYFILL_REF. Any failure aborts through fatal();
# on success a confirmation line naming the pinned SHA is printed.
verify_corpus() {
    # Both variables are local so that neither the loop variable nor the
    # resolved ref leaks into the script's global scope.
    local fixture actual_ref

    printf 'Verifying polyfill corpus at %s\n' "$POLY_DIR"
    for fixture in "${EXPECTED_JSON[@]}"; do
        verify_json "$fixture"
    done

    # If rev-parse fails, a placeholder is substituted so the comparison
    # below fails with a readable message instead of tripping ``set -e``.
    actual_ref="$(git -C "$POLY_DIR" rev-parse HEAD 2>/dev/null || echo "<not a git checkout>")"
    [[ "$actual_ref" == "$POLYFILL_REF" ]] \
        || fatal "POLYFILL_REF mismatch — expected $POLYFILL_REF, got $actual_ref"
    printf 'Polyfill corpus integrity OK (pinned at %s)\n' "$POLYFILL_REF"
}

# ``--verify`` mode: validate an existing checkout in place and exit without
# cloning or fetching. Any other first argument (or none) falls through to
# the fetch path below.
case "${1:-}" in
    --verify)
        if [[ ! -d "$POLY_DIR/.git" ]]; then
            fatal "$POLY_DIR is not a git checkout (run without --verify first)"
        fi
        verify_corpus
        exit 0
        ;;
esac

mkdir -p "$POLY_DIR"

# Work out whether an existing checkout already sits at the pinned commit.
# A failing rev-parse yields an empty string, which never matches the pin.
at_pin=false
if [[ -d "$POLY_DIR/.git" ]]; then
    if [[ "$(git -C "$POLY_DIR" rev-parse HEAD 2>/dev/null || true)" == "$POLYFILL_REF" ]]; then
        at_pin=true
    fi
fi

if [[ "$at_pin" == true ]]; then
    printf 'Polyfill corpus already at %s, skipping fetch.\n' "$POLYFILL_REF"
else
    # Fresh clone only when there is no repository yet: blobless and without
    # a checkout.
    [[ -d "$POLY_DIR/.git" ]] || git clone \
        --filter=blob:none \
        --no-checkout \
        "https://github.com/kenchris/urlpattern-polyfill.git" \
        "$POLY_DIR"

    # Restrict the working tree to ``test``, then fetch and check out the
    # pinned commit.
    git -C "$POLY_DIR" sparse-checkout init --no-cone >/dev/null
    git -C "$POLY_DIR" sparse-checkout set test
    git -C "$POLY_DIR" fetch --filter=blob:none origin "$POLYFILL_REF"
    git -C "$POLY_DIR" checkout --quiet "$POLYFILL_REF"
fi

# Always validate, whether the corpus was just fetched or already present.
verify_corpus
Loading
Loading