Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
279 changes: 279 additions & 0 deletions .github/scripts/ci_health_report/ci_health_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
CI Health Report

Queries completed GitHub Actions workflow runs over a lookback window,
calculates per-job failure rates, and posts a summary to a GitHub issue.

Required environment variables (all must be set; there are no defaults):
GH_TOKEN - GitHub token with actions:read and issues:write
GH_REPO - Repository in "owner/repo" format
REPORT_ISSUE - Issue number to post the report to
LOOKBACK_DAYS - How many days back to look (e.g. 30)
TOP_JOBS - How many jobs to list in the "Top N Most Failing Jobs" section
"""

import os
import sys
import time
from datetime import datetime, timedelta, timezone
from collections import defaultdict
import urllib.request
import urllib.error
import json

# Job conclusions that contribute to the statistics; jobs with any other
# conclusion are skipped when the per-job counters are aggregated.
COUNTED_CONCLUSIONS = {"failure", "success"}


def bucket_count(lookback_days):
    """Map a lookback window (in days) to the number of trend buckets.

    The bucket size follows natural calendar units so that each bucket
    boundary means something to a reader:

    - more than 90 days: one bucket per 28 days (roughly monthly)
    - 15 to 90 days: one bucket per 7 days (weekly)
    - 14 days or fewer: one bucket per day

    Partial trailing units are dropped by the integer division.
    """
    if lookback_days > 90:
        return lookback_days // 28
    if lookback_days > 14:
        return lookback_days // 7
    return lookback_days


def trend_indicator(buckets):
    """Compare first-half vs second-half failure rate as an arrow + delta string.

    Args:
        buckets: Sequence of ``{"runs": int, "failures": int}`` dicts ordered
            oldest to newest. With an odd number of buckets the extra bucket
            falls into the recent half.

    Returns:
        ``"—"`` when either half has no runs (no rate can be computed).
        Otherwise ``"<arrow> <delta>%"`` where ``delta`` is the change in
        failure rate in percentage points, rounded to one decimal:
        ``→`` for a change below 1 point, ``↑`` for an increase and ``↓``
        for a decrease.
    """
    mid = len(buckets) // 2
    early, recent = buckets[:mid], buckets[mid:]
    e_runs = sum(b["runs"] for b in early)
    e_fails = sum(b["failures"] for b in early)
    r_runs = sum(b["runs"] for b in recent)
    r_fails = sum(b["failures"] for b in recent)
    if e_runs == 0 or r_runs == 0:
        return "—"

    # Round once and classify on the rounded value so the arrow always agrees
    # with the number printed next to it (0.96 is shown as +1.0, so it must
    # not be labelled "stable").
    delta = round((r_fails / r_runs - e_fails / e_runs) * 100, 1)
    if delta == 0:
        delta = 0.0  # collapse -0.0 so it is never rendered as "-0.0%"

    if abs(delta) < 1.0:
        arrow = "→"
    elif delta > 0:
        arrow = "↑"
    else:
        arrow = "↓"
    return f"{arrow} {delta:+.1f}%"


def _headers(token):
    """Build the HTTP headers sent with every GitHub API request.

    Args:
        token: GitHub token placed in the ``Authorization`` header as a
            bearer credential.

    Returns:
        A new dict holding the authorization, media-type and API-version
        headers.
    """
    bearer = f"Bearer {token}"
    return dict([
        ("Authorization", bearer),
        ("Accept", "application/vnd.github+json"),
        ("X-GitHub-Api-Version", "2022-11-28"),
    ])


def _rate_limit_wait(headers):
    """Return the number of seconds to sleep before retrying a rate-limited call.

    Preference order: ``Retry-After`` (relative seconds), then
    ``X-RateLimit-Reset`` (epoch seconds), then a fixed 60 seconds. A header
    that is missing, empty or not an integer (``Retry-After`` may also be an
    HTTP date) is skipped instead of raising. Five seconds of slack are added
    to header-derived waits.
    """
    candidates = (
        ("Retry-After", 0),                       # already relative
        ("X-RateLimit-Reset", int(time.time())),  # absolute epoch -> relative
    )
    for name, relative_to in candidates:
        raw = headers.get(name)
        if not raw:
            continue
        try:
            return max(0, int(raw) - relative_to) + 5
        except ValueError:
            continue
    return 60


def _urlopen(req):
    """Execute a request with one automatic retry on rate limit (403/429).

    Args:
        req: A prepared ``urllib.request.Request``.

    Returns:
        The raw response body as bytes.

    Raises:
        RuntimeError: If the first attempt, or the retry, fails with an HTTP
            status other than 403/429.
        SystemExit: With status 1 if the retry is still answered with
            403/429.
    """
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.read()
    except urllib.error.HTTPError as e:
        if e.code not in (403, 429):
            raise RuntimeError(f"GitHub API error {e.code} for {req.full_url}") from e
        wait = _rate_limit_wait(e.headers)
        print(f"Rate limited (HTTP {e.code}). Waiting {wait}s before retry...", file=sys.stderr)
        time.sleep(wait)

    # Retry once outside the except block so a second failure is handled cleanly.
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.read()
    except urllib.error.HTTPError as retry_e:
        if retry_e.code not in (403, 429):
            # Not a rate limit: report the real status instead of claiming
            # that the rate limit persisted.
            raise RuntimeError(
                f"GitHub API error {retry_e.code} for {req.full_url}"
            ) from retry_e
        print(
            f"Error: rate limit persists after retry (HTTP {retry_e.code}). Giving up.",
            file=sys.stderr,
        )
        sys.exit(1)


def gh_get(token, path):
    """GET one GitHub API resource and decode its JSON body.

    Args:
        token: GitHub token used to build the request headers.
        path: API path including any query string, appended to
            ``https://api.github.com``.

    Returns:
        The parsed JSON document.
    """
    request = urllib.request.Request(
        f"https://api.github.com{path}",
        headers=_headers(token),
    )
    raw_body = _urlopen(request)
    return json.loads(raw_body)


def get_runs(token, repo, since):
    """Collect every completed workflow run created on or after `since`.

    Pages through the repository's workflow-run listing 100 items at a time
    and stops at the first page that contains no runs.

    NOTE(review): GitHub's documentation describes a cap on the number of
    results returned per search when filters such as `created`/`status` are
    used — confirm whether long windows on busy repositories get truncated.

    Args:
        token: GitHub token passed through to `gh_get`.
        repo: Repository in "owner/repo" form.
        since: Timestamp string used as the lower bound of the `created` filter.

    Returns:
        A list of workflow-run dicts as returned by the API.
    """
    collected = []
    page_number = 0
    while True:
        page_number += 1
        query = f"?status=completed&created=>={since}&per_page=100&page={page_number}"
        payload = gh_get(token, f"/repos/{repo}/actions/runs{query}")
        page_items = payload.get("workflow_runs", [])
        if not page_items:
            return collected
        collected += page_items


def get_jobs(token, repo, run_id):
    """Collect every job belonging to one workflow run.

    Pages through the run's job listing 100 items at a time and stops at the
    first page that contains no jobs.

    Args:
        token: GitHub token passed through to `gh_get`.
        repo: Repository in "owner/repo" form.
        run_id: Identifier of the workflow run.

    Returns:
        A list of job dicts as returned by the API.
    """
    collected = []
    page_number = 0
    while True:
        page_number += 1
        query = f"?per_page=100&page={page_number}"
        payload = gh_get(token, f"/repos/{repo}/actions/runs/{run_id}/jobs{query}")
        page_items = payload.get("jobs", [])
        if not page_items:
            return collected
        collected += page_items


def post_comment(token, repo, issue_number, body):
    """Create a new comment on a GitHub issue.

    Args:
        token: GitHub token used to build the request headers.
        repo: Repository in "owner/repo" form.
        issue_number: Number of the issue that receives the comment.
        body: Comment text (Markdown).
    """
    endpoint = f"https://api.github.com/repos/{repo}/issues/{issue_number}/comments"
    request_headers = _headers(token)
    request_headers["Content-Type"] = "application/json"
    encoded_body = json.dumps({"body": body}).encode()
    # Supplying `data` makes urllib send the request as a POST.
    request = urllib.request.Request(endpoint, data=encoded_body, headers=request_headers)
    _urlopen(request)


def _failure_ratio(entry):
    """Return failures/runs for one stats entry, or 0.0 when it has no runs."""
    runs = entry["runs"]
    return entry["failures"] / runs if runs else 0.0


def _table_cell(value):
    """Render a value as a Markdown table cell, escaping the column separator."""
    return str(value).replace("|", "\\|")


def build_report(stats, lookback_days, top_n, now):
    """Build the markdown report string from aggregated job stats.

    The report has three sections: a table of every job sorted by failure
    rate (highest first), a ranked list of the `top_n` jobs by absolute
    failure count, and overall totals.

    The table carries a trend column next to the rate because the rate alone
    cannot tell a job that is regressing from one that is recovering over the
    same window. The trend is only computed when a job averages at least two
    runs per bucket; otherwise "—" is shown.

    Args:
        stats: Mapping of ``(workflow_name, job_name)`` to a dict with
            ``"runs"``, ``"failures"`` and ``"buckets"`` (a list of
            ``{"runs", "failures"}`` dicts, oldest to newest).
        lookback_days: Window length in days, used only in the report text.
        top_n: Number of entries in the "most failing" list.
        now: ``datetime`` used for the "generated" timestamp.

    Returns:
        The complete report as a single Markdown string.
    """
    # Main table: highest failure rate first. Entries without runs count as a
    # 0% rate instead of raising ZeroDivisionError.
    by_rate = sorted(stats.items(), key=lambda item: _failure_ratio(item[1]), reverse=True)

    table_lines = []
    for (workflow, job), s in by_rate:
        rate = _failure_ratio(s) * 100
        min_runs = len(s["buckets"]) * 2
        trend = trend_indicator(s["buckets"]) if s["runs"] >= min_runs else "—"
        # Names come from the API and may contain "|", which would otherwise
        # split the row into extra columns.
        table_lines.append(
            f"| {_table_cell(workflow)} | {_table_cell(job)} | {s['runs']} "
            f"| {s['failures']} | {rate:.1f}% | {trend} |"
        )

    # Top N by absolute failure count
    by_failures = sorted(stats.items(), key=lambda item: item[1]["failures"], reverse=True)
    top_lines = []
    for rank, ((workflow, job), s) in enumerate(by_failures[:top_n], start=1):
        rate = _failure_ratio(s) * 100
        top_lines.append(f"{rank}. **{workflow} / {job}** — {s['failures']} failures ({rate:.1f}%)")

    total_runs = sum(s["runs"] for s in stats.values())
    total_failures = sum(s["failures"] for s in stats.values())
    overall_rate = (total_failures / total_runs * 100) if total_runs else 0.0

    timestamp = now.strftime("%Y-%m-%dT%H:%M:%SZ")

    lines = [
        "## CI Health Report",
        "",
        f"_Last {lookback_days} days — generated {timestamp}_",
        "",
        "### Job Failure Rates",
        "",
        "| Workflow | Job | Runs | Failures | Rate | Trend |",
        "|----------|-----|------|----------|------|-------|",
        *table_lines,
        "",
        f"_Trend: the {lookback_days}-day window is divided into equal time buckets"
        " (daily for ≤ 14 days, weekly for ≤ 90 days, ~monthly beyond that)."
        " The failure rate in the first half of those buckets is compared to the second half:"
        " ↑ = getting worse, ↓ = improving, → = stable (< 1 pp change)."
        " — = fewer than 2 runs per bucket on average; not enough data._",
        "",
        f"### Top {top_n} Most Failing Jobs",
        "",
        *top_lines,
        "",
        "### Summary",
        "",
        f"- **Total job runs:** {total_runs}",
        f"- **Total failures:** {total_failures}",
        f"- **Overall failure rate:** {overall_rate:.1f}%",
    ]
    return "\n".join(lines)


def main():
    """Collect job statistics for the lookback window and publish the report.

    Reads GH_TOKEN, GH_REPO, REPORT_ISSUE, LOOKBACK_DAYS and TOP_JOBS from
    the environment, fetches completed workflow runs and their jobs,
    aggregates per-(workflow, job) success/failure counts into time buckets,
    and posts the rendered report as an issue comment. If GITHUB_STEP_SUMMARY
    is set, the report is also appended to that file.

    Exits with status 1 when a required variable is missing or when
    LOOKBACK_DAYS / TOP_JOBS are not usable integers. Returns without posting
    when no runs or no countable jobs were found.
    """
    token = os.environ.get("GH_TOKEN", "")
    repo = os.environ.get("GH_REPO", "")
    issue_number = os.environ.get("REPORT_ISSUE", "")
    lookback_days_str = os.environ.get("LOOKBACK_DAYS", "")
    top_jobs_str = os.environ.get("TOP_JOBS", "")

    if not token or not repo or not issue_number or not lookback_days_str or not top_jobs_str:
        print("Error: GH_TOKEN, GH_REPO, REPORT_ISSUE, LOOKBACK_DAYS, and TOP_JOBS must all be set.", file=sys.stderr)
        sys.exit(1)

    try:
        lookback_days = int(lookback_days_str)
        top_jobs = int(top_jobs_str)
    except ValueError:
        print("Error: LOOKBACK_DAYS and TOP_JOBS must be integers.", file=sys.stderr)
        sys.exit(1)

    # A non-positive window yields zero buckets and a zero-length time span,
    # which would fail later with IndexError / ZeroDivisionError; a negative
    # TOP_JOBS would slice the ranking from the wrong end.
    if lookback_days < 1 or top_jobs < 0:
        print("Error: LOOKBACK_DAYS must be >= 1 and TOP_JOBS must be >= 0.", file=sys.stderr)
        sys.exit(1)

    now = datetime.now(timezone.utc)
    since_dt = now - timedelta(days=lookback_days)
    since = since_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    num_buckets = bucket_count(lookback_days)

    print(f"Fetching workflow runs since {since}...")
    runs = get_runs(token, repo, since)
    print(f"Found {len(runs)} completed runs.")

    if not runs:
        print("No runs found. Skipping report.")
        return

    # Aggregate: (workflow_name, job_name) -> {runs, failures, buckets}
    # Only "success" and "failure" conclusions are counted; skipped/cancelled are excluded.
    # Buckets divide the lookback window into equal time slices (oldest → newest) for trend tracking.
    stats = defaultdict(lambda: {
        "runs": 0,
        "failures": 0,
        "buckets": [{"runs": 0, "failures": 0} for _ in range(num_buckets)],
    })
    window_secs = (now - since_dt).total_seconds()

    for i, run in enumerate(runs, start=1):
        print(f"  Fetching jobs for run {i}/{len(runs)} (id={run['id']})...")
        run_dt = datetime.fromisoformat(run["created_at"].replace("Z", "+00:00"))
        elapsed = (run_dt - since_dt).total_seconds()  # seconds from window start to this run
        # clamp: elapsed==window_secs would produce index num_buckets
        bucket_idx = min(int(elapsed / window_secs * num_buckets), num_buckets - 1)
        bucket_idx = max(0, bucket_idx)  # clamp: clock skew can make elapsed slightly negative

        jobs = get_jobs(token, repo, run["id"])
        for job in jobs:
            conclusion = job.get("conclusion")
            if conclusion not in COUNTED_CONCLUSIONS:
                continue
            key = (run["name"], job["name"])
            stats[key]["runs"] += 1
            stats[key]["buckets"][bucket_idx]["runs"] += 1
            if conclusion == "failure":
                stats[key]["failures"] += 1
                stats[key]["buckets"][bucket_idx]["failures"] += 1

    if not stats:
        print("No job data collected. Skipping report.")
        return

    report = build_report(stats, lookback_days, top_jobs, now)

    # Write to GitHub step summary if available
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if summary_path:
        # The report contains non-ASCII characters (arrows, dashes), so the
        # encoding must not depend on the runner's locale.
        with open(summary_path, "a", encoding="utf-8") as f:
            f.write(report + "\n")

    print(f"Posting report to issue #{issue_number}...")
    post_comment(token, repo, issue_number, report)
    print("Report generated successfully.")


if __name__ == "__main__":
    main()
64 changes: 64 additions & 0 deletions .github/scripts/ci_health_report/simulate_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""
Generates a sample CI health report with synthetic data and writes it to a file.
Usage: python3 simulate_report.py [output.md]
"""

import sys
from collections import defaultdict
from datetime import datetime, timezone

from ci_health_report import build_report

# Each entry: (workflow, job, buckets)
# Buckets run oldest → newest; each is {runs, failures}.
SCENARIOS = [
    # Clearly getting worse — failure rate climbing week over week
    ("Tests", "lint", [{"runs": 20, "failures": 1},
                       {"runs": 20, "failures": 3},
                       {"runs": 20, "failures": 8},
                       {"runs": 20, "failures": 14}]),
    # Clearly improving — failure rate falling
    ("Tests", "unit", [{"runs": 20, "failures": 12},
                       {"runs": 20, "failures": 8},
                       {"runs": 20, "failures": 3},
                       {"runs": 20, "failures": 1}]),
    # Flat / stable low failure rate
    ("Tests", "build", [{"runs": 20, "failures": 2},
                        {"runs": 20, "failures": 2},
                        {"runs": 20, "failures": 2},
                        {"runs": 20, "failures": 2}]),
    # Flat / stable high failure rate
    ("Integration", "smoke", [{"runs": 20, "failures": 14},
                              {"runs": 20, "failures": 15},
                              {"runs": 20, "failures": 13},
                              {"runs": 20, "failures": 14}]),
    # Spike in the middle, now recovering
    ("Integration", "full", [{"runs": 20, "failures": 2},
                             {"runs": 20, "failures": 18},
                             {"runs": 20, "failures": 18},
                             {"runs": 20, "failures": 3}]),
    # Sparse — only 2 runs total, should show —
    ("Nightly", "deploy", [{"runs": 1, "failures": 1},
                           {"runs": 0, "failures": 0},
                           {"runs": 0, "failures": 0},
                           {"runs": 1, "failures": 0}]),
]

# Shape the scenarios like the aggregate build_report expects:
# (workflow, job) -> {runs, failures, buckets}, with totals derived from the buckets.
stats = defaultdict(lambda: {"runs": 0, "failures": 0, "buckets": []})
for workflow, job, buckets in SCENARIOS:
    key = (workflow, job)
    stats[key]["runs"] = sum(b["runs"] for b in buckets)
    stats[key]["failures"] = sum(b["failures"] for b in buckets)
    stats[key]["buckets"] = buckets

# Fixed timestamp so the generated sample is reproducible.
now = datetime(2026, 4, 7, 9, 0, 0, tzinfo=timezone.utc)
report = build_report(stats, lookback_days=30, top_n=5, now=now)

output = sys.argv[1] if len(sys.argv) > 1 else "sample_report.md"
# The report contains non-ASCII characters (arrows, dashes, "≤"); write it as
# UTF-8 explicitly so the script does not depend on the platform's locale.
with open(output, "w", encoding="utf-8") as f:
    f.write(report + "\n")

print(f"Written to {output}")
print()
print(report)
Loading
Loading