From e6dd35dd775bebfcaa267471e82fa8caf8db7adf Mon Sep 17 00:00:00 2001
From: Dave North <6616703+dnorth98@users.noreply.github.com>
Date: Mon, 2 Mar 2026 11:19:02 -0500
Subject: [PATCH 1/2] Retry transient GitHub API errors (502, 503, 504) with
 backoff

GitHub occasionally returns 502 "Unicorn" errors for valid API calls.
Previously these were treated as permanent failures, causing incomplete
results. Now API calls that return 502, 503, or 504 are retried up to
3 times with exponential backoff (2s, 4s, 8s). If every retry fails,
the last status and response are returned to the caller unchanged.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 get-deployment-metrics.py | 65 ++++++++++++++++++++++++++++++++-------
 1 file changed, 54 insertions(+), 11 deletions(-)

diff --git a/get-deployment-metrics.py b/get-deployment-metrics.py
index 8e794c5..ceb32c8 100755
--- a/get-deployment-metrics.py
+++ b/get-deployment-metrics.py
@@ -54,6 +54,37 @@ def is_rate_limited(status, response):
     return False
 
 
+TRANSIENT_STATUS_CODES = {502, 503, 504}
+MAX_RETRIES = 3
+RETRY_BACKOFF_BASE = 2  # seconds
+
+
+def api_call_with_retry(api_func, description="API call"):
+    """Call an API function with retries for transient errors (502, 503, 504).
+
+    api_func should be a callable that returns (status, response).
+    """
+    for attempt in range(MAX_RETRIES + 1):
+        status, response = api_func()
+        if status not in TRANSIENT_STATUS_CODES:
+            return status, response
+        if attempt < MAX_RETRIES:
+            wait = RETRY_BACKOFF_BASE ** (attempt + 1)
+            logging.warning(
+                "Transient HTTP {} for {} — retrying in {}s (attempt {}/{})".format(
+                    status, description, wait, attempt + 1, MAX_RETRIES
+                )
+            )
+            time.sleep(wait)
+        else:
+            logging.warning(
+                "Transient HTTP {} for {} — giving up after {} retries".format(
+                    status, description, MAX_RETRIES
+                )
+            )
+    return status, response
+
+
 # Global list to collect output for file writing
 output_lines = []
 
@@ -75,10 +106,11 @@ def get_workflow_runs(org_name, repo_name, workflow_id, date_filter):
 
     while more_results:
         # repos/{org}}/{repo name}/actions/workflows/{workflow id}/runs
-        gh_status, workflow_runs = (
-            github_handle.repos[org_name][repo_name]
+        gh_status, workflow_runs = api_call_with_retry(
+            lambda p=page_to_get: github_handle.repos[org_name][repo_name]
             .actions.workflows[workflow_id]
-            .runs.get(created=date_filter, page=page_to_get)
+            .runs.get(created=date_filter, page=p),
+            description="workflow runs for {}/{}".format(org_name, repo_name),
         )
 
         # Check for rate limiting
@@ -208,7 +240,10 @@ def get_workflow_runs(org_name, repo_name, workflow_id, date_filter):
 
     # Get all the repos in the org
     # /orgs/{org}/repos
-    gh_status, repo_data = github_handle.orgs[args.org_name].repos.get()
+    gh_status, repo_data = api_call_with_retry(
+        lambda: github_handle.orgs[args.org_name].repos.get(),
+        description="repos for {}".format(args.org_name),
+    )
 
     if is_rate_limited(gh_status, repo_data):
         print(
@@ -228,9 +263,12 @@ def get_workflow_runs(org_name, repo_name, workflow_id, date_filter):
 
         # Now for each repo, see if we have a deployment workflow matching the pattern
         # /repos/{org}/{repo name}/actions/workflows
-        gh_status, workflow_data = github_handle.repos[args.org_name][
-            repo_name
-        ].actions.workflows.get()
+        gh_status, workflow_data = api_call_with_retry(
+            lambda r=repo_name: github_handle.repos[args.org_name][
+                r
+            ].actions.workflows.get(),
+            description="workflows for {}".format(repo_name),
+        )
 
         if is_rate_limited(gh_status, workflow_data):
             print(
@@ -352,10 +390,15 @@ def get_workflow_runs(org_name, repo_name, workflow_id, date_filter):
 
                         # How long did this run run for
                         # repos/{org}/{repo}/actions/runs/{run id}/timing
-                        gh_status, workflow_durations = (
-                            github_handle.repos[args.org_name][repo_name]
-                            .actions.runs[job_id]
-                            .timing.get()
+                        gh_status, workflow_durations = api_call_with_retry(
+                            lambda j=job_id, r=repo_name: github_handle.repos[
+                                args.org_name
+                            ][r]
+                            .actions.runs[j]
+                            .timing.get(),
+                            description="timing for run {} in {}".format(
+                                job_id, repo_name
+                            ),
                         )
 
                         if is_rate_limited(gh_status, workflow_durations):

From 10e8934d0770c1963de46f8654a9522c7b63cfd5 Mon Sep 17 00:00:00 2001
From: Dave North <6616703+dnorth98@users.noreply.github.com>
Date: Mon, 2 Mar 2026 11:23:02 -0500
Subject: [PATCH 2/2] Log transient retry attempts at DEBUG level instead of
 WARNING
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Retry warnings were polluting the report output. Individual retry
attempts are not noteworthy, so they are now logged at DEBUG level and
only appear with --verbose. The "giving up" message stays at WARNING
since that indicates incomplete results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 get-deployment-metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/get-deployment-metrics.py b/get-deployment-metrics.py
index ceb32c8..200d5e8 100755
--- a/get-deployment-metrics.py
+++ b/get-deployment-metrics.py
@@ -70,7 +70,7 @@ def api_call_with_retry(api_func, description="API call"):
             return status, response
         if attempt < MAX_RETRIES:
             wait = RETRY_BACKOFF_BASE ** (attempt + 1)
-            logging.warning(
+            logging.debug(
                 "Transient HTTP {} for {} — retrying in {}s (attempt {}/{})".format(
                     status, description, wait, attempt + 1, MAX_RETRIES
                 )