From f59ea9a90bc392214d3a3c2180d7791b5a1ee235 Mon Sep 17 00:00:00 2001
From: Greg V <greg.vannoni@gmail.com>
Date: Sun, 29 Mar 2026 06:56:24 -0700
Subject: [PATCH 1/3] Fix Fly deployment: convert from HTTP service to
 background worker

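The scheduler was being auto-stopped by Fly because fly.toml configured
it as an HTTP service. Since the app is a long-running background worker,
this change removes the [http_service] section (along with the [env] PORT,
[[mounts]], and [[metrics]] settings) and runs the scheduler as a
[processes] entry instead.
It also adds ignored-users filtering (GITHUB_IGNORED_USERS) and updates the
target org to 2026-ASU-WiCS-Opportunity-Hack.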

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md            | 31 ++++++++++++++++++++++++++++++-
 Dockerfile           |  2 +-
 fly.toml             | 20 ++------------------
 src/github_client.py | 21 ++++++++++++++++++---
 src/main.py          | 12 ++++++++++--
 src/scheduler.py     |  6 +++++-
 6 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 0c55906..4f5e248 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -5,12 +5,41 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 ## Build & Run Commands
 - Install dependencies: `pip install -r requirements.txt`
 - Run application: `python src/main.py <org_names>`
-- Run scheduler: `python src/scheduler.py`
+- Run scheduler: `python src/scheduler.py <org_names>`
+- Get achievements only: `python src/main.py --achievements-only <org_names>`
 - Run tests: `pytest`
 - Lint code: `flake8 src/`
 - Format code: `black src/`
 - Type check: `mypy src/`
 
+## Environment Setup
+Required environment variables:
+- `GITHUB_TOKEN`: GitHub personal access token
+- `GOOGLE_APPLICATION_CREDENTIALS_JSON`: JSON string of Firestore service account credentials
+- `COLLECTION_INTERVAL`: daily, hourly, minutely, or weekly (default: daily)
+- `COLLECTION_TIME`: time to run collection (default: 00:00)
+- `GITHUB_IGNORED_USERS`: comma-separated list of GitHub usernames to ignore (default: gregv)
+
+## Architecture Overview
+This is a GitHub organization metrics collector with the following key components:
+
+**Core Classes:**
+- `GitHubClient`: Handles GitHub API interactions using aiohttp for async requests
+- `FirestoreClient`: Manages Firestore database operations for data persistence
+- `MetricsCollector`: Orchestrates data collection and coordinates between GitHub and Firestore
+- `AchievementsGenerator`: Generates achievement data based on contributor statistics
+- `Scheduler`: Handles periodic execution of metrics collection
+
+**Data Flow:**
+1. GitHub API → contributor stats, repository data, pull requests, commits
+2. Firestore structure: organizations/{org}/repositories/{repo}/contributors/{user}
+3. Achievements are generated from aggregated contributor data and stored separately
+
+**Key Operations:**
+- Organization processing: fetches all repos → all contributors → individual stats
+- Achievement generation: aggregates data across repositories to create achievements
+- Scheduled execution: runs collection at configurable intervals
+
 ## Code Style Guidelines
 - Use Python 3.11+ features
 - Format with black (line length: 88)
diff --git a/Dockerfile b/Dockerfile
index 210516c..5216a7a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,4 +27,4 @@ RUN useradd -m appuser
 USER appuser
 
 # Run the scheduler
-CMD ["python", "src/scheduler.py", "2024-Arizona-Opportunity-Hack"]
\ No newline at end of file
+CMD ["python", "src/scheduler.py", "2026-ASU-WiCS-Opportunity-Hack"]
diff --git a/fly.toml b/fly.toml
index f0a52b7..7325f45 100644
--- a/fly.toml
+++ b/fly.toml
@@ -8,27 +8,11 @@ primary_region = 'lax'
 
 [build]
 
-[env]
-  PORT = '8080'
-
-[[mounts]]
-  source = 'github_stats_data'
-  destination = '/data'
-
-[http_service]
-  internal_port = 8080
-  force_https = true
-  auto_stop_machines = 'stop'
-  auto_start_machines = true
-  min_machines_running = 1
-  processes = ['app']
+[processes]
+  app = "python src/scheduler.py 2026-ASU-WiCS-Opportunity-Hack"
 
 [[vm]]
   size = 'shared-cpu-1x'
   memory = '512mb'
   cpu_kind = 'shared'
   cpus = 1
-
-[[metrics]]
-  port = 9091
-  path = '/metrics'
diff --git a/src/github_client.py b/src/github_client.py
index 07f0851..311a2ff 100644
--- a/src/github_client.py
+++ b/src/github_client.py
@@ -1,12 +1,13 @@
 import aiohttp
 import asyncio
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Set
 import logging
+import os
 
 logger = logging.getLogger(__name__)
 
 class GitHubClient:
-    def __init__(self, token: str):
+    def __init__(self, token: str, ignored_users: Set[str] = None):
         self.token = token
         self.base_url = "https://api.github.com"
         self.headers = {
@@ -14,6 +15,8 @@ def __init__(self, token: str):
             "Accept": "application/vnd.github.v3+json"
         }
         self.session = None
+        self.ignored_users = ignored_users or set()
+        logger.info(f"GitHubClient initialized with {len(self.ignored_users)} ignored users: {self.ignored_users}")
 
     async def ensure_session(self):
         if self.session is None or self.session.closed:
@@ -31,7 +34,19 @@ async def get_organization_repos(self, org_name: str) -> List[Dict[str, Any]]:
     async def get_repo_contributors(self, repo_full_name: str) -> List[Dict[str, Any]]:
         await self.ensure_session()
         url = f"{self.base_url}/repos/{repo_full_name}/contributors"
-        return await self.get_paginated_data(url)
+        contributors = await self.get_paginated_data(url)
+        
+        # Filter out ignored users
+        filtered_contributors = [
+            contributor for contributor in contributors 
+            if contributor.get('login') not in self.ignored_users
+        ]
+        
+        if len(contributors) != len(filtered_contributors):
+            ignored_count = len(contributors) - len(filtered_contributors)
+            logger.info(f"Filtered out {ignored_count} ignored contributors from {repo_full_name}")
+        
+        return filtered_contributors
 
     async def get_contributor_stats(self, org_name: str, repo_name: str, contributor_login: str) -> Dict[str, Any]:
         await self.ensure_session()
diff --git a/src/main.py b/src/main.py
index 7bad44f..9c437c4 100644
--- a/src/main.py
+++ b/src/main.py
@@ -19,7 +19,11 @@ async def process_organization(org_name: str, github_token: str, firestore_crede
     github_client = None
     firestore_client = None
     try:
-        github_client = GitHubClient(github_token)
+        # Parse ignored users from environment variable
+        ignored_users_str = os.getenv("GITHUB_IGNORED_USERS", "gregv")
+        ignored_users = set(user.strip() for user in ignored_users_str.split(",") if user.strip())
+        
+        github_client = GitHubClient(github_token, ignored_users)
         firestore_client = FirestoreClient(firestore_credentials)
         metrics_collector = MetricsCollector(github_client, firestore_client)
 
@@ -50,7 +54,11 @@ async def get_achievements(org_name: str, github_token: str, firestore_credentia
     github_client = None
     firestore_client = None
     try:
-        github_client = GitHubClient(github_token)
+        # Parse ignored users from environment variable
+        ignored_users_str = os.getenv("GITHUB_IGNORED_USERS", "gregv")
+        ignored_users = set(user.strip() for user in ignored_users_str.split(",") if user.strip())
+        
+        github_client = GitHubClient(github_token, ignored_users)
         firestore_client = FirestoreClient(firestore_credentials)
         metrics_collector = MetricsCollector(github_client, firestore_client)
         
diff --git a/src/scheduler.py b/src/scheduler.py
index 2a03e22..3c2efa3 100644
--- a/src/scheduler.py
+++ b/src/scheduler.py
@@ -31,10 +31,14 @@ def __init__(self, github_token: str, firestore_credentials: str, organizations:
         self.github_client = None
         self.firestore_client = None
         self.metrics_collector = None
+        
+        # Parse ignored users from environment variable
+        ignored_users_str = os.getenv("GITHUB_IGNORED_USERS", "gregv")
+        self.ignored_users = set(user.strip() for user in ignored_users_str.split(",") if user.strip())
 
     async def setup(self):
         """Set up the clients and collector."""
-        self.github_client = GitHubClient(self.github_token)
+        self.github_client = GitHubClient(self.github_token, self.ignored_users)
         self.firestore_client = FirestoreClient(self.firestore_credentials)
         self.metrics_collector = MetricsCollector(self.github_client, self.firestore_client)
 

From 86df03bcbb2e451b7f7c0f52d0cfaacc230e0e7b Mon Sep 17 00:00:00 2001
From: Greg V <greg.vannoni@gmail.com>
Date: Sun, 29 Mar 2026 07:05:15 -0700
Subject: [PATCH 2/3] Keep mounts in fly.toml to match existing Fly machine
 volume

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 fly.toml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fly.toml b/fly.toml
index 7325f45..3a5f9b9 100644
--- a/fly.toml
+++ b/fly.toml
@@ -8,6 +8,10 @@ primary_region = 'lax'
 
 [build]
 
+[[mounts]]
+  source = 'github_stats_data'
+  destination = '/data'
+
 [processes]
   app = "python src/scheduler.py 2026-ASU-WiCS-Opportunity-Hack"
 

From 1296f8a5734b2a7f08457b555b6cc81a781915c1 Mon Sep 17 00:00:00 2001
From: Greg V <greg.vannoni@gmail.com>
Date: Mon, 30 Mar 2026 11:59:53 -0700
Subject: [PATCH 3/3] Track unique files changed per contributor and improve
 achievement explainability

- Fetch individual commit details to get accurate additions/deletions (the
  list-commits endpoint does not return stats or files) and track unique
  significant files per contributor
- Exclude boilerplate files (.md, .txt, .lock, README, LICENSE, etc.) from file counts
- Add team field to contributor stats (set from repo_name) to fix team achievement generation
- Add MIN_FILES_FOR_PRODUCTIVE_TEAM threshold (10) to prevent low-substance teams from
  winning Most Productive Team
- Add "Ready for a Boost" mentor opportunity entries for teams with low activity
  (fewer than 10 significant files changed or fewer than 5 commits)
- Add units to bare-number achievement values (commits, PRs, issues, reviews)
- Improve achievement descriptions to explain exactly how each metric is calculated

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/achievements_generator.py | 79 ++++++++++++++++++++++++++++-------
 src/github_client.py          | 48 +++++++++++++++++++--
 2 files changed, 107 insertions(+), 20 deletions(-)

diff --git a/src/achievements_generator.py b/src/achievements_generator.py
index d4e9135..d245022 100644
--- a/src/achievements_generator.py
+++ b/src/achievements_generator.py
@@ -8,6 +8,8 @@
 logger = logging.getLogger(__name__)
 
 class AchievementsGenerator:
+    MIN_FILES_FOR_PRODUCTIVE_TEAM = 10
+
     def __init__(self):
         logger.info("AchievementsGenerator initialized")
 
@@ -41,6 +43,9 @@ def generate_achievements(self, org_name: str, repositories: List[Dict[str, Any]
             # Generate team achievements
             achievements.extend(self._find_most_productive_team(org_name, repos_data, contributors_data))
             achievements.extend(self._find_most_collaborative_team(org_name, repos_data, contributors_data))
+
+            # Generate mentor opportunity entries for teams that could use support
+            achievements.extend(self._find_teams_ready_for_boost(org_name, repos_data, contributors_data))
             
             logger.info(f"Generated {len(achievements)} achievements for {org_name}")
             return achievements
@@ -128,7 +133,7 @@ def _find_epic_pr(self, org_name: str, repos_data: Dict[str, Any], contributors_
                     },
                     "value": formatted_size,
                     "icon": "merge",
-                    "description": "Largest pull request merged",
+                    "description": "Largest merged PR by lines added + deleted",
                     "repo": largest_pr_repo,
                     "prNumber": str(largest_pr_number) if largest_pr_number else ""
                 }]
@@ -173,9 +178,9 @@ def _find_night_owl(self, org_name: str, repos_data: Dict[str, Any], contributor
                         "team": latest_contributor.get('team', ""),
                         "githubUsername": latest_contributor['login']
                     },
-                    "value": latest_total_commits,
+                    "value": f"{latest_total_commits} commits",
                     "icon": "accessTime",
-                    "description": "Most commits made at night",
+                    "description": "Commits between 10pm–4am MST",
                     "repo": latest_repo                    
                 }]
             return []
@@ -209,7 +214,7 @@ def _find_code_surgeon(self, org_name: str, repos_data: Dict[str, Any], contribu
                     },
                     "value": formatted_deletions,
                     "icon": "delete",
-                    "description": "Most code deleted",
+                    "description": "Most lines of code removed across commits",
                     "repo": max_deletion_repo
                 }]
             return []
@@ -239,9 +244,9 @@ def _find_pr_master(self, org_name: str, repos_data: Dict[str, Any], contributor
                         "team": max_pr_contributor.get('team', ""),
                         "githubUsername": max_pr_contributor['login']
                     },
-                    "value": str(max_prs),
+                    "value": f"{max_prs} PRs",
                     "icon": "pull_request",
-                    "description": "Most PRs created",
+                    "description": "Most pull requests created",
                     "repo": max_pr_repo
                 }]
             return []
@@ -271,9 +276,9 @@ def _find_issue_resolver(self, org_name: str, repos_data: Dict[str, Any], contri
                         "team": max_issue_contributor.get('team', ""),
                         "githubUsername": max_issue_contributor['login']
                     },
-                    "value": str(max_issues_closed),
+                    "value": f"{max_issues_closed} issues",
                     "icon": "task_alt",
-                    "description": "Most issues closed",
+                    "description": "Most GitHub issues closed",
                     "repo": max_issue_repo
                 }]
             return []
@@ -304,9 +309,9 @@ def _find_review_champion(self, org_name: str, repos_data: Dict[str, Any], contr
                         "team": max_review_contributor.get('team', ""),
                         "githubUsername": max_review_contributor['login']
                     },
-                    "value": str(max_reviews),
+                    "value": f"{max_reviews} reviews",
                     "icon": "rate_review",
-                    "description": "Most PR reviews submitted",
+                    "description": "Most pull request reviews submitted",
                     "repo": max_review_repo
                 }]
             return []
@@ -336,9 +341,9 @@ def _find_weekend_warrior(self, org_name: str, repos_data: Dict[str, Any], contr
                         "team": max_weekend_contributor.get('team', ""),
                         "githubUsername": max_weekend_contributor['login']
                     },
-                    "value": str(max_weekend_commits),
+                    "value": f"{max_weekend_commits} commits",
                     "icon": "weekend",
-                    "description": "Most commits on weekends",
+                    "description": "Most commits on Saturday & Sunday",
                     "repo": max_weekend_repo
                 }]
             return []
@@ -407,16 +412,19 @@ def _find_most_productive_team(self, org_name: str, repos_data: Dict[str, Any],
                 tasks_completed = 0
                 members_count = len(team_members)
                 
+                files_changed = 0
                 for member in team_members:
                     # Count completed tasks (commits + PRs merged + issues closed)
                     tasks_completed += member.get('commits', 0)
                     tasks_completed += member.get('pull_requests', {}).get('merged', 0)
                     tasks_completed += member.get('issues', {}).get('closed', 0)
-                
-                if tasks_completed > 0 and members_count > 0:
+                    files_changed += member.get('unique_files_changed', 0)
+
+                if tasks_completed > 0 and members_count > 0 and files_changed >= self.MIN_FILES_FOR_PRODUCTIVE_TEAM:
                     team_productivity[team_name] = {
                         "tasks": tasks_completed,
                         "members": members_count,
+                        "files_changed": files_changed,
                         "members_data": team_members
                     }
             
@@ -436,7 +444,8 @@ def _find_most_productive_team(self, org_name: str, repos_data: Dict[str, Any],
                 "value": f"{team_data['tasks']} tasks",
                 "icon": "group",
                 "members": team_data["members"],
-                "description": "Completed the most tasks during the hackathon",
+                "filesChanged": team_data["files_changed"],
+                "description": "Most commits + merged PRs + issues closed",
                 "teamPage": team_page
             }]
         except Exception as e:
@@ -493,9 +502,47 @@ def _find_most_collaborative_team(self, org_name: str, repos_data: Dict[str, Any
                 "value": f"{team_data['collaboration']} PRs reviewed",
                 "icon": "merge",
                 "members": team_data["members"],
-                "description": "Highest number of PR reviews and comments",
+                "description": "Highest number of pull request reviews",
                 "teamPage": team_page
             }]
         except Exception as e:
             logger.error(f"Error finding most collaborative team achievement: {str(e)}")
+            return []
+
+    def _find_teams_ready_for_boost(self, org_name: str, repos_data: Dict[str, Any], contributors_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Find teams with low GitHub activity so mentors can offer support."""
+        try:
+            teams = defaultdict(list)
+            for contributor in contributors_data:
+                team = contributor.get('team')
+                if team:
+                    teams[team].append(contributor)
+
+            if not teams:
+                return []
+
+            boost_teams = []
+            for team_name, team_members in teams.items():
+                total_commits = sum(member.get('commits', 0) for member in team_members)
+                files_changed = sum(member.get('unique_files_changed', 0) for member in team_members)
+                members_count = len(team_members)
+
+                if files_changed < self.MIN_FILES_FOR_PRODUCTIVE_TEAM or total_commits < 5:
+                    team_page = team_name.lower().replace(' ', '-')
+                    boost_teams.append({
+                        "title": "Ready for a Boost",
+                        "team": team_name,
+                        "value": f"{files_changed} files, {total_commits} commits",
+                        "icon": "rocket_launch",
+                        "members": members_count,
+                        "filesChanged": files_changed,
+                        "description": "This team might benefit from some mentor guidance to get rolling!",
+                        "teamPage": team_page,
+                        "type": "mentor_opportunity"
+                    })
+
+            logger.info(f"Found {len(boost_teams)} teams ready for a boost in {org_name}")
+            return boost_teams
+        except Exception as e:
+            logger.error(f"Error finding teams ready for boost: {str(e)}")
             return []
\ No newline at end of file
diff --git a/src/github_client.py b/src/github_client.py
index 311a2ff..7f66f6c 100644
--- a/src/github_client.py
+++ b/src/github_client.py
@@ -4,6 +4,15 @@
 import logging
 import os
 
+# Files and extensions excluded from unique_files_changed count
+# These are boilerplate/docs that don't represent substantive code work
+EXCLUDED_FILENAMES = {
+    "README.md", "CLAUDE.md", "LICENSE", "LICENSE.md",
+    ".gitignore", ".gitattributes", "CONTRIBUTING.md",
+    "CODE_OF_CONDUCT.md", "CHANGELOG.md",
+}
+EXCLUDED_EXTENSIONS = {".md", ".txt", ".lock"}
+
 logger = logging.getLogger(__name__)
 
 class GitHubClient:
@@ -56,6 +65,7 @@ async def get_contributor_stats(self, org_name: str, repo_name: str, contributor
             "login": contributor_login,
             "org_name": org_name,
             "repo_name": repo_name,
+            "team": repo_name,
             "commits": 0,
             "additions": 0,
             "deletions": 0,
@@ -81,6 +91,7 @@ async def get_contributor_stats(self, org_name: str, repo_name: str, contributor
             "latest_commit_time": None,
             "latest_commit_id": None,
             "largest_pr": None,
+            "unique_files_changed": 0,
             "avatar_url": None,
             "name": None
         }
@@ -179,10 +190,38 @@ async def get_contributor_stats(self, org_name: str, repo_name: str, contributor
                 stats["latest_commit_id"] = last_commit["sha"]
                 
 
-        for commit in commits:
-            if "stats" in commit:
-                stats["additions"] += commit["stats"].get("additions", 0)
-                stats["deletions"] += commit["stats"].get("deletions", 0)
+        # Fetch individual commit details for accurate stats and file tracking
+        # The list-commits endpoint doesn't return stats or files
+        unique_files = set()
+        semaphore = asyncio.Semaphore(10)
+
+        async def fetch_commit_detail(sha: str) -> dict | None:
+            async with semaphore:
+                url = f"{self.base_url}/repos/{repo_full_name}/commits/{sha}"
+                try:
+                    async with self.session.get(url) as response:
+                        if response.status == 200:
+                            return await response.json()
+                except Exception as e:
+                    logger.warning(f"Error fetching commit detail {sha}: {str(e)}")
+                return None
+
+        commit_details = await asyncio.gather(
+            *[fetch_commit_detail(c["sha"]) for c in commits]
+        )
+
+        for detail in commit_details:
+            if detail:
+                stats["additions"] += detail.get("stats", {}).get("additions", 0)
+                stats["deletions"] += detail.get("stats", {}).get("deletions", 0)
+                for f in detail.get("files", []):
+                    filename = f.get("filename", "")
+                    basename = os.path.basename(filename)
+                    _, ext = os.path.splitext(basename)
+                    if basename not in EXCLUDED_FILENAMES and ext not in EXCLUDED_EXTENSIONS:
+                        unique_files.add(filename)
+
+        stats["unique_files_changed"] = len(unique_files)
 
         # Fetch PR stats
         prs_url = f"{self.base_url}/repos/{repo_full_name}/pulls"
@@ -213,6 +252,7 @@ async def get_contributor_stats(self, org_name: str, repo_name: str, contributor
                                 "title": pr_data.get("title"),
                                 "additions": pr_data.get("additions", 0),
                                 "deletions": pr_data.get("deletions", 0),
+                                "changed_files": pr_data.get("changed_files", 0),
                                 "merged_at": pr_data.get("merged_at")
                             }