From f59ea9a90bc392214d3a3c2180d7791b5a1ee235 Mon Sep 17 00:00:00 2001 From: Greg V Date: Sun, 29 Mar 2026 06:56:24 -0700 Subject: [PATCH 1/3] Fix Fly deployment: convert from HTTP service to background worker The scheduler was being auto-stopped by Fly because fly.toml configured it as an HTTP service. Since the app is a long-running background worker, removed [http_service] section and configured as a process instead. Also adds ignored users filtering and updates org to 2026-ASU-WiCS. Co-Authored-By: Claude Opus 4.6 (1M context) --- CLAUDE.md | 31 ++++++++++++++++++++++++++++++- Dockerfile | 2 +- fly.toml | 20 ++------------------ src/github_client.py | 21 ++++++++++++++++++--- src/main.py | 12 ++++++++++-- src/scheduler.py | 6 +++++- 6 files changed, 66 insertions(+), 26 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 0c55906..4f5e248 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -5,12 +5,41 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Build & Run Commands - Install dependencies: `pip install -r requirements.txt` - Run application: `python src/main.py ` -- Run scheduler: `python src/scheduler.py` +- Run scheduler: `python src/scheduler.py ` +- Get achievements only: `python src/main.py --achievements-only ` - Run tests: `pytest` - Lint code: `flake8 src/` - Format code: `black src/` - Type check: `mypy src/` +## Environment Setup +Required environment variables: +- `GITHUB_TOKEN`: GitHub personal access token +- `GOOGLE_APPLICATION_CREDENTIALS_JSON`: JSON string of Firestore service account credentials +- `COLLECTION_INTERVAL`: daily, hourly, minutely, or weekly (default: daily) +- `COLLECTION_TIME`: time to run collection (default: 00:00) +- `GITHUB_IGNORED_USERS`: comma-separated list of GitHub usernames to ignore (default: gregv) + +## Architecture Overview +This is a GitHub organization metrics collector with the following key components: + +**Core Classes:** +- `GitHubClient`: Handles GitHub API interactions using 
aiohttp for async requests +- `FirestoreClient`: Manages Firestore database operations for data persistence +- `MetricsCollector`: Orchestrates data collection and coordinates between GitHub and Firestore +- `AchievementsGenerator`: Generates achievement data based on contributor statistics +- `Scheduler`: Handles periodic execution of metrics collection + +**Data Flow:** +1. GitHub API → contributor stats, repository data, pull requests, commits +2. Firestore structure: organizations/{org}/repositories/{repo}/contributors/{user} +3. Achievements are generated from aggregated contributor data and stored separately + +**Key Operations:** +- Organization processing: fetches all repos → all contributors → individual stats +- Achievement generation: aggregates data across repositories to create achievements +- Scheduled execution: runs collection at configurable intervals + ## Code Style Guidelines - Use Python 3.11+ features - Format with black (line length: 88) diff --git a/Dockerfile b/Dockerfile index 210516c..5216a7a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,4 +27,4 @@ RUN useradd -m appuser USER appuser # Run the scheduler -CMD ["python", "src/scheduler.py", "2024-Arizona-Opportunity-Hack"] \ No newline at end of file +CMD ["python", "src/scheduler.py", "2026-ASU-WiCS-Opportunity-Hack"] diff --git a/fly.toml b/fly.toml index f0a52b7..7325f45 100644 --- a/fly.toml +++ b/fly.toml @@ -8,27 +8,11 @@ primary_region = 'lax' [build] -[env] - PORT = '8080' - -[[mounts]] - source = 'github_stats_data' - destination = '/data' - -[http_service] - internal_port = 8080 - force_https = true - auto_stop_machines = 'stop' - auto_start_machines = true - min_machines_running = 1 - processes = ['app'] +[processes] + app = "python src/scheduler.py 2026-ASU-WiCS-Opportunity-Hack" [[vm]] size = 'shared-cpu-1x' memory = '512mb' cpu_kind = 'shared' cpus = 1 - -[[metrics]] - port = 9091 - path = '/metrics' diff --git a/src/github_client.py b/src/github_client.py index 
async def get_repo_contributors(self, repo_full_name: str) -> List[Dict[str, Any]]:
    """Return a repository's contributors with ignored users removed.

    Args:
        repo_full_name: Repository identifier interpolated into
            ``{base_url}/repos/{repo_full_name}/contributors``.

    Returns:
        The contributor dicts produced by ``self.get_paginated_data`` whose
        ``login`` is not listed in ``self.ignored_users``. Entries without a
        ``login`` are always kept.
    """
    await self.ensure_session()
    url = f"{self.base_url}/repos/{repo_full_name}/contributors"
    contributors = await self.get_paginated_data(url)

    # GitHub logins are case-insensitive, so "GregV" in the ignore list must
    # also match the login "gregv" returned by the API.
    ignored = {login.casefold() for login in self.ignored_users}

    kept = []
    for contributor in contributors:
        login = contributor.get("login")
        if login is None or login.casefold() not in ignored:
            kept.append(contributor)

    ignored_count = len(contributors) - len(kept)
    if ignored_count:
        logger.info(
            "Filtered out %d ignored contributors from %s",
            ignored_count,
            repo_full_name,
        )

    return kept
github_client = GitHubClient(github_token) + # Parse ignored users from environment variable + ignored_users_str = os.getenv("GITHUB_IGNORED_USERS", "gregv") + ignored_users = set(user.strip() for user in ignored_users_str.split(",") if user.strip()) + + github_client = GitHubClient(github_token, ignored_users) firestore_client = FirestoreClient(firestore_credentials) metrics_collector = MetricsCollector(github_client, firestore_client) @@ -50,7 +54,11 @@ async def get_achievements(org_name: str, github_token: str, firestore_credentia github_client = None firestore_client = None try: - github_client = GitHubClient(github_token) + # Parse ignored users from environment variable + ignored_users_str = os.getenv("GITHUB_IGNORED_USERS", "gregv") + ignored_users = set(user.strip() for user in ignored_users_str.split(",") if user.strip()) + + github_client = GitHubClient(github_token, ignored_users) firestore_client = FirestoreClient(firestore_credentials) metrics_collector = MetricsCollector(github_client, firestore_client) diff --git a/src/scheduler.py b/src/scheduler.py index 2a03e22..3c2efa3 100644 --- a/src/scheduler.py +++ b/src/scheduler.py @@ -31,10 +31,14 @@ def __init__(self, github_token: str, firestore_credentials: str, organizations: self.github_client = None self.firestore_client = None self.metrics_collector = None + + # Parse ignored users from environment variable + ignored_users_str = os.getenv("GITHUB_IGNORED_USERS", "gregv") + self.ignored_users = set(user.strip() for user in ignored_users_str.split(",") if user.strip()) async def setup(self): """Set up the clients and collector.""" - self.github_client = GitHubClient(self.github_token) + self.github_client = GitHubClient(self.github_token, self.ignored_users) self.firestore_client = FirestoreClient(self.firestore_credentials) self.metrics_collector = MetricsCollector(self.github_client, self.firestore_client) From 86df03bcbb2e451b7f7c0f52d0cfaacc230e0e7b Mon Sep 17 00:00:00 2001 From: Greg V Date: 
Sun, 29 Mar 2026 07:05:15 -0700 Subject: [PATCH 2/3] Keep mounts in fly.toml to match existing Fly machine volume Co-Authored-By: Claude Opus 4.6 (1M context) --- fly.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fly.toml b/fly.toml index 7325f45..3a5f9b9 100644 --- a/fly.toml +++ b/fly.toml @@ -8,6 +8,10 @@ primary_region = 'lax' [build] +[[mounts]] + source = 'github_stats_data' + destination = '/data' + [processes] app = "python src/scheduler.py 2026-ASU-WiCS-Opportunity-Hack" From 1296f8a5734b2a7f08457b555b6cc81a781915c1 Mon Sep 17 00:00:00 2001 From: Greg V Date: Mon, 30 Mar 2026 11:59:53 -0700 Subject: [PATCH 3/3] Track unique files changed per contributor and improve achievement explainability - Fetch individual commit details to get accurate additions/deletions (fixes broken list-commits endpoint) and track unique significant files per contributor - Exclude boilerplate files (.md, .txt, .lock, README, LICENSE, etc.) from file counts - Add team field to contributor stats (set from repo_name) to fix team achievement generation - Add MIN_FILES_FOR_PRODUCTIVE_TEAM threshold (10) to prevent low-substance teams from winning Most Productive Team - Add "Ready for a Boost" mentor opportunity entries for teams with low activity - Add units to bare-number achievement values (commits, PRs, issues, reviews) - Improve achievement descriptions to explain exactly how each metric is calculated Co-Authored-By: Claude Opus 4.6 (1M context) --- src/achievements_generator.py | 79 ++++++++++++++++++++++++++++------- src/github_client.py | 48 +++++++++++++++++++-- 2 files changed, 107 insertions(+), 20 deletions(-) diff --git a/src/achievements_generator.py b/src/achievements_generator.py index d4e9135..d245022 100644 --- a/src/achievements_generator.py +++ b/src/achievements_generator.py @@ -8,6 +8,8 @@ logger = logging.getLogger(__name__) class AchievementsGenerator: + MIN_FILES_FOR_PRODUCTIVE_TEAM = 10 + def __init__(self): logger.info("AchievementsGenerator 
initialized") @@ -41,6 +43,9 @@ def generate_achievements(self, org_name: str, repositories: List[Dict[str, Any] # Generate team achievements achievements.extend(self._find_most_productive_team(org_name, repos_data, contributors_data)) achievements.extend(self._find_most_collaborative_team(org_name, repos_data, contributors_data)) + + # Generate mentor opportunity entries for teams that could use support + achievements.extend(self._find_teams_ready_for_boost(org_name, repos_data, contributors_data)) logger.info(f"Generated {len(achievements)} achievements for {org_name}") return achievements @@ -128,7 +133,7 @@ def _find_epic_pr(self, org_name: str, repos_data: Dict[str, Any], contributors_ }, "value": formatted_size, "icon": "merge", - "description": "Largest pull request merged", + "description": "Largest merged PR by lines added + deleted", "repo": largest_pr_repo, "prNumber": str(largest_pr_number) if largest_pr_number else "" }] @@ -173,9 +178,9 @@ def _find_night_owl(self, org_name: str, repos_data: Dict[str, Any], contributor "team": latest_contributor.get('team', ""), "githubUsername": latest_contributor['login'] }, - "value": latest_total_commits, + "value": f"{latest_total_commits} commits", "icon": "accessTime", - "description": "Most commits made at night", + "description": "Commits between 10pm–4am MST", "repo": latest_repo }] return [] @@ -209,7 +214,7 @@ def _find_code_surgeon(self, org_name: str, repos_data: Dict[str, Any], contribu }, "value": formatted_deletions, "icon": "delete", - "description": "Most code deleted", + "description": "Most lines of code removed across commits", "repo": max_deletion_repo }] return [] @@ -239,9 +244,9 @@ def _find_pr_master(self, org_name: str, repos_data: Dict[str, Any], contributor "team": max_pr_contributor.get('team', ""), "githubUsername": max_pr_contributor['login'] }, - "value": str(max_prs), + "value": f"{max_prs} PRs", "icon": "pull_request", - "description": "Most PRs created", + "description": "Most pull 
requests created", "repo": max_pr_repo }] return [] @@ -271,9 +276,9 @@ def _find_issue_resolver(self, org_name: str, repos_data: Dict[str, Any], contri "team": max_issue_contributor.get('team', ""), "githubUsername": max_issue_contributor['login'] }, - "value": str(max_issues_closed), + "value": f"{max_issues_closed} issues", "icon": "task_alt", - "description": "Most issues closed", + "description": "Most GitHub issues closed", "repo": max_issue_repo }] return [] @@ -304,9 +309,9 @@ def _find_review_champion(self, org_name: str, repos_data: Dict[str, Any], contr "team": max_review_contributor.get('team', ""), "githubUsername": max_review_contributor['login'] }, - "value": str(max_reviews), + "value": f"{max_reviews} reviews", "icon": "rate_review", - "description": "Most PR reviews submitted", + "description": "Most pull request reviews submitted", "repo": max_review_repo }] return [] @@ -336,9 +341,9 @@ def _find_weekend_warrior(self, org_name: str, repos_data: Dict[str, Any], contr "team": max_weekend_contributor.get('team', ""), "githubUsername": max_weekend_contributor['login'] }, - "value": str(max_weekend_commits), + "value": f"{max_weekend_commits} commits", "icon": "weekend", - "description": "Most commits on weekends", + "description": "Most commits on Saturday & Sunday", "repo": max_weekend_repo }] return [] @@ -407,16 +412,19 @@ def _find_most_productive_team(self, org_name: str, repos_data: Dict[str, Any], tasks_completed = 0 members_count = len(team_members) + files_changed = 0 for member in team_members: # Count completed tasks (commits + PRs merged + issues closed) tasks_completed += member.get('commits', 0) tasks_completed += member.get('pull_requests', {}).get('merged', 0) tasks_completed += member.get('issues', {}).get('closed', 0) - - if tasks_completed > 0 and members_count > 0: + files_changed += member.get('unique_files_changed', 0) + + if tasks_completed > 0 and members_count > 0 and files_changed >= self.MIN_FILES_FOR_PRODUCTIVE_TEAM: 
def _find_teams_ready_for_boost(
    self,
    org_name: str,
    repos_data: Dict[str, Any],
    contributors_data: List[Dict[str, Any]],
    min_commits: int = 5,
) -> List[Dict[str, Any]]:
    """Find teams with low GitHub activity so mentors can offer support.

    A team is flagged when the sum of its members' ``unique_files_changed``
    is below ``self.MIN_FILES_FOR_PRODUCTIVE_TEAM`` or the sum of their
    ``commits`` is below ``min_commits``. The file total is a sum of
    per-member counts, so a file touched by two members is counted twice.

    Args:
        org_name: Organization name; used only in the log message.
        repos_data: Not read by this method; kept so the signature matches
            the other ``_find_*`` team methods.
        contributors_data: Contributor stat dicts. Entries without a truthy
            ``team`` value are skipped.
        min_commits: Commit total below which a team is flagged.

    Returns:
        One "Ready for a Boost" entry (``type`` = ``mentor_opportunity``) per
        flagged team, in first-seen team order. Returns an empty list when
        no contributor has a team or when an unexpected error occurs.
    """
    try:
        teams = defaultdict(list)
        for contributor in contributors_data:
            team = contributor.get("team")
            if team:
                teams[team].append(contributor)

        if not teams:
            return []

        min_files = self.MIN_FILES_FOR_PRODUCTIVE_TEAM
        boost_teams = []
        for team_name, team_members in teams.items():
            # `or 0`: a stat stored as None must not make sum() raise and
            # wipe out the entries for every other team.
            total_commits = sum(m.get("commits") or 0 for m in team_members)
            files_changed = sum(
                m.get("unique_files_changed") or 0 for m in team_members
            )

            if files_changed >= min_files and total_commits >= min_commits:
                continue

            boost_teams.append({
                "title": "Ready for a Boost",
                "team": team_name,
                "value": f"{files_changed} files, {total_commits} commits",
                "icon": "rocket_launch",
                "members": len(team_members),
                "filesChanged": files_changed,
                "description": "This team might benefit from some mentor guidance to get rolling!",
                "teamPage": team_name.lower().replace(" ", "-"),
                "type": "mentor_opportunity",
            })

        logger.info(
            "Found %d teams ready for a boost in %s", len(boost_teams), org_name
        )
        return boost_teams
    except Exception as e:
        # Best-effort like the other finders, but keep the traceback.
        logger.exception("Error finding teams ready for boost: %s", e)
        return []
semaphore = asyncio.Semaphore(10) + + async def fetch_commit_detail(sha: str) -> dict | None: + async with semaphore: + url = f"{self.base_url}/repos/{repo_full_name}/commits/{sha}" + try: + async with self.session.get(url) as response: + if response.status == 200: + return await response.json() + except Exception as e: + logger.warning(f"Error fetching commit detail {sha}: {str(e)}") + return None + + commit_details = await asyncio.gather( + *[fetch_commit_detail(c["sha"]) for c in commits] + ) + + for detail in commit_details: + if detail: + stats["additions"] += detail.get("stats", {}).get("additions", 0) + stats["deletions"] += detail.get("stats", {}).get("deletions", 0) + for f in detail.get("files", []): + filename = f.get("filename", "") + basename = os.path.basename(filename) + _, ext = os.path.splitext(basename) + if basename not in EXCLUDED_FILENAMES and ext not in EXCLUDED_EXTENSIONS: + unique_files.add(filename) + + stats["unique_files_changed"] = len(unique_files) # Fetch PR stats prs_url = f"{self.base_url}/repos/{repo_full_name}/pulls" @@ -213,6 +252,7 @@ async def get_contributor_stats(self, org_name: str, repo_name: str, contributor "title": pr_data.get("title"), "additions": pr_data.get("additions", 0), "deletions": pr_data.get("deletions", 0), + "changed_files": pr_data.get("changed_files", 0), "merged_at": pr_data.get("merged_at") }