diff --git a/.chroma_db/chroma.sqlite3 b/.chroma_db/chroma.sqlite3
new file mode 100644
index 0000000..9d62a79
Binary files /dev/null and b/.chroma_db/chroma.sqlite3 differ
diff --git a/.gitignore b/.gitignore
index f4e3a68..5749dc6 100644
Binary files a/.gitignore and b/.gitignore differ
diff --git a/PROJECT_STRUCTURE_GUIDE.md b/PROJECT_STRUCTURE_GUIDE.md
new file mode 100644
index 0000000..ccfe08e
--- /dev/null
+++ b/PROJECT_STRUCTURE_GUIDE.md
@@ -0,0 +1,478 @@
+# 📚 InspectAI - Complete Project Structure Guide
+
+## 🎯 Overview
+**InspectAI** is a **production-grade multi-agent AI code review system** that automatically analyzes GitHub pull requests using 12 specialized AI agents working in parallel.
+
+**What it does:**
+- 🤖 Automatically reviews code in GitHub PRs
+- 🐛 Detects bugs, security issues, and code quality problems
+- 💡 Provides intelligent suggestions and fixes
+- ⚡ Runs 12 agents in parallel for speed
+- 🔒 Works with multiple LLM providers (Gemini, OpenAI, Bytez)
+
+---
+
+## 📁 Root-Level Files & Directories
+
+### Files in Root Directory
+
+| File | Purpose |
+|------|---------|
+| **README.md** | Project overview with features, setup, and commands |
+| **requirements.txt** | Python dependencies (FastAPI, LangChain, LLaMA, etc.) |
+| **requirements-prod.txt** | Production-ready dependencies |
+| **.env.example** | Template for environment configuration |
+| **.env** | (created by you) Contains API keys and secrets |
+| **Dockerfile** | Docker container configuration for deployment |
+| **render.yaml** | Configuration for Render.com deployment |
+| **.gitignore** | Git ignore patterns |
+| **config/default_config.py** | Central configuration hub |
+| **test_local_review.py** | Local testing script |
+
+### Root Directories
+
+| Directory | Purpose |
+|-----------|---------|
+| **src/** | Main source code |
+| **config/** | Configuration files |
+| **docs/** | Documentation and guides |
+| **examples/** | Example scripts |
+| **scripts/** | Deployment and setup scripts |
+| **tests/** | Unit tests |
+
+---
+
+## 🔧 Configuration Directory: `config/`
+
+### `config/default_config.py` - THE CENTRAL HUB ⭐
+
+This is the **single source of truth** for all configuration:
+
+```python
+DEFAULT_PROVIDER = "gemini"  # Choose: "openai", "bytez", or "gemini"
+
+ORCHESTRATOR_CONFIG = {
+    "analysis": {...},         # Code style & quality review
+    "bug_detection": {...},    # Finding bugs
+    "security": {...},         # Security vulnerabilities
+    "test_generation": {...},  # Creating tests
+    "documentation": {...},    # Generating docs
+    "research": {...},         # Research/understanding code
+    "generation": {...},       # Code generation
+}
+```
+
+**Key Settings:**
+- `temperature`: How creative (0.1=focused, 0.7=creative)
+- `max_tokens`: Response length limit
+- `confidence_threshold`: How certain the AI must be
+- `similarity_threshold`: For deduplication (85%)
+
+---
+
+## 📝 Documentation: `docs/`
+
+| File | Contains |
+|------|----------|
+| **GITHUB_PR_INTEGRATION.md** | How agents post GitHub PR comments |
+| **GCP_DEPLOYMENT.md** | Deploy to Google Cloud |
+| **LLM_PROVIDER_GUIDE.md** | Setup OpenAI/Gemini/Bytez |
+| **LANGGRAPH_GUIDE.md** | LangGraph workflow architecture |
+| **enhanced_pr_review_example.py** | Example PR review code |
+
+---
+
+## 🚀 Main Source Code: `src/`
+
+### `src/main.py` - Entry Point
+Provides the main CLI interface:
+```python
+python -m src.main review myfile.py    # Review a file
+python -m src.main pr owner/repo 123   # Review a PR
+python -m src.main server --port 8000  # Start server
+```
+
+---
+
+## 🤖 Agents: `src/agents/`
+
+### Architecture: Hierarchical Multi-Agent System
+
+```
+OrchestratorAgent (main coordinator)
+├── CodeAnalysisAgent (4 sub-agents)
+├── BugDetectionAgent (4 sub-agents)
+├── SecurityAnalysisAgent (4 sub-agents)
+├── TestGenerationAgent
+├── DocumentationAgent
+├── ResearchAgent
+└── CodeGenerationAgent
+```
+
+### Core Files
+
+| File | Purpose |
+|------|---------|
+| **base_agent.py** | Abstract base class for all agents |
+| **code_analysis_agent.py** | Orchestrator for code quality |
+| **bug_detection_agent.py** | Orchestrator for bug finding |
+| **security_agent.py** | Orchestrator for security scanning |
+| **test_generation_agent.py** | Creates unit tests |
+| **documentation_agent.py** | Generates documentation |
+| **research_agent.py** | Context research and understanding |
+| **code_generation_agent.py** | Code generation suggestions |
+| **specialized_agent.py** | Generic specialized agent |
+| **filter_pipeline.py** | Deduplicates and validates findings |
+
+### Sub-Agents Explained
+
+#### Code Review Sub-Agents (`code_review/`)
+
+| Agent | Detects |
+|-------|---------|
+| **NamingReviewer** | Poor variable/function names, PEP 8 naming |
+| **QualityReviewer** | Complexity, best practices, anti-patterns |
+| **DuplicationDetector** | Repeated code patterns |
+| **PEP8Reviewer** | Style violations, formatting issues |
+
+#### Bug Detection Sub-Agents (`bug_detection/`)
+
+| Agent | Detects |
+|-------|---------|
+| **LogicErrorDetector** | Off-by-one errors, algorithm mistakes |
+| **EdgeCaseAnalyzer** | None checks, boundary conditions |
+| **TypeErrorDetector** | Type mismatches, type safety issues |
+| **RuntimeIssueDetector** | Resource leaks, performance issues |
+
+#### Security Sub-Agents (`security/`)
+
+| Agent | Detects |
+|-------|---------|
+| **InjectionScanner** | SQL injection, command injection |
+| **AuthScanner** | Authentication flaws |
+| **DataExposureScanner** | Hardcoded secrets, data leaks |
+| **DependencyScanner** | Unsafe library versions |
+
+---
+
+## 🌐 API Server: `src/api/`
+
+### `src/api/server.py` - FastAPI Web Server
+
+Provides REST endpoints:
+
+```
+POST /review          - Code review task
+POST /pr-review       - GitHub PR review
+POST /webhook/github  - GitHub webhook (automatic PR reviews)
+GET  /health          - Health check
+POST /analyze         - Generic analysis
+```
+
+**Request Models:**
+```python
+ReviewRequest    # Code + task type
+PRReviewRequest  # Repo + PR number
+TaskResponse     # Status, results
+```
+
+### `src/api/webhooks.py` - GitHub Integration 🔗
+
+Handles GitHub webhooks for **automatic PR reviews**:
+
+**Commands you can use in PR comments:**
+```
+/inspectai_review    # Quick review of changed lines
+/inspectai_bugs      # Deep bug scan
+/inspectai_refactor  # Refactoring suggestions
+/inspectai_help      # Show all commands
+```
+
+**What happens:**
+1. Developer opens PR
+2. GitHub sends webhook to your server
+3. Server processes all changed files
+4. AI agents analyze the code
+5. Comments posted on the PR
+
+---
+
+## 🎭 Orchestrator: `src/orchestrator/`
+
+### `src/orchestrator/orchestrator.py` - Main Coordinator
+
+Coordinates all agents for different task types:
+
+```python
+SUPPORTED_TASKS = [
+    "code_improvement",   # CodeAnalysisAgent
+    "bug_fix",            # BugDetectionAgent
+    "security_audit",     # SecurityAnalysisAgent
+    "test_generation",    # TestGenerationAgent
+    "documentation",      # DocumentationAgent
+    "full_review",        # All agents
+    "pr_review"           # PR-specific review
+]
+```
+
+**Workflow:**
+1. Receive code + task type
+2. Select appropriate agents
+3. Run agents in parallel
+4. Aggregate and filter results
+5. Return structured findings
+
+---
+
+## 🧠 LLM Management: `src/llm/`
+
+### `src/llm/factory.py` - Provider Factory ⭐
+
+**Single point for LLM configuration:**
+```python
+def get_llm_client(temperature=0.2, max_tokens=2048):
+    # Returns appropriate client based on provider
+    # Handles OpenAI, Gemini, or Bytez
+```
+
+### `src/llm/client.py` - LLM Client
+
+Base class for all LLM interactions:
+```python
+response = client.generate(prompt, temperature=0.2)
+tokens = client.count_tokens(text)
+```
+
+### `src/llm/local_client.py` - Local Model Support
+
+Run LLMs locally without API calls:
+```python
+from src.llm.local_client import LocalLLMClient
+client = LocalLLMClient(model="mistral")
+```
+
+---
+
+## 💾 Memory System: `src/memory/`
+
+### `src/memory/agent_memory.py` - Conversation Memory
+
+Maintains conversation history:
+```python
+memory = AgentMemory(max_history=10)
+memory.add_message("user", "Analyze this")
+memory.add_message("assistant", "Analysis...")
+history = memory.get_history()
+```
+
+### `src/memory/pr_memory.py` - PR-Specific Context
+
+Stores PR findings and context:
+```python
+pr_memory = get_pr_memory(owner, repo, pr_number)
+pr_memory.add_finding(finding)
+bugs = pr_memory.get_bugs()
+```
+
+### `src/memory/vector_store.py` - Semantic Search
+
+Vector database for code context:
+```python
+vector_store.add_documents(code_chunks)
+similar = vector_store.search("authentication", top_k=5)
+```
+
+---
+
+## 🔐 GitHub Integration: `src/github/`
+
+### `src/github/client.py` - GitHub API Wrapper
+
+Functions:
+```python
+client.clone_repo(owner/repo)  # Clone repository
+files = client.get_pr_files(owner, repo, pr_num)
+client.post_review_comment(owner, repo, pr_num, comment)
+client.post_inline_comment(owner, repo, pr_num, comment, file, line)
+```
+
+---
+
+## 🛠️ Utilities: `src/utils/`
+
+| File | Purpose |
+|------|---------|
+| **logger.py** | Structured logging system |
+| **language_detection.py** | Detect code language (Python, JS, etc.) |
+
+---
+
+## 📊 LangGraph Workflows: `src/langgraph_workflows/`
+
+Advanced workflow orchestration using LangGraph:
+
+| File | Purpose |
+|------|---------|
+| **review_workflow.py** | Main PR review workflow |
+| **state.py** | Workflow state management |
+| **agent_nodes.py** | Agent nodes for workflow |
+
+---
+
+## 🧪 Tests: `tests/`
+
+| File | Tests |
+|------|-------|
+| **test_agents.py** | Individual agent tests |
+| **test_imports.py** | Import validation |
+| **test_orchestrator.py** | Orchestrator coordination |
+| **test_vector_store.py** | Vector database |
+| **test_polyglot.py** | Multi-language support |
+| **sample_code_with_issues.py** | Sample buggy code |
+
+---
+
+## 📜 Scripts: `scripts/`
+
+| Script | Purpose |
+|--------|---------|
+| **deploy_gcp.sh** | Deploy to Google Cloud Run |
+| **setup_gcp.sh** | Setup GCP environment |
+| **start_webhook_server.sh** | Start webhook server |
+
+---
+
+## 🎯 Examples: `examples/`
+
+| File | Example |
+|------|---------|
+| **langgraph_workflow_example.py** | LangGraph workflow |
+| **enhanced_pr_review_example.py** | PR review workflow |
+
+---
+
+## 📦 Deployment Files
+
+| File | Purpose |
+|------|---------|
+| **Dockerfile** | Docker container |
+| **render.yaml** | Render.com deployment config |
+| **.gcloudignore** | Google Cloud ignore patterns |
+
+---
+
+## 🔄 Data Flow Diagram
+
+```
+GitHub PR Opened
+    ↓
+[GitHub Webhook] → Server
+    ↓
+[Orchestrator] selects agents
+    ↓
+┌───────────────────────────┐
+│ Parallel Agent Execution  │
+├───────────────────────────┤
+│ CodeAnalysisAgent         │
+│ BugDetectionAgent         │
+│ SecurityAnalysisAgent     │
+│ TestGenerationAgent       │
+└───────────────────────────┘
+    ↓
+[Filter Pipeline] - Dedup & Validate
+    ↓
+[Aggregate Findings]
+    ↓
+[Format Report]
+    ↓
+[GitHub Client] - Post Comments
+    ↓
+PR Comment Posted ✅
+```
+
+---
+
+## 🚀 Quick Start
+
+1. **Setup Environment:**
+   ```bash
+   cp .env.example .env
+   # Edit .env with your API keys
+   ```
+
+2. **Test Locally:**
+   ```bash
+   python test_local_review.py
+   ```
+
+3. **Run Server:**
+   ```bash
+   uvicorn src.api.server:app --reload --port 8000
+   ```
+
+4. **Deploy:**
+   ```bash
+   ./scripts/deploy_gcp.sh
+   ```
+
+---
+
+## 📋 Configuration Priority
+
+```
+1. Environment Variables (.env)
+2. Command-line Arguments
+3. config/default_config.py
+4. Built-in defaults
+```
+
+---
+
+## 🎓 Key Concepts
+
+| Concept | Meaning |
+|---------|---------|
+| **Agent** | AI component that performs specific task |
+| **Sub-Agent** | Specialized agent that handles one aspect |
+| **Orchestrator** | Coordinates multiple agents |
+| **Filter Pipeline** | Removes duplicate/low-quality findings |
+| **Vector Store** | Semantic search database |
+| **Webhook** | GitHub notifies server of events |
+| **LLM** | Large Language Model (AI) |
+
+---
+
+## 🔍 How to Find Things
+
+| Want to... | Look in... |
+|-----------|-----------|
+| Add a new agent | `src/agents/` |
+| Change API response | `src/api/server.py` |
+| Modify PR comments | `src/api/webhooks.py` |
+| Adjust confidence threshold | `config/default_config.py` |
+| Fix GitHub auth | `src/github/client.py` |
+| Add LLM provider | `src/llm/factory.py` |
+| Update memory logic | `src/memory/` |
+
+---
+
+## ⚙️ Environment Variables Explained
+
+```env
+# LLM Provider
+GEMINI_API_KEY=your_key        # Google Gemini API
+OPENAI_API_KEY=your_key        # OpenAI GPT-4
+BYTEZ_API_KEY=your_key         # Bytez API
+
+# GitHub
+GITHUB_TOKEN=your_token        # Personal Access Token
+GITHUB_WEBHOOK_SECRET=random   # Webhook verification
+
+# Server
+PORT=8000                      # Server port
+LOG_LEVEL=INFO                 # Logging verbosity
+```
+
+---
+
+Generated: 2025-12-02
diff --git a/README.md b/README.md
index 8be32da..c4eb47b 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,8 @@ Production-grade multi-agent system for automated code review, bug detection, an
 
 **Live Demo**: Install on your repo and try `/inspectai_review` on any PR!
 
+**Testing PR Descriptions**: Push to a PR and watch the bot post a description comment with file change summaries! 🚀
+
 ---
 
 ## 🏗️ Technical Choices Summary
diff --git a/src/api/webhooks.py b/src/api/webhooks.py
index a5c50d0..a5a031a 100644
--- a/src/api/webhooks.py
+++ b/src/api/webhooks.py
@@ -381,144 +381,92 @@ async def process_pr_review(
     logger.info(f"Processing PR review for {repo_full_name}#{pr_number} (action: {action})")
 
+    # Only process on PR open/push/reopen
+    if action not in ["opened", "synchronize", "reopened"]:
+        return {
+            "status": "ignored",
+            "message": "PR action does not trigger review"
+        }
+
     try:
-        # Check rate limit before starting expensive operations
-        try:
-            github_check = GitHubClient.from_installation(installation_id) if installation_id else GitHubClient()
-            rate_status = github_check.get_rate_limit_status()
-            remaining = rate_status.get('remaining', 0)
-
-            if remaining < 50:  # Need at least 50 API calls for a PR review
-                reset_time = rate_status.get('reset', 0)
-                wait_until = datetime.fromtimestamp(reset_time).strftime('%H:%M:%S') if reset_time else 'unknown'
-                logger.warning(
-                    f"GitHub API rate limit too low ({remaining} remaining). "
-                    f"Skipping PR review for {repo_full_name}#{pr_number}. "
-                    f"Rate limit resets at {wait_until}"
-                )
-                return {
-                    "status": "rate_limited",
-                    "message": f"GitHub API rate limit too low ({remaining} remaining). Will retry after reset.",
-                    "reset_at": reset_time
-                }
-        except Exception as e:
-            logger.warning(f"Could not check rate limit: {e}. 
Proceeding anyway...") + # Get PR details and files + github_client = GitHubClient(token=os.getenv("GITHUB_TOKEN")) + pr = github_client.get_pull_request(repo_full_name, pr_number) - # Initialize orchestrator - config = copy.deepcopy(ORCHESTRATOR_CONFIG) - from config.default_config import DEFAULT_PROVIDER, GEMINI_MODEL, BYTEZ_MODEL, OPENAI_MODEL - provider = os.getenv("LLM_PROVIDER", DEFAULT_PROVIDER) + # Generate PR description with LLM explanations + logger.info(f"Generating PR description for {repo_full_name}#{pr_number}") + from src.utils.pr_description_generator import PRDescriptionGenerator, FileChange, analyze_diff_with_llm - # Set model based on provider - model_map = { - "gemini": GEMINI_MODEL, - "bytez": BYTEZ_MODEL, - "openai": OPENAI_MODEL - } + # Prepare FileChange objects with LLM-powered explanations + files_changed = [] + for pr_file in pr.files: + file_change = FileChange( + filename=pr_file.filename, + status=pr_file.status, + additions=pr_file.additions, + deletions=pr_file.deletions, + changes=pr_file.additions + pr_file.deletions, + ) + + # Get LLM explanation for the diff (if available) + if pr_file.patch and pr_file.status == "modified": + try: + logger.info(f"[PR_DESC] Analyzing diff for {pr_file.filename}...") + explanation = analyze_diff_with_llm( + pr_file.filename, + pr_file.patch, + llm_client=None # Will use default client + ) + file_change.explanation = explanation + logger.info(f"[PR_DESC] Got explanation: {explanation[:80]}...") + except Exception as e: + logger.warning(f"[PR_DESC] LLM analysis failed for {pr_file.filename}: {e}") + file_change.explanation = f"Modified {pr_file.filename}" + elif pr_file.status == "added": + file_change.explanation = f"New file with {pr_file.additions} lines" + + files_changed.append(file_change) + + # Generate changelog-style description with LLM explanations + pr_generator = PRDescriptionGenerator() + generated_description = pr_generator.generate_changelog_description( + files_changed=files_changed, + pr_title=pr.title + ) - for key in config: - if isinstance(config[key], dict): - config[key]["provider"] = provider - config[key]["model"] = model_map.get(provider, GEMINI_MODEL) + logger.info(f"Generated PR description for {repo_full_name}#{pr_number}") - orchestrator = OrchestratorAgent(config) + # Add action-specific header to clarify what changes this describes + action_emoji = "๐Ÿ“‚" if action == "opened" else "โšก" if action == "synchronize" else "๐Ÿ”„" + action_text = "Initial submission" if action == "opened" else "Latest push" if action == "synchronize" else "Reopened" + + description_with_context = f"{action_emoji} **{action_text}**\n\n{generated_description}" + # Post PR description as a comment instead of updating PR body try: - # Run PR review - task = { - "type": "pr_review", - "input": { - "repo_url": repo_full_name, - "pr_number": pr_number, - "post_comments": True # Auto-post review comments - } - } - - result = orchestrator.process_task(task) - logger.info(f"PR review completed for {repo_full_name}#{pr_number}") - - # Generate PR description if PR just opened - if action == "opened": - try: - logger.info(f"Generating PR description for {repo_full_name}#{pr_number}") - - # Get PR files and changes - github_client = GitHubClient() - pr = github_client.get_pull_request(repo_full_name, pr_number) - - # Build code changes data for PR description generator - code_changes = [] - for pr_file in pr.files: - code_changes.append({ - "filename": pr_file.filename, - "status": pr_file.status, - "additions": 
pr_file.additions, - "deletions": pr_file.deletions - }) - - # Extract bugs and analysis from the review result - bugs_data = result.get("bug_detection", {}) if isinstance(result, dict) else {} - analysis_data = result.get("analysis", {}) if isinstance(result, dict) else {} - - # Prepare input for PR description generator - description_input = { - "code_changes": code_changes, - "bugs": { - "bug_count": bugs_data.get("bug_count", 0) if isinstance(bugs_data, dict) else 0, - "bugs": bugs_data.get("bugs", []) if isinstance(bugs_data, dict) else [] - }, - "security": result.get("security", {}) if isinstance(result, dict) else {}, - "analysis": { - "suggestions": analysis_data.get("suggestions", []) if isinstance(analysis_data, dict) else [] - } - } - - # Generate description - pr_description_result = orchestrator.agents["pr_description"].process(description_input) - - if pr_description_result.get("status") == "success": - generated_title = pr_description_result.get("title", "") - generated_description = pr_description_result.get("description", "") - pr_type = pr_description_result.get("pr_type", "general") - - logger.info(f"Generated PR description: {pr_type}") - logger.info(f"Generated title: {generated_title}") - - # Update PR description on GitHub - try: - github_client.update_pr_body( - repo_full_name, - pr_number, - generated_description - ) - logger.info(f"Updated PR description for {repo_full_name}#{pr_number}") - result["pr_description"] = { - "status": "updated", - "title": generated_title, - "type": pr_type - } - except Exception as e: - logger.warning(f"Failed to update PR description: {e}") - result["pr_description"] = { - "status": "generated_not_posted", - "title": generated_title, - "type": pr_type, - "error": str(e) - } - else: - logger.warning(f"Failed to generate PR description: {pr_description_result.get('error')}") - - except Exception as e: - logger.warning(f"Error generating PR description: {e}", exc_info=True) - - return result - - finally: - orchestrator.cleanup() - + github_client.post_pr_comment( + repo_full_name, + pr_number, + description_with_context + ) + logger.info(f"Posted PR description as comment for {repo_full_name}#{pr_number}") + except Exception as e: + logger.warning(f"Failed to post PR description comment: {e}") + + logger.info(f"PR description complete for {repo_full_name}#{pr_number}") + + return { + "status": "success", + "message": "PR description generated successfully", + "pr_number": pr_number + } + except Exception as e: - logger.error(f"PR review failed for {repo_full_name}#{pr_number}: {e}", exc_info=True) - return {"status": "error", "error": str(e)} + logger.error(f"Error processing PR review: {e}", exc_info=True) + return { + "status": "error", + "message": f"PR review processing failed: {str(e)}" + } async def handle_agent_command( diff --git a/src/github/client.py b/src/github/client.py index a054f3a..a13d460 100644 --- a/src/github/client.py +++ b/src/github/client.py @@ -904,11 +904,26 @@ def update_pr_body(self, repo_url: str, pr_number: int, body: str) -> Dict[str, owner, repo = self._parse_repo_url(repo_url) logger.info(f"Updating PR description for {owner}/{repo}#{pr_number}") + logger.debug(f"Using token: {self.token[:20] if self.token else 'None'}...") - return self._api_put( - f"repos/{owner}/{repo}/pulls/{pr_number}", - {"body": body} - ) + # Use PATCH for updating PR body + url = f"{self.BASE_URL}/repos/{owner}/{repo}/pulls/{pr_number}" + logger.debug(f"PATCH URL: {url}") + + try: + response = self.session.patch(url, 
json={"body": body}, timeout=30) + logger.debug(f"Response status: {response.status_code}") + logger.debug(f"Response headers: {dict(response.headers)}") + response.raise_for_status() + logger.info(f"Successfully updated PR description") + return response.json() + except Exception as e: + logger.error(f"Failed to update PR: {e}") + if 'response' in locals(): + logger.error(f"Response status: {response.status_code}") + logger.error(f"Response text: {response.text}") + logger.error(f"Response headers: {dict(response.headers)}") + raise def cleanup(self) -> None: """Clean up temporary directories.""" diff --git a/src/utils/pr_description_generator.py b/src/utils/pr_description_generator.py new file mode 100644 index 0000000..ac0fcbf --- /dev/null +++ b/src/utils/pr_description_generator.py @@ -0,0 +1,404 @@ +""" +PR Description Generator - Automatically generates GitHub PR descriptions. + +Generates human-readable summaries of PR changes in the style of GitHub Copilot AI, +with LLM-powered analysis to explain the logical changes. + +Features: +- What changed (files modified, added, removed) +- Why it changed (LLM analyzes diffs to explain logic changes) +- Key statistics (additions, deletions, files touched) +- Human-readable explanations of each file's changes +- Clear formatting similar to GitHub's PR review style +""" + +from typing import List, Dict, Any, Optional +import re +from dataclasses import dataclass +import logging + +logger = logging.getLogger(__name__) + + +@dataclass +class FileChange: + """Represents a file change in the PR.""" + filename: str + status: str # "added", "modified", "removed" + additions: int + deletions: int + changes: int + diff: Optional[str] = None # The actual diff content for LLM analysis + explanation: Optional[str] = None # LLM-generated explanation + + +def analyze_diff_with_llm(filename: str, diff: str, llm_client=None) -> str: + """ + Analyze a code diff using LLM to generate human-readable explanation. + + Args: + filename: The name of the changed file + diff: The git diff content + llm_client: Optional LLM client (uses Gemini by default) + + Returns: + Human-readable explanation of the changes + """ + if not diff or not diff.strip(): + return "No diff available" + + try: + # Import here to avoid circular imports + from src.llm.factory import get_llm_client + + if llm_client is None: + llm_client = get_llm_client() + + # Create prompt for diff analysis + prompt = f"""Analyze this code diff and provide a brief, human-readable explanation (1-2 sentences max) of what changed and why. + +File: {filename} + +Diff: +```diff +{diff[:2000]} +``` + +Focus on: +- What functionality changed +- Any significant logic changes +- Why this change was likely made + +Keep it concise and technical. 
Don't mention file stats.""" + + # Call LLM using chat method (synchronous) + messages = [{"role": "user", "content": prompt}] + response = llm_client.chat( + messages=messages, + max_tokens=200, + temperature=0.3 + ) + + explanation = response.strip() if response else "Changes to this file" + logger.info(f"[PR_DESC] LLM analysis for {filename}: {explanation[:100]}...") + return explanation + + except Exception as e: + logger.warning(f"[PR_DESC] LLM analysis failed for {filename}: {e}") + return f"Modified {filename}" + + +class PRDescriptionGenerator: + """Generates GitHub PR descriptions in Copilot AI style.""" + + def __init__(self): + """Initialize the PR description generator.""" + self.file_categories = { + "tests": [".test.py", ".spec.py", "test_", "_test.py", "tests/"], + "docs": [".md", ".rst", ".txt", "docs/", "README", "CHANGELOG"], + "config": ["config/", ".yml", ".yaml", ".json", ".toml", ".cfg", "setup.py", "package.json"], + "ci": [".github/", ".gitlab-ci.yml", "Jenkinsfile", ".circleci"], + "types": [".pyi", "py.typed"], + } + + def categorize_file(self, filename: str) -> str: + """Categorize a file by type.""" + filename_lower = filename.lower() + + for category, patterns in self.file_categories.items(): + if any(pattern in filename_lower for pattern in patterns): + return category + + # Determine by extension + if filename.endswith(".py"): + return "python" + elif filename.endswith((".js", ".ts", ".jsx", ".tsx")): + return "javascript" + elif filename.endswith((".java", ".kt")): + return "java" + elif filename.endswith((".go",)): + return "go" + elif filename.endswith((".rb",)): + return "ruby" + else: + return "other" + + def extract_key_functions(self, files_changed: List[FileChange], limit: int = 3) -> List[str]: + """Extract key changed files (modified/removed, not tests/docs).""" + main_files = [ + f.filename for f in files_changed + if f.status in ["modified", "removed"] and self.categorize_file(f.filename) not in ["tests", "docs", "config"] + ] + return main_files[:limit] + + def generate_description( + self, + pr_title: str, + pr_body: Optional[str], + files_changed: List[FileChange], + commit_messages: Optional[List[str]] = None, + ) -> str: + """ + Generate a PR description in Copilot AI style. + + Args: + pr_title: The PR title + pr_body: Existing PR body/description (optional) + files_changed: List of FileChange objects + commit_messages: List of commit messages for context + + Returns: + Formatted PR description string + """ + parts = [] + + # 1. Pull request overview with main files + key_files = self.extract_key_functions(files_changed) + + overview = self._generate_overview(pr_title, key_files, files_changed) + parts.append(overview) + + # 2. Key Changes section + key_changes = self._generate_key_changes(files_changed) + if key_changes: + parts.append("\n## Key Changes\n") + parts.append(key_changes) + + # 3. File Summary (breakdown by type) + file_summary = self._generate_file_summary(files_changed) + if file_summary: + parts.append("\n## Files Changed\n") + parts.append(file_summary) + + # 4. Statistics + stats = self._generate_statistics(files_changed) + parts.append("\n## Statistics\n") + parts.append(stats) + + # 5. 
Testing considerations (if tests were modified) + if any(f.status == "added" and "test" in f.filename.lower() for f in files_changed): + parts.append("\n## Testing\n") + parts.append("Tests have been added to verify the changes.\n") + + return "".join(parts) + + def _generate_overview( + self, + pr_title: str, + key_files: List[str], + files_changed: List[FileChange], + ) -> str: + """Generate the overview section.""" + total_files = len(files_changed) + added_files = sum(1 for f in files_changed if f.status == "added") + modified_files = sum(1 for f in files_changed if f.status == "modified") + removed_files = sum(1 for f in files_changed if f.status == "removed") + + overview = f"# {pr_title}\n\n" + overview += "## Pull request overview\n" + + # Main description + if key_files: + file_list = ", ".join([f"`{f}`" for f in key_files]) + overview += f"This PR updates {file_list}" + else: + overview += "This PR makes updates to the codebase" + + # Summary stats + changes = [] + if modified_files > 0: + changes.append(f"modifying {modified_files} file{'s' if modified_files != 1 else ''}") + if added_files > 0: + changes.append(f"adding {added_files} new file{'s' if added_files != 1 else ''}") + if removed_files > 0: + changes.append(f"removing {removed_files} file{'s' if removed_files != 1 else ''}") + + if changes: + overview += ", " + ", ".join(changes) + "." + else: + overview += "." + + overview += f"\n\n" + + return overview + + def _generate_key_changes(self, files_changed: List[FileChange]) -> str: + """Generate the Key Changes section with LLM explanations.""" + # Group changes by type + added = [f for f in files_changed if f.status == "added"] + modified = [f for f in files_changed if f.status == "modified"] + removed = [f for f in files_changed if f.status == "removed"] + + changes_lines = [] + + if modified: + changes_lines.append("**Modified files:**") + for f in modified[:5]: # Show top 5 + explanation = f.explanation or f"Modified `{f.filename}`" + changes_lines.append(f"- `{f.filename}` (+{f.additions}/-{f.deletions})") + changes_lines.append(f" - {explanation}") + if len(modified) > 5: + changes_lines.append(f"- ...and {len(modified) - 5} more modified files") + + if added: + if changes_lines: + changes_lines.append("") + changes_lines.append("**Added files:**") + for f in added[:5]: + explanation = f.explanation or f"New file with {f.additions} lines" + changes_lines.append(f"- `{f.filename}` ({f.additions} lines)") + if explanation and "new" not in explanation.lower(): + changes_lines.append(f" - {explanation}") + if len(added) > 5: + changes_lines.append(f"- ...and {len(added) - 5} more new files") + + if removed: + if changes_lines: + changes_lines.append("") + changes_lines.append("**Removed files:**") + for f in removed[:5]: + changes_lines.append(f"- `{f.filename}`") + if len(removed) > 5: + changes_lines.append(f"- ...and {len(removed) - 5} more removed files") + + return "\n".join(changes_lines) if changes_lines else "" + + def _generate_file_summary(self, files_changed: List[FileChange]) -> str: + """Generate file category summary.""" + categories = {} + for f in files_changed: + cat = self.categorize_file(f.filename) + if cat not in categories: + categories[cat] = [] + categories[cat].append(f) + + summary_lines = [] + + for category in ["python", "javascript", "java", "go", "ruby", "tests", "docs", "config", "ci", "other"]: + if category in categories: + files = categories[category] + count = len(files) + + # Calculate totals for this category + total_add = 
sum(f.additions for f in files) + total_del = sum(f.deletions for f in files) + + cat_name = category.capitalize() + if category == "tests": + cat_name = "Tests" + elif category == "docs": + cat_name = "Documentation" + elif category == "config": + cat_name = "Configuration" + elif category == "ci": + cat_name = "CI/CD" + elif category == "javascript": + cat_name = "JavaScript/TypeScript" + + summary_lines.append(f"- **{cat_name}**: {count} file{'s' if count != 1 else ''} (+{total_add}/-{total_del})") + + return "\n".join(summary_lines) if summary_lines else "" + + def _generate_statistics(self, files_changed: List[FileChange]) -> str: + """Generate statistics section.""" + total_files = len(files_changed) + total_additions = sum(f.additions for f in files_changed) + total_deletions = sum(f.deletions for f in files_changed) + + added_files = sum(1 for f in files_changed if f.status == "added") + modified_files = sum(1 for f in files_changed if f.status == "modified") + removed_files = sum(1 for f in files_changed if f.status == "removed") + + stats = f"""| Metric | Value | +|--------|-------| +| **Files changed** | {total_files} | +| **Files added** | {added_files} | +| **Files modified** | {modified_files} | +| **Files removed** | {removed_files} | +| **Total additions** | +{total_additions} | +| **Total deletions** | -{total_deletions} | +| **Net change** | +{total_additions - total_deletions} | +""" + return stats + + def generate_changelog_description( + self, + files_changed: List[FileChange], + pr_title: Optional[str] = None, + ) -> str: + """ + Generate a changelog-style PR description with LLM explanations. + + Suitable for automatically generated descriptions that focus on: + - What files were modified/added/removed + - Line statistics + - Human-readable explanations of what changed logically + - Clean summary suitable for release notes + + Args: + files_changed: List of FileChange objects + pr_title: Optional PR title to include + + Returns: + Changelog-style description with LLM explanations + """ + parts = [] + + if pr_title: + parts.append(f"## {pr_title}\n") + + parts.append("### Modified\n") + modified = [f for f in files_changed if f.status == "modified"] + if modified: + for f in modified: + parts.append(f"- `{f.filename}` (+{f.additions}/-{f.deletions})\n") + if f.explanation: + parts.append(f" > {f.explanation}\n") + else: + parts.append("_No files modified_\n") + + parts.append("\n### Added\n") + added = [f for f in files_changed if f.status == "added"] + if added: + for f in added: + parts.append(f"- `{f.filename}` ({f.additions} lines)\n") + if f.explanation: + parts.append(f" > {f.explanation}\n") + else: + parts.append("_No new files_\n") + + parts.append("\n### Removed\n") + removed = [f for f in files_changed if f.status == "removed"] + if removed: + for f in removed: + parts.append(f"- `{f.filename}`\n") + else: + parts.append("_No files removed_\n") + + # Summary + total_add = sum(f.additions for f in files_changed) + total_del = sum(f.deletions for f in files_changed) + total_files = len(files_changed) + + parts.append(f"\n### Summary\n") + parts.append(f"- **{total_files}** files changed\n") + parts.append(f"- **+{total_add}** additions\n") + parts.append(f"- **-{total_del}** deletions\n") + + return "".join(parts) + + +def format_file_change( + filename: str, + status: str, + additions: int = 0, + deletions: int = 0, +) -> FileChange: + """Helper to create FileChange objects.""" + return FileChange( + filename=filename, + status=status, + additions=additions, 
+ deletions=deletions, + changes=additions + deletions, + )
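
Reviewer note (not part of the diff above): the new `pr_description_generator` module can be exercised in isolation, without GitHub access or an LLM provider, by driving it directly. The sketch below is illustrative only; the file names and line counts are made up, it assumes it is run from the repository root so that `src.utils` is importable, and `explanation` is filled in by hand instead of calling `analyze_diff_with_llm()`, which would require a configured LLM.

```python
# Minimal local sketch of the new PR description generator (illustrative values).
from src.utils.pr_description_generator import (
    PRDescriptionGenerator,
    format_file_change,
)

# Hypothetical file changes; in the webhook flow these come from the GitHub PR files.
files = [
    format_file_change("src/api/webhooks.py", "modified", additions=92, deletions=144),
    format_file_change("src/utils/pr_description_generator.py", "added", additions=404),
]

# `explanation` is normally produced by analyze_diff_with_llm(); set by hand here
# so the sketch runs without any LLM provider configured.
files[0].explanation = "Reworks process_pr_review to post a generated description comment"

generator = PRDescriptionGenerator()
print(generator.generate_changelog_description(files, pr_title="Add PR description generator"))
```

The same `FileChange` list could also be passed to `generate_description()` for the fuller overview/statistics format used elsewhere in the module.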