From d25946002ba0acaabb91683b7e9feb5e0953c336 Mon Sep 17 00:00:00 2001 From: Himanshu Jhawar Date: Tue, 9 Dec 2025 09:05:49 -0500 Subject: [PATCH] Add benchmark test files with 27 seeded bugs for InspectAI evaluation --- tests/benchmark/SCORING_GUIDE.md | 117 +++++++++++ tests/benchmark/seeded_bugs_api.py | 205 ++++++++++++++++++ tests/benchmark/seeded_bugs_python.py | 291 ++++++++++++++++++++++++++ 3 files changed, 613 insertions(+) create mode 100644 tests/benchmark/SCORING_GUIDE.md create mode 100644 tests/benchmark/seeded_bugs_api.py create mode 100644 tests/benchmark/seeded_bugs_python.py diff --git a/tests/benchmark/SCORING_GUIDE.md b/tests/benchmark/SCORING_GUIDE.md new file mode 100644 index 0000000..5cfdc04 --- /dev/null +++ b/tests/benchmark/SCORING_GUIDE.md @@ -0,0 +1,117 @@ +# BENCHMARK SCORING GUIDE +# ======================= +# +# This document tracks the seeded bugs and InspectAI's detection performance. +# Run the InspectAI commands on the PR and fill in the results below. 
+ +## Seeded Bugs Summary + +| File | Total Bugs | Security | Logic | Resource | Concurrency | Error Handling | +|------|------------|----------|-------|----------|-------------|----------------| +| seeded_bugs_python.py | 15 | 5 | 7 | 1 | 1 | 1 | +| seeded_bugs_api.py | 12 | 7 | 4 | 0 | 1 | 0 | +| **TOTAL** | **27** | **12** | **11** | **1** | **2** | **1** | + +--- + +## Detailed Bug List (Ground Truth) + +### File 1: seeded_bugs_python.py + +| # | Bug Type | Severity | Category | Line | Description | +|---|----------|----------|----------|------|-------------| +| 1 | SQL Injection | HIGH | Security | 31 | User input directly in SQL query | +| 2 | Hardcoded Secret | CRITICAL | Security | 41-42 | API key and password in source | +| 3 | Off-by-One | MEDIUM | Logic | 56 | range(len+1) causes IndexError | +| 4 | Missing Null Check | MEDIUM | Logic | 66 | No None check before access | +| 5 | Resource Leak | MEDIUM | Resource | 76 | File never closed | +| 6 | Wrong Operator | HIGH | Logic | 86 | Using 'is' instead of '==' | +| 7 | Wrong Formula | LOW | Logic | 99 | Dividing by (n-1) not n | +| 8 | Race Condition | HIGH | Concurrency | 107 | Non-atomic counter increment | +| 9 | XSS | HIGH | Security | 118 | Unescaped user input in HTML | +| 10 | Weak Crypto | CRITICAL | Security | 131 | MD5 for password hashing | +| 11 | Missing Return | MEDIUM | Logic | 141 | No return True for valid case | +| 12 | Mutable Default | MEDIUM | Logic | 152 | Default arg `[]` persists | +| 13 | Unhandled Exception | MEDIUM | Error | 160 | No try-except for json.loads | +| 14 | Path Traversal | HIGH | Security | 168 | User can access any file | +| 15 | Infinite Loop | HIGH | Logic | 181 | Binary search doesn't converge | + +### File 2: seeded_bugs_api.py + +| # | Bug Type | Severity | Category | Line | Description | +|---|----------|----------|----------|------|-------------| +| 16 | Command Injection | CRITICAL | Security | 29 | shell=True with user input | +| 17 | Insecure Deserialize | 
CRITICAL | Security | 41 | pickle.loads on untrusted data | +| 18 | Weak Session | HIGH | Security | 51 | Predictable session tokens | +| 19 | Missing AuthZ | HIGH | Security | 62 | No permission check for delete | +| 20 | ReDoS | MEDIUM | Security | 74 | Evil regex pattern | +| 21 | Float Comparison | MEDIUM | Logic | 88 | Using == for floats | +| 22 | TOCTOU | HIGH | Concurrency | 96 | Check-then-use race condition | +| 23 | Precision Loss | MEDIUM | Logic | 105 | int() truncates cents | +| 24 | Weak Regex | LOW | Logic | 116 | Email regex too permissive | +| 25 | Type Confusion | MEDIUM | Logic | 123 | No type validation before divide | +| 26 | Timing Attack | MEDIUM | Security | 126 | String compare short-circuits | +| 27 | Info Disclosure | LOW | Security | 137 | Error reveals username exists | + +--- + +## Testing Procedure + +1. Open PR from `test-benchmark` branch to `main` +2. Run these commands and record findings: + +### Command 1: `/inspectai_review` +- [ ] Run command +- Findings count: ___ +- True Positives: ___ +- False Positives: ___ + +### Command 2: `/inspectai_bugs` +- [ ] Run command +- Findings count: ___ +- True Positives: ___ +- False Positives: ___ + +### Command 3: `/inspectai_security` +- [ ] Run command +- Findings count: ___ +- True Positives: ___ +- False Positives: ___ + +--- + +## Scoring Template + +After running commands, fill in this table: + +| Command | Bugs Found | True Positives | False Positives | Recall | Precision | +|---------|------------|----------------|-----------------|--------|-----------| +| /inspectai_review | | | | | | +| /inspectai_bugs | | | | | | +| /inspectai_security | | | | | | + +### Formulas: +- **Recall** = True Positives / Total Seeded Bugs (27) +- **Precision** = True Positives / (True Positives + False Positives) +- **F1 Score** = 2 × (Precision × Recall) / (Precision + Recall) + +--- + +## Categories Breakdown (After Testing) + +| Category | Total | Found | Recall | 
+|----------|-------|-------|--------| +| Security (CRITICAL) | 4 | | | +| Security (HIGH) | 5 | | | +| Security (MEDIUM) | 2 | | | +| Security (LOW) | 1 | | | +| Logic Errors | 11 | | | +| Resource Leaks | 1 | | | +| Concurrency | 2 | | | +| Error Handling | 1 | | | + +--- + +## Notes + + diff --git a/tests/benchmark/seeded_bugs_api.py b/tests/benchmark/seeded_bugs_api.py new file mode 100644 index 0000000..81680e9 --- /dev/null +++ b/tests/benchmark/seeded_bugs_api.py @@ -0,0 +1,205 @@ +""" +BENCHMARK TEST FILE #2 - API/Web Application Bugs +================================================= +This file simulates a web API with common security and logic bugs. +Contains 12 additional seeded bugs. + +DO NOT FIX THESE BUGS - They are intentional for benchmarking purposes. +================================================= +""" + +from typing import Optional, Dict, List, Any +import re +import pickle +import subprocess + + +class UserService: + """Service for managing users.""" + + def __init__(self): + self.users: Dict[str, Dict] = {} + self.session_tokens: Dict[str, str] = {} + + # ========================================================================= + # BUG #16: Command Injection (SECURITY - CRITICAL) + # User input passed directly to shell command + # ========================================================================= + def ping_server(self, hostname: str) -> str: + """Ping a server to check if it's online.""" + # BUG: Command injection - hostname not sanitized + result = subprocess.run( + f"ping -c 1 {hostname}", + shell=True, # BUG: shell=True with user input + capture_output=True, + text=True + ) + return result.stdout + + # ========================================================================= + # BUG #17: Insecure Deserialization (SECURITY - CRITICAL) + # Using pickle to deserialize untrusted data + # ========================================================================= + def load_user_preferences(self, data: bytes) -> dict: + """Load user 
preferences from serialized data.""" + # BUG: Pickle deserialization of untrusted data - RCE vulnerability + return pickle.loads(data) + + # ========================================================================= + # BUG #18: Broken Authentication (SECURITY - HIGH) + # Weak session token generation + # ========================================================================= + def create_session(self, user_id: str) -> str: + """Create a session token for a user.""" + # BUG: Predictable session token based on user_id + import time + token = f"{user_id}_{int(time.time())}" # Easily guessable! + self.session_tokens[token] = user_id + return token + + # ========================================================================= + # BUG #19: Missing Authorization Check (SECURITY - HIGH) + # Any user can delete any other user + # ========================================================================= + def delete_user(self, target_user_id: str, requesting_user_id: str) -> bool: + """Delete a user account.""" + # BUG: No authorization check - any user can delete any user + if target_user_id in self.users: + del self.users[target_user_id] + return True + return False + + # ========================================================================= + # BUG #20: ReDoS Vulnerability (SECURITY - MEDIUM) + # Regex pattern vulnerable to catastrophic backtracking + # ========================================================================= + def validate_input(self, text: str) -> bool: + """Validate user input format.""" + # BUG: ReDoS - evil regex with nested quantifiers + pattern = r"^(a+)+$" + return bool(re.match(pattern, text)) + + +class PaymentProcessor: + """Service for processing payments.""" + + def __init__(self): + self.transactions: List[Dict] = [] + + # ========================================================================= + # BUG #21: Floating Point Comparison (LOGIC - MEDIUM) + # Comparing floats for equality + # 
========================================================================= + def verify_payment(self, expected: float, received: float) -> bool: + """Verify that the received payment matches expected amount.""" + # BUG: Floating point comparison - 0.1 + 0.2 != 0.3 + return expected == received + + # ========================================================================= + # BUG #22: TOCTOU Race Condition (CONCURRENCY - HIGH) + # Time-of-check to time-of-use vulnerability + # ========================================================================= + def process_withdrawal(self, account_id: str, amount: float, balance: Dict[str, float]) -> bool: + """Process a withdrawal if sufficient balance exists.""" + # BUG: TOCTOU - balance can change between check and update + if balance.get(account_id, 0) >= amount: + # Gap here where another thread could modify balance + balance[account_id] -= amount + return True + return False + + # ========================================================================= + # BUG #23: Precision Loss (LOGIC - MEDIUM) + # Fractional part of the fee is truncated by int() instead of being rounded + # ========================================================================= + def calculate_total_with_fee(self, amount: int, fee_percent: int) -> int: + """Calculate total amount including fee.""" + # BUG: The fee is computed as a float and then truncated by int() + # Python ints cannot overflow; the defect is the lost fractional cents + fee = amount * fee_percent / 100 + return int(amount + fee) # Precision loss! 
+ + +class DataValidator: + """Utility class for data validation.""" + + # ========================================================================= + # BUG #24: Incorrect Regex for Email (LOGIC - LOW) + # Overly permissive email regex + # ========================================================================= + def is_valid_email(self, email: str) -> bool: + """Check if email is valid.""" + # BUG: Overly simple regex - accepts invalid emails like "a@b" + pattern = r".+@.+" + return bool(re.match(pattern, email)) + + # ========================================================================= + # BUG #25: Type Confusion (LOGIC - MEDIUM) + # Not validating input type before operations + # ========================================================================= + def safe_divide(self, a: Any, b: Any) -> float: + """Safely divide two numbers.""" + # BUG: No type validation - non-numeric input such as strings is not rejected + if b == 0: + return 0.0 + return a / b # Will raise TypeError if a or b is not a number + + +# ============================================================================= +# Additional standalone functions with bugs +# ============================================================================= + +# ========================================================================= +# BUG #26: Timing Attack Vulnerability (SECURITY - MEDIUM) +# String comparison short-circuits on mismatch +# ========================================================================= +def verify_api_key(provided_key: str, stored_key: str) -> bool: + """Verify an API key.""" + # BUG: Timing attack - comparison short-circuits + return provided_key == stored_key # Should use hmac.compare_digest + + +# ========================================================================= +# BUG #27: Improper Error Message (SECURITY - LOW) +# Leaking sensitive information in error message +# ========================================================================= +def authenticate_user(username: str, 
password: str, users_db: Dict) -> Dict: + """Authenticate a user and return their profile.""" + user = users_db.get(username) + + if not user: + # BUG: Information disclosure - reveals if username exists + raise ValueError(f"User '{username}' does not exist") + + if user["password"] != password: + # BUG: Should not differentiate between bad user and bad password + raise ValueError("Incorrect password") + + return user + + +# ============================================================================= +# GROUND TRUTH - Bug Summary for Evaluation +# ============================================================================= +SEEDED_BUGS_FILE2 = { + "security": [ + {"id": 16, "type": "Command Injection", "severity": "CRITICAL", "line": 29}, + {"id": 17, "type": "Insecure Deserialization", "severity": "CRITICAL", "line": 41}, + {"id": 18, "type": "Weak Session Token", "severity": "HIGH", "line": 51}, + {"id": 19, "type": "Missing Authorization", "severity": "HIGH", "line": 62}, + {"id": 20, "type": "ReDoS", "severity": "MEDIUM", "line": 74}, + {"id": 26, "type": "Timing Attack", "severity": "MEDIUM", "line": 126}, + {"id": 27, "type": "Information Disclosure", "severity": "LOW", "line": 137}, + ], + "logic": [ + {"id": 21, "type": "Float Comparison", "severity": "MEDIUM", "line": 88}, + {"id": 23, "type": "Precision Loss", "severity": "MEDIUM", "line": 105}, + {"id": 24, "type": "Weak Regex", "severity": "LOW", "line": 116}, + {"id": 25, "type": "Type Confusion", "severity": "MEDIUM", "line": 123}, + ], + "concurrency": [ + {"id": 22, "type": "TOCTOU Race", "severity": "HIGH", "line": 96}, + ], +} + +TOTAL_BUGS_FILE2 = 12 diff --git a/tests/benchmark/seeded_bugs_python.py b/tests/benchmark/seeded_bugs_python.py new file mode 100644 index 0000000..640bac2 --- /dev/null +++ b/tests/benchmark/seeded_bugs_python.py @@ -0,0 +1,291 @@ +""" +BENCHMARK TEST FILE - Contains intentionally seeded bugs for testing InspectAI 
+================================================================================ +This file contains 15 seeded bugs across different categories: +- Security vulnerabilities (SQL injection, hardcoded secrets, XSS) +- Logic errors (off-by-one, wrong operator, missing return) +- Null/None handling issues +- Resource leaks +- Race conditions +- Type errors + +DO NOT FIX THESE BUGS - They are intentional for benchmarking purposes. +================================================================================ +""" + +import os +import sqlite3 +import hashlib +import threading +from typing import List, Optional, Dict, Any + + +# ============================================================================= +# BUG #1: SQL Injection Vulnerability (SECURITY - HIGH) +# The user_id is directly interpolated into the SQL query +# ============================================================================= +def get_user_by_id(user_id: str) -> dict: + """Fetch a user from the database by their ID.""" + conn = sqlite3.connect("users.db") + cursor = conn.cursor() + + # BUG: SQL Injection - user_id is not parameterized + query = f"SELECT * FROM users WHERE id = '{user_id}'" + cursor.execute(query) + + result = cursor.fetchone() + conn.close() + return {"id": result[0], "name": result[1]} if result else None + + +# ============================================================================= +# BUG #2: Hardcoded Secret/API Key (SECURITY - CRITICAL) +# API keys should never be hardcoded in source code +# ============================================================================= +API_KEY = "sk-live-abc123def456ghi789jkl012mno345pqr678" +DATABASE_PASSWORD = "super_secret_password_123!" 
+ +def make_api_request(endpoint: str) -> dict: + """Make an authenticated API request.""" + headers = {"Authorization": f"Bearer {API_KEY}"} + # Simulated request + return {"status": "ok", "endpoint": endpoint} + + +# ============================================================================= +# BUG #3: Off-by-One Error (LOGIC - MEDIUM) +# Loop should use range(len(items)) or enumerate, not len(items) + 1 +# ============================================================================= +def process_items(items: List[str]) -> List[str]: + """Process each item in the list.""" + results = [] + + # BUG: Off-by-one - will cause IndexError on last iteration + for i in range(len(items) + 1): + results.append(items[i].upper()) + + return results + + +# ============================================================================= +# BUG #4: Missing Null Check (LOGIC - MEDIUM) +# Accessing attributes without checking if object is None +# ============================================================================= +def get_user_email(user: Optional[Dict]) -> str: + """Get the user's email address.""" + # BUG: No null check - will raise KeyError/TypeError if user is None + return user["email"].lower() + + +# ============================================================================= +# BUG #5: Resource Leak - File Not Closed (RESOURCE - MEDIUM) +# File handle is never closed, leading to resource leak +# ============================================================================= +def read_config_file(path: str) -> str: + """Read configuration from a file.""" + # BUG: File is never closed - resource leak + f = open(path, "r") + content = f.read() + # Missing: f.close() or use 'with' statement + return content + + +# ============================================================================= +# BUG #6: Wrong Comparison Operator (LOGIC - HIGH) +# Using = instead of == in comparison (though Python will error, +# using 'is' instead of '==' for value comparison is the 
realistic bug) +# ============================================================================= +def check_status(status: str) -> bool: + """Check if the status indicates success.""" + # BUG: Using 'is' instead of '==' for string comparison + # This may work for small strings due to interning but is incorrect + if status is "success": + return True + elif status is "pending": + return False + return False + + +# ============================================================================= +# BUG #7: Wrong Formula (LOGIC - LOW) +# The average divides by len(numbers) - 1 instead of len(numbers) +# ============================================================================= +def calculate_average(numbers: List[int]) -> float: + """Calculate the average of a list of numbers.""" + if not numbers: + return 0 + + # BUG: Using len(numbers) - 1 instead of len(numbers) + total = sum(numbers) + return total / (len(numbers) - 1) # Wrong divisor! + + +# ============================================================================= +# BUG #8: Race Condition (CONCURRENCY - HIGH) +# Shared counter without proper synchronization +# ============================================================================= +counter = 0 + +def increment_counter(): + """Increment the global counter (not thread-safe).""" + global counter + # BUG: Race condition - read-modify-write is not atomic + temp = counter + temp += 1 + counter = temp + + +def run_concurrent_increments(): + """Run multiple increments concurrently.""" + threads = [] + for _ in range(100): + t = threading.Thread(target=increment_counter) + threads.append(t) + t.start() + + for t in threads: + t.join() + + return counter + + +# ============================================================================= +# BUG #9: XSS Vulnerability (SECURITY - HIGH) +# User input directly embedded in HTML without escaping +# ============================================================================= +def 
render_user_profile(username: str, bio: str) -> str: + """Render a user profile as HTML.""" + # BUG: XSS vulnerability - user input not escaped + html = f""" +
+

Welcome, {username}!

+

{bio}

+
+ """ + return html + + +# ============================================================================= +# BUG #10: Weak Password Hashing (SECURITY - CRITICAL) +# Using MD5 for password hashing is insecure +# ============================================================================= +def hash_password(password: str) -> str: + """Hash a password for storage.""" + # BUG: MD5 is cryptographically broken, should use bcrypt/argon2 + return hashlib.md5(password.encode()).hexdigest() + + +def verify_password(password: str, hash_value: str) -> bool: + """Verify a password against its hash.""" + return hash_password(password) == hash_value + + +# ============================================================================= +# BUG #11: Missing Return Statement (LOGIC - MEDIUM) +# Function doesn't return anything in success case +# ============================================================================= +def validate_email(email: str) -> bool: + """Validate an email address format.""" + if "@" not in email: + return False + + if "." 
not in email.split("@")[1]: + return False + + # BUG: Missing return True at the end + # Function implicitly returns None when email is valid + + +# ============================================================================= +# BUG #12: Mutable Default Argument (LOGIC - MEDIUM) +# Using mutable default argument causes unexpected behavior +# ============================================================================= +def add_item_to_list(item: str, item_list: List[str] = []) -> List[str]: + """Add an item to a list and return it.""" + # BUG: Mutable default argument - list persists across calls + item_list.append(item) + return item_list + + +# ============================================================================= +# BUG #13: Unhandled Exception (ERROR HANDLING - MEDIUM) +# No exception handling for JSON parsing +# ============================================================================= +import json + +def parse_user_input(data: str) -> dict: + """Parse JSON user input.""" + # BUG: No try-except - will crash on invalid JSON + parsed = json.loads(data) + return parsed + + +# ============================================================================= +# BUG #14: Path Traversal Vulnerability (SECURITY - HIGH) +# User input used directly in file path without validation +# ============================================================================= +def read_user_file(filename: str) -> str: + """Read a file from the user uploads directory.""" + # BUG: Path traversal - user can access any file with ../ + base_path = "/var/uploads/" + file_path = base_path + filename # No sanitization! 
+ + with open(file_path, "r") as f: + return f.read() + + +# ============================================================================= +# BUG #15: Infinite Loop Risk (LOGIC - HIGH) +# Loop condition may never become false +# ============================================================================= +def find_target(numbers: List[int], target: int) -> int: + """Find the index of target in a sorted list using binary search.""" + left = 0 + right = len(numbers) - 1 + + # BUG: Missing update to left/right in some cases causes infinite loop + while left <= right: + mid = (left + right) // 2 + + if numbers[mid] == target: + return mid + elif numbers[mid] < target: + left = mid # BUG: Should be mid + 1 + else: + right = mid # BUG: Should be mid - 1 + + return -1 + + +# ============================================================================= +# GROUND TRUTH - Bug Summary for Evaluation +# ============================================================================= +SEEDED_BUGS = { + "security": [ + {"id": 1, "type": "SQL Injection", "severity": "HIGH", "line": 31}, + {"id": 2, "type": "Hardcoded Secret", "severity": "CRITICAL", "line": 41}, + {"id": 9, "type": "XSS", "severity": "HIGH", "line": 118}, + {"id": 10, "type": "Weak Crypto (MD5)", "severity": "CRITICAL", "line": 131}, + {"id": 14, "type": "Path Traversal", "severity": "HIGH", "line": 168}, + ], + "logic": [ + {"id": 3, "type": "Off-by-One", "severity": "MEDIUM", "line": 56}, + {"id": 4, "type": "Missing Null Check", "severity": "MEDIUM", "line": 66}, + {"id": 6, "type": "Wrong Operator (is vs ==)", "severity": "HIGH", "line": 86}, + {"id": 7, "type": "Wrong Formula", "severity": "LOW", "line": 99}, + {"id": 11, "type": "Missing Return", "severity": "MEDIUM", "line": 141}, + {"id": 12, "type": "Mutable Default Arg", "severity": "MEDIUM", "line": 152}, + {"id": 15, "type": "Infinite Loop", "severity": "HIGH", "line": 181}, + ], + "resource": [ + {"id": 5, "type": "Resource Leak", "severity": "MEDIUM", 
"line": 76}, + ], + "concurrency": [ + {"id": 8, "type": "Race Condition", "severity": "HIGH", "line": 107}, + ], + "error_handling": [ + {"id": 13, "type": "Unhandled Exception", "severity": "MEDIUM", "line": 160}, + ], +} + +TOTAL_BUGS = 15