From d25946002ba0acaabb91683b7e9feb5e0953c336 Mon Sep 17 00:00:00 2001
From: Himanshu Jhawar <himanshujhawar@Mac.lan>
Date: Tue, 9 Dec 2025 09:05:49 -0500
Subject: [PATCH] Add benchmark test files with 27 seeded bugs for InspectAI
 evaluation

---
 tests/benchmark/SCORING_GUIDE.md      | 117 +++++++++++
 tests/benchmark/seeded_bugs_api.py    | 205 ++++++++++++++++++
 tests/benchmark/seeded_bugs_python.py | 291 ++++++++++++++++++++++++++
 3 files changed, 613 insertions(+)
 create mode 100644 tests/benchmark/SCORING_GUIDE.md
 create mode 100644 tests/benchmark/seeded_bugs_api.py
 create mode 100644 tests/benchmark/seeded_bugs_python.py

diff --git a/tests/benchmark/SCORING_GUIDE.md b/tests/benchmark/SCORING_GUIDE.md
new file mode 100644
index 0000000..5cfdc04
--- /dev/null
+++ b/tests/benchmark/SCORING_GUIDE.md
@@ -0,0 +1,117 @@
+# BENCHMARK SCORING GUIDE
+
+This document tracks the seeded bugs and InspectAI's detection performance.
+Run the InspectAI commands on the PR and fill in the results below.
+
+
+## Seeded Bugs Summary
+
+| File | Total Bugs | Security | Logic | Resource | Concurrency | Error Handling |
+|------|------------|----------|-------|----------|-------------|----------------|
+| seeded_bugs_python.py | 15 | 5 | 7 | 1 | 1 | 1 |
+| seeded_bugs_api.py | 12 | 7 | 4 | 0 | 1 | 0 |
+| **TOTAL** | **27** | **12** | **11** | **1** | **2** | **1** |
+
+---
+
+## Detailed Bug List (Ground Truth)
+
+### File 1: seeded_bugs_python.py
+
+| # | Bug Type | Severity | Category | Line | Description |
+|---|----------|----------|----------|------|-------------|
+| 1 | SQL Injection | HIGH | Security | 33 | User input directly in SQL query |
+| 2 | Hardcoded Secret | CRITICAL | Security | 45-46 | API key and password in source |
+| 3 | Off-by-One | MEDIUM | Logic | 64 | range(len+1) causes IndexError |
+| 4 | Missing Null Check | MEDIUM | Logic | 77 | No None check before access |
+| 5 | Resource Leak | MEDIUM | Resource | 87 | File never closed |
+| 6 | Wrong Operator | HIGH | Logic | 102 | Using 'is' instead of '==' |
+| 7 | Wrong Formula | LOW | Logic | 120 | Dividing by (n-1) not n |
+| 8 | Race Condition | HIGH | Concurrency | 133-135 | Non-atomic counter increment |
+| 9 | XSS | HIGH | Security | 161-162 | Unescaped user input in HTML |
+| 10 | Weak Crypto | CRITICAL | Security | 175 | MD5 for password hashing |
+| 11 | Missing Return | MEDIUM | Logic | 195 | No return True for valid case |
+| 12 | Mutable Default | MEDIUM | Logic | 203 | Default arg `[]` persists |
+| 13 | Unhandled Exception | MEDIUM | Error | 219 | No try-except for json.loads |
+| 14 | Path Traversal | HIGH | Security | 231 | User can access any file |
+| 15 | Infinite Loop | HIGH | Logic | 253-255 | Binary search doesn't converge |
+
+### File 2: seeded_bugs_api.py
+
+| # | Bug Type | Severity | Category | Line | Description |
+|---|----------|----------|----------|------|-------------|
+| 16 | Command Injection | CRITICAL | Security | 31-33 | shell=True with user input |
+| 17 | Insecure Deserialize | CRITICAL | Security | 46 | pickle.loads on untrusted data |
+| 18 | Weak Session | HIGH | Security | 56 | Predictable session tokens |
+| 19 | Missing AuthZ | HIGH | Security | 67-68 | No permission check for delete |
+| 20 | ReDoS | MEDIUM | Security | 79 | Evil regex pattern |
+| 21 | Float Comparison | MEDIUM | Logic | 96 | Using == for floats |
+| 22 | TOCTOU | HIGH | Concurrency | 105-107 | Check-then-use race condition |
+| 23 | Precision Loss | MEDIUM | Logic | 119-120 | int() truncates cents |
+| 24 | Weak Regex | LOW | Logic | 133 | Email regex too permissive |
+| 25 | Type Confusion | MEDIUM | Logic | 145 | No type validation before divide |
+| 26 | Timing Attack | MEDIUM | Security | 159 | String compare short-circuits |
+| 27 | Info Disclosure | LOW | Security | 172 | Error reveals username exists |
+
+---
+
+## Testing Procedure
+
+1. Open PR from `test-benchmark` branch to `main`
+2. Run these commands and record findings:
+
+### Command 1: `/inspectai_review`
+- [ ] Run command
+- Findings count: ___
+- True Positives: ___
+- False Positives: ___
+
+### Command 2: `/inspectai_bugs`  
+- [ ] Run command
+- Findings count: ___
+- True Positives: ___
+- False Positives: ___
+
+### Command 3: `/inspectai_security`
+- [ ] Run command
+- Findings count: ___
+- True Positives: ___
+- False Positives: ___
+
+---
+
+## Scoring Template
+
+After running commands, fill in this table:
+
+| Command | Bugs Found | True Positives | False Positives | Recall | Precision |
+|---------|------------|----------------|-----------------|--------|-----------|
+| /inspectai_review | | | | | |
+| /inspectai_bugs | | | | | |
+| /inspectai_security | | | | | |
+
+### Formulas:
+- **Recall** = True Positives / Total Seeded Bugs (27)
+- **Precision** = True Positives / (True Positives + False Positives)
+- **F1 Score** = 2 × (Precision × Recall) / (Precision + Recall)
+
+---
+
+## Categories Breakdown (After Testing)
+
+| Category | Total | Found | Recall |
+|----------|-------|-------|--------|
+| Security (CRITICAL) | 4 | | |
+| Security (HIGH) | 5 | | |
+| Security (MEDIUM) | 2 | | |
+| Security (LOW) | 1 | | |
+| Logic Errors | 11 | | |
+| Resource Leaks | 1 | | |
+| Concurrency | 2 | | |
+| Error Handling | 1 | | |
+
+---
+
+## Notes
+<!-- Record any observations during testing -->
+
diff --git a/tests/benchmark/seeded_bugs_api.py b/tests/benchmark/seeded_bugs_api.py
new file mode 100644
index 0000000..81680e9
--- /dev/null
+++ b/tests/benchmark/seeded_bugs_api.py
@@ -0,0 +1,205 @@
+"""
+BENCHMARK TEST FILE #2 - API/Web Application Bugs
+=================================================
+This file simulates a web API with common security and logic bugs.
+Contains 12 additional seeded bugs.
+
+DO NOT FIX THESE BUGS - They are intentional for benchmarking purposes.
+=================================================
+"""
+
+from typing import Optional, Dict, List, Any
+import re
+import pickle
+import subprocess
+
+
+class UserService:
+    """Service for managing users."""
+    
+    def __init__(self):
+        self.users: Dict[str, Dict] = {}
+        self.session_tokens: Dict[str, str] = {}
+    
+    # =========================================================================
+    # BUG #16: Command Injection (SECURITY - CRITICAL)
+    # User input passed directly to shell command
+    # =========================================================================
+    def ping_server(self, hostname: str) -> str:
+        """Ping a server to check if it's online."""
+        # BUG: Command injection - hostname not sanitized
+        result = subprocess.run(
+            f"ping -c 1 {hostname}",
+            shell=True,  # BUG: shell=True with user input
+            capture_output=True,
+            text=True
+        )
+        return result.stdout
+    
+    # =========================================================================
+    # BUG #17: Insecure Deserialization (SECURITY - CRITICAL)
+    # Using pickle to deserialize untrusted data
+    # =========================================================================
+    def load_user_preferences(self, data: bytes) -> dict:
+        """Load user preferences from serialized data."""
+        # BUG: Pickle deserialization of untrusted data - RCE vulnerability
+        return pickle.loads(data)
+    
+    # =========================================================================
+    # BUG #18: Broken Authentication (SECURITY - HIGH)
+    # Weak session token generation
+    # =========================================================================
+    def create_session(self, user_id: str) -> str:
+        """Create a session token for a user."""
+        # BUG: Predictable session token based on user_id
+        import time
+        token = f"{user_id}_{int(time.time())}"  # Easily guessable!
+        self.session_tokens[token] = user_id
+        return token
+    
+    # =========================================================================
+    # BUG #19: Missing Authorization Check (SECURITY - HIGH)
+    # Any user can delete any other user
+    # =========================================================================
+    def delete_user(self, target_user_id: str, requesting_user_id: str) -> bool:
+        """Delete a user account."""
+        # BUG: No authorization check - any user can delete any user
+        if target_user_id in self.users:
+            del self.users[target_user_id]
+            return True
+        return False
+    
+    # =========================================================================
+    # BUG #20: ReDoS Vulnerability (SECURITY - MEDIUM)
+    # Regex pattern vulnerable to catastrophic backtracking
+    # =========================================================================
+    def validate_input(self, text: str) -> bool:
+        """Validate user input format."""
+        # BUG: ReDoS - evil regex with nested quantifiers
+        pattern = r"^(a+)+$"
+        return bool(re.match(pattern, text))
+
+
+class PaymentProcessor:
+    """Service for processing payments."""
+    
+    def __init__(self):
+        self.transactions: List[Dict] = []
+    
+    # =========================================================================
+    # BUG #21: Floating Point Comparison (LOGIC - MEDIUM)
+    # Comparing floats for equality
+    # =========================================================================
+    def verify_payment(self, expected: float, received: float) -> bool:
+        """Verify that the received payment matches expected amount."""
+        # BUG: Floating point comparison - 0.1 + 0.2 != 0.3
+        return expected == received
+    
+    # =========================================================================
+    # BUG #22: TOCTOU Race Condition (CONCURRENCY - HIGH)
+    # Time-of-check to time-of-use vulnerability
+    # =========================================================================
+    def process_withdrawal(self, account_id: str, amount: float, balance: Dict[str, float]) -> bool:
+        """Process a withdrawal if sufficient balance exists."""
+        # BUG: TOCTOU - balance can change between check and update
+        if balance.get(account_id, 0) >= amount:
+            # Gap here where another thread could modify balance
+            balance[account_id] -= amount
+            return True
+        return False
+    
+    # =========================================================================
+    # BUG #23: Precision Loss (LOGIC - MEDIUM)
+    # Float division and int() truncation drop fractional cents
+    # =========================================================================
+    def calculate_total_with_fee(self, amount: int, fee_percent: int) -> int:
+        """Calculate total amount including fee."""
+        # BUG: Potential overflow in multiplication before division
+        # In Python this won't overflow but the logic is still wrong for cents
+        fee = amount * fee_percent / 100
+        return int(amount + fee)  # Precision loss!
+
+
+class DataValidator:
+    """Utility class for data validation."""
+    
+    # =========================================================================
+    # BUG #24: Incorrect Regex for Email (LOGIC - LOW)
+    # Overly permissive email regex
+    # =========================================================================
+    def is_valid_email(self, email: str) -> bool:
+        """Check if email is valid."""
+        # BUG: Overly simple regex - accepts invalid emails like "a@b"
+        pattern = r".+@.+"
+        return bool(re.match(pattern, email))
+    
+    # =========================================================================
+    # BUG #25: Type Confusion (LOGIC - MEDIUM)
+    # Not validating input type before operations
+    # =========================================================================
+    def safe_divide(self, a: Any, b: Any) -> float:
+        """Safely divide two numbers."""
+        # BUG: No type validation - non-numeric arguments are not rejected
+        if b == 0:
+            return 0.0
+        return a / b  # Will raise TypeError if a or b is not a number
+
+
+# =============================================================================
+# Additional standalone functions with bugs
+# =============================================================================
+
+# =========================================================================
+# BUG #26: Timing Attack Vulnerability (SECURITY - MEDIUM)
+# String comparison short-circuits on mismatch
+# =========================================================================
+def verify_api_key(provided_key: str, stored_key: str) -> bool:
+    """Verify an API key."""
+    # BUG: Timing attack - comparison short-circuits
+    return provided_key == stored_key  # Should use hmac.compare_digest
+
+
+# =========================================================================
+# BUG #27: Improper Error Message (SECURITY - LOW)
+# Leaking sensitive information in error message
+# =========================================================================
+def authenticate_user(username: str, password: str, users_db: Dict) -> Dict:
+    """Authenticate a user and return their profile."""
+    user = users_db.get(username)
+    
+    if not user:
+        # BUG: Information disclosure - reveals if username exists
+        raise ValueError(f"User '{username}' does not exist")
+    
+    if user["password"] != password:
+        # BUG: Should not differentiate between bad user and bad password
+        raise ValueError("Incorrect password")
+    
+    return user
+
+
+# =============================================================================
+# GROUND TRUTH - Bug Summary for Evaluation
+# =============================================================================
+SEEDED_BUGS_FILE2 = {
+    "security": [
+        {"id": 16, "type": "Command Injection", "severity": "CRITICAL", "line": 31},
+        {"id": 17, "type": "Insecure Deserialization", "severity": "CRITICAL", "line": 46},
+        {"id": 18, "type": "Weak Session Token", "severity": "HIGH", "line": 56},
+        {"id": 19, "type": "Missing Authorization", "severity": "HIGH", "line": 67},
+        {"id": 20, "type": "ReDoS", "severity": "MEDIUM", "line": 79},
+        {"id": 26, "type": "Timing Attack", "severity": "MEDIUM", "line": 159},
+        {"id": 27, "type": "Information Disclosure", "severity": "LOW", "line": 172},
+    ],
+    "logic": [
+        {"id": 21, "type": "Float Comparison", "severity": "MEDIUM", "line": 96},
+        {"id": 23, "type": "Precision Loss", "severity": "MEDIUM", "line": 119},
+        {"id": 24, "type": "Weak Regex", "severity": "LOW", "line": 133},
+        {"id": 25, "type": "Type Confusion", "severity": "MEDIUM", "line": 145},
+    ],
+    "concurrency": [
+        {"id": 22, "type": "TOCTOU Race", "severity": "HIGH", "line": 105},
+    ],
+}
+
+TOTAL_BUGS_FILE2 = 12
diff --git a/tests/benchmark/seeded_bugs_python.py b/tests/benchmark/seeded_bugs_python.py
new file mode 100644
index 0000000..640bac2
--- /dev/null
+++ b/tests/benchmark/seeded_bugs_python.py
@@ -0,0 +1,291 @@
+"""
+BENCHMARK TEST FILE - Contains intentionally seeded bugs for testing InspectAI
+================================================================================
+This file contains 15 seeded bugs across different categories:
+- Security vulnerabilities (SQL injection, hardcoded secrets, XSS)
+- Logic errors (off-by-one, wrong operator, missing return)
+- Null/None handling issues
+- Resource leaks
+- Race conditions
+- Type errors
+
+DO NOT FIX THESE BUGS - They are intentional for benchmarking purposes.
+================================================================================
+"""
+
+import os
+import sqlite3
+import hashlib
+import threading
+from typing import List, Optional, Dict, Any
+
+
+# =============================================================================
+# BUG #1: SQL Injection Vulnerability (SECURITY - HIGH)
+# The user_id is directly interpolated into the SQL query
+# =============================================================================
+def get_user_by_id(user_id: str) -> dict:
+    """Fetch a user from the database by their ID."""
+    conn = sqlite3.connect("users.db")
+    cursor = conn.cursor()
+    
+    # BUG: SQL Injection - user_id is not parameterized
+    query = f"SELECT * FROM users WHERE id = '{user_id}'"
+    cursor.execute(query)
+    
+    result = cursor.fetchone()
+    conn.close()
+    return {"id": result[0], "name": result[1]} if result else None
+
+
+# =============================================================================
+# BUG #2: Hardcoded Secret/API Key (SECURITY - CRITICAL)
+# API keys should never be hardcoded in source code
+# =============================================================================
+API_KEY = "sk-live-abc123def456ghi789jkl012mno345pqr678"
+DATABASE_PASSWORD = "super_secret_password_123!"
+
+def make_api_request(endpoint: str) -> dict:
+    """Make an authenticated API request."""
+    headers = {"Authorization": f"Bearer {API_KEY}"}
+    # Simulated request
+    return {"status": "ok", "endpoint": endpoint}
+
+
+# =============================================================================
+# BUG #3: Off-by-One Error (LOGIC - MEDIUM)
+# Loop should use range(len(items)) or enumerate, not len(items) + 1
+# =============================================================================
+def process_items(items: List[str]) -> List[str]:
+    """Process each item in the list."""
+    results = []
+    
+    # BUG: Off-by-one - will cause IndexError on last iteration
+    for i in range(len(items) + 1):
+        results.append(items[i].upper())
+    
+    return results
+
+
+# =============================================================================
+# BUG #4: Missing Null Check (LOGIC - MEDIUM)
+# Accessing attributes without checking if object is None
+# =============================================================================
+def get_user_email(user: Optional[Dict]) -> str:
+    """Get the user's email address."""
+    # BUG: No null check - will raise TypeError if user is None
+    return user["email"].lower()
+
+
+# =============================================================================
+# BUG #5: Resource Leak - File Not Closed (RESOURCE - MEDIUM)
+# File handle is never closed, leading to resource leak
+# =============================================================================
+def read_config_file(path: str) -> str:
+    """Read configuration from a file."""
+    # BUG: File is never closed - resource leak
+    f = open(path, "r")
+    content = f.read()
+    # Missing: f.close() or use 'with' statement
+    return content
+
+
+# =============================================================================
+# BUG #6: Wrong Comparison Operator (LOGIC - HIGH)
+# Using 'is' (identity) instead of '==' (equality) to compare strings;
+# the outcome depends on string interning, not on the string's value
+# =============================================================================
+def check_status(status: str) -> bool:
+    """Check if the status indicates success."""
+    # BUG: Using 'is' instead of '==' for string comparison
+    # This may work for small strings due to interning but is incorrect
+    if status is "success":
+        return True
+    elif status is "pending":
+        return False
+    return False
+
+
+# =============================================================================
+# BUG #7: Wrong Formula (LOGIC - LOW)
+# The average divides by len(numbers) - 1 instead of len(numbers)
+# =============================================================================
+def calculate_average(numbers: List[int]) -> float:
+    """Calculate the average of a list of numbers."""
+    if not numbers:
+        return 0
+    
+    # BUG: Using len(numbers) - 1 instead of len(numbers)
+    total = sum(numbers)
+    return total / (len(numbers) - 1)  # Wrong divisor!
+
+
+# =============================================================================
+# BUG #8: Race Condition (CONCURRENCY - HIGH)
+# Shared counter without proper synchronization
+# =============================================================================
+counter = 0
+
+def increment_counter():
+    """Increment the global counter (not thread-safe)."""
+    global counter
+    # BUG: Race condition - read-modify-write is not atomic
+    temp = counter
+    temp += 1
+    counter = temp
+
+
+def run_concurrent_increments():
+    """Run multiple increments concurrently."""
+    threads = []
+    for _ in range(100):
+        t = threading.Thread(target=increment_counter)
+        threads.append(t)
+        t.start()
+    
+    for t in threads:
+        t.join()
+    
+    return counter
+
+
+# =============================================================================
+# BUG #9: XSS Vulnerability (SECURITY - HIGH)
+# User input directly embedded in HTML without escaping
+# =============================================================================
+def render_user_profile(username: str, bio: str) -> str:
+    """Render a user profile as HTML."""
+    # BUG: XSS vulnerability - user input not escaped
+    html = f"""
+    <div class="profile">
+        <h1>Welcome, {username}!</h1>
+        <p class="bio">{bio}</p>
+    </div>
+    """
+    return html
+
+
+# =============================================================================
+# BUG #10: Weak Password Hashing (SECURITY - CRITICAL)
+# Using MD5 for password hashing is insecure
+# =============================================================================
+def hash_password(password: str) -> str:
+    """Hash a password for storage."""
+    # BUG: MD5 is cryptographically broken, should use bcrypt/argon2
+    return hashlib.md5(password.encode()).hexdigest()
+
+
+def verify_password(password: str, hash_value: str) -> bool:
+    """Verify a password against its hash."""
+    return hash_password(password) == hash_value
+
+
+# =============================================================================
+# BUG #11: Missing Return Statement (LOGIC - MEDIUM)
+# Function doesn't return anything in success case
+# =============================================================================
+def validate_email(email: str) -> bool:
+    """Validate an email address format."""
+    if "@" not in email:
+        return False
+    
+    if "." not in email.split("@")[1]:
+        return False
+    
+    # BUG: Missing return True at the end
+    # Function implicitly returns None when email is valid
+
+
+# =============================================================================
+# BUG #12: Mutable Default Argument (LOGIC - MEDIUM)
+# Using mutable default argument causes unexpected behavior
+# =============================================================================
+def add_item_to_list(item: str, item_list: List[str] = []) -> List[str]:
+    """Add an item to a list and return it."""
+    # BUG: Mutable default argument - list persists across calls
+    item_list.append(item)
+    return item_list
+
+
+# =============================================================================
+# BUG #13: Unhandled Exception (ERROR HANDLING - MEDIUM)
+# No exception handling for JSON parsing
+# =============================================================================
+import json
+
+def parse_user_input(data: str) -> dict:
+    """Parse JSON user input."""
+    # BUG: No try-except - will crash on invalid JSON
+    parsed = json.loads(data)
+    return parsed
+
+
+# =============================================================================
+# BUG #14: Path Traversal Vulnerability (SECURITY - HIGH)
+# User input used directly in file path without validation
+# =============================================================================
+def read_user_file(filename: str) -> str:
+    """Read a file from the user uploads directory."""
+    # BUG: Path traversal - user can access any file with ../
+    base_path = "/var/uploads/"
+    file_path = base_path + filename  # No sanitization!
+    
+    with open(file_path, "r") as f:
+        return f.read()
+
+
+# =============================================================================
+# BUG #15: Infinite Loop Risk (LOGIC - HIGH)
+# Loop condition may never become false
+# =============================================================================
+def find_target(numbers: List[int], target: int) -> int:
+    """Find the index of target in a sorted list using binary search."""
+    left = 0
+    right = len(numbers) - 1
+    
+    # BUG: Missing update to left/right in some cases causes infinite loop
+    while left <= right:
+        mid = (left + right) // 2
+        
+        if numbers[mid] == target:
+            return mid
+        elif numbers[mid] < target:
+            left = mid  # BUG: Should be mid + 1
+        else:
+            right = mid  # BUG: Should be mid - 1
+    
+    return -1
+
+
+# =============================================================================
+# GROUND TRUTH - Bug Summary for Evaluation
+# =============================================================================
+SEEDED_BUGS = {
+    "security": [
+        {"id": 1, "type": "SQL Injection", "severity": "HIGH", "line": 33},
+        {"id": 2, "type": "Hardcoded Secret", "severity": "CRITICAL", "line": 45},
+        {"id": 9, "type": "XSS", "severity": "HIGH", "line": 161},
+        {"id": 10, "type": "Weak Crypto (MD5)", "severity": "CRITICAL", "line": 175},
+        {"id": 14, "type": "Path Traversal", "severity": "HIGH", "line": 231},
+    ],
+    "logic": [
+        {"id": 3, "type": "Off-by-One", "severity": "MEDIUM", "line": 64},
+        {"id": 4, "type": "Missing Null Check", "severity": "MEDIUM", "line": 77},
+        {"id": 6, "type": "Wrong Operator (is vs ==)", "severity": "HIGH", "line": 102},
+        {"id": 7, "type": "Wrong Formula", "severity": "LOW", "line": 120},
+        {"id": 11, "type": "Missing Return", "severity": "MEDIUM", "line": 195},
+        {"id": 12, "type": "Mutable Default Arg", "severity": "MEDIUM", "line": 203},
+        {"id": 15, "type": "Infinite Loop", "severity": "HIGH", "line": 253},
+    ],
+    "resource": [
+        {"id": 5, "type": "Resource Leak", "severity": "MEDIUM", "line": 87},
+    ],
+    "concurrency": [
+        {"id": 8, "type": "Race Condition", "severity": "HIGH", "line": 133},
+    ],
+    "error_handling": [
+        {"id": 13, "type": "Unhandled Exception", "severity": "MEDIUM", "line": 219},
+    ],
+}
+
+TOTAL_BUGS = 15