diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml index d91282bf2..a13f6fa48 100644 --- a/.github/workflows/claude-evaluation.yml +++ b/.github/workflows/claude-evaluation.yml @@ -24,6 +24,7 @@ on: - "bug-fix" - "test-generation" - "code-review" + - "hello-world" test-run: description: "Indicate this is a test run (with few entries)" required: false diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml index dd373536d..630a07217 100644 --- a/.github/workflows/copilot-evaluation.yml +++ b/.github/workflows/copilot-evaluation.yml @@ -31,6 +31,7 @@ on: - "bug-fix" - "test-generation" - "code-review" + - "hello-world" test-run: description: "Indicate this is a test run (with few entries)" required: false diff --git a/CATEGORIES.md b/CATEGORIES.md index 3269fa2e8..5a14181e6 100644 --- a/CATEGORIES.md +++ b/CATEGORIES.md @@ -2,7 +2,9 @@ BC-Bench is **category-based**. A category is a distinct evaluation scenario: `bug-fix` asks an agent to patch buggy code, while `test-generation` asks it to write reproduction tests. -The `bug-fix` and `test-generation` categories happen to share one dataset today. A new category should have its own: dataset schema, entry type, result type, pipeline, etc. +Today the benchmark ships several categories: `bug-fix`, `test-generation`, `code-review`, `nl2al`, and `hello-world`. The `bug-fix` and `test-generation` categories happen to share one dataset; every other category has its own. A new category should generally have its own: dataset schema, entry type, result type, pipeline, etc. + +`hello-world` is an intentionally tiny, imaginary, self-contained category (no BC container, no symbols) kept as a worked example of every step below. Use it together with the existing categories when adding your own. This doc is a map; the source files and their comments are the source of truth. 
To experiment with agent setup on existing categories, see [EXPERIMENT.md](EXPERIMENT.md). @@ -13,23 +15,30 @@ Start with `EvaluationCategory` in [src/bcbench/types.py](src/bcbench/types.py). - `dataset_path` — the dataset file for raw tasks. - `entry_class` — the typed Python model for one dataset row (aka one task). - `result_class` — the recorded outcome for one evaluated task. -- `summary_class` — the aggregate view used by result summaries and leaderboards. +- `summary_class` — the aggregate view for a single run, used by result summaries and leaderboards. +- `aggregate_class` — combines multiple runs of the same combination on the leaderboard. - `pipeline` — the category-specific setup, agent run, and evaluation behavior. +- `evaluators` / `core_score` — the bc-eval evaluators to run and the headline score. +- `requires_container` / `runner` — whether evaluation builds AL code (needs a BC container) and which GitHub Actions runner to use. - Prompt template — the category-specific prompt in [src/bcbench/agent/shared/config.yaml](src/bcbench/agent/shared/config.yaml), loaded by [src/bcbench/agent/shared/prompt.py](src/bcbench/agent/shared/prompt.py). +Every `match self` in `EvaluationCategory` is exhaustive and raises on an unhandled value, so adding the enum value forces you to fill in each property above. Categories that score externally (e.g. via an lm_checklist judge) can reuse the `JudgeBased*` result/summary/aggregate classes instead of writing new ones — `nl2al` and `hello-world` do this. + Keep dataset entry classes and result classes focused on typed data. Put category-specific behavior in the pipeline. ## Checklist -Use the existing `bug-fix` and `test-generation` implementations as examples. +`hello-world` is the smallest end-to-end example; `bug-fix` and `test-generation` show a full execution-based category. The `hello-world` commit touches every file below. -1. Add the enum value and mappings in [src/bcbench/types.py](src/bcbench/types.py). -2. 
Add the category dataset JSONL and entry class in [src/bcbench/dataset/dataset_entry.py](src/bcbench/dataset/dataset_entry.py). -3. Add a result class under [src/bcbench/results/](src/bcbench/results/) and map it from `EvaluationCategory.result_class`. -4. Add a pipeline under [src/bcbench/evaluate/](src/bcbench/evaluate/). -5. Add the prompt template to [src/bcbench/agent/shared/config.yaml](src/bcbench/agent/shared/config.yaml). -6. Add the category to workflow choice lists in [.github/workflows/](.github/workflows/), especially evaluation workflows and CI category selection. -7. Add docs, leaderboard data, notebooks, and tests for the category where relevant. +1. Add the enum value and all `match` arms in [src/bcbench/types.py](src/bcbench/types.py). +2. Add the entry class in [src/bcbench/dataset/dataset_entry.py](src/bcbench/dataset/dataset_entry.py), export it from [src/bcbench/dataset/__init__.py](src/bcbench/dataset/__init__.py), and add the dataset JSONL under [dataset/](dataset/). +3. Register the category and its dataset file in the `Get-BCBenchDatasetPath` `ValidateSet`/`switch` in [scripts/BCBenchUtils.psm1](scripts/BCBenchUtils.psm1) so the PowerShell setup scripts accept it. +4. Add (or reuse) a result class under [src/bcbench/results/](src/bcbench/results/) and map it from `EvaluationCategory.result_class` (plus `summary_class` and `aggregate_class`). +5. Add a pipeline under [src/bcbench/evaluate/](src/bcbench/evaluate/) and export it from [src/bcbench/evaluate/__init__.py](src/bcbench/evaluate/__init__.py). +6. Add the prompt template to [src/bcbench/agent/shared/config.yaml](src/bcbench/agent/shared/config.yaml). +7. Handle the category in `MockEvaluationPipeline.evaluate` in [src/bcbench/commands/evaluate.py](src/bcbench/commands/evaluate.py) so the CI mock-evaluation job passes. +8. Add the category to workflow choice lists in [.github/workflows/](.github/workflows/), especially evaluation workflows and CI category selection. +9. 
Add test fixtures/handling (e.g. in [tests/conftest.py](tests/conftest.py), [tests/test_type_exhaustiveness.py](tests/test_type_exhaustiveness.py), [tests/test_evaluate_pipeline.py](tests/test_evaluate_pipeline.py)) and docs, leaderboard data, and notebooks where relevant. ## Validation @@ -40,4 +49,10 @@ uv run pytest tests/test_type_exhaustiveness.py uv run bcbench run copilot <instance-id> --category <category> --repo-path /path/to/repo ``` +For example, with the `hello-world` sample: + +```powershell +uv run bcbench run copilot helloworld__greeting-english-1 --category hello-world --repo-path /tmp/hello-world-repo +``` + Then trigger a CI test run before running the full dataset. diff --git a/dataset/hello_world.jsonl b/dataset/hello_world.jsonl new file mode 100644 index 000000000..d3863602b --- /dev/null +++ b/dataset/hello_world.jsonl @@ -0,0 +1,2 @@ +{"metadata": {"area": "demo"}, "repo": "microsoft/BCApps", "instance_id": "helloworld__greeting-english-1", "base_commit": "70fd0246a0a4dbc72cb183ca719106722c03be4d", "created_at": "2026-06-25", "environment_setup_version": "28.0", "project_paths": ["HelloWorldGreeting"], "language": "English", "patch": "TODO: gold AL code", "expected": [{"text": "The output defines an AL codeunit named Greeting.", "level": "critical"}, {"text": "The codeunit exposes a procedure that returns a greeting string.", "level": "critical"}, {"text": "The returned greeting is written in English.", "level": "expected"}]} +{"metadata": {"area": "demo"}, "repo": "microsoft/BCApps", "instance_id": "helloworld__greeting-french-1", "base_commit": "70fd0246a0a4dbc72cb183ca719106722c03be4d", "created_at": "2026-06-25", "environment_setup_version": "28.0", "project_paths": ["HelloWorldGreeting"], "language": "French", "patch": "TODO: gold AL code", "expected": [{"text": "The output defines an AL codeunit named Greeting.", "level": "critical"}, {"text": "The codeunit exposes a procedure that returns a greeting string.", "level": "critical"}, {"text": "The returned 
greeting is written in French (for example 'Bonjour').", "level": "expected"}]} diff --git a/scripts/BCBenchUtils.psm1 b/scripts/BCBenchUtils.psm1 index a8a02cf9c..28cf4d752 100644 --- a/scripts/BCBenchUtils.psm1 +++ b/scripts/BCBenchUtils.psm1 @@ -490,7 +490,7 @@ function Get-BCBenchDatasetPath { param( [Parameter(Mandatory = $true)] # Category validation lives only here: every caller resolves the dataset path through this function, so there's no need to duplicate ValidateSet on each caller. - [ValidateSet("bug-fix", "test-generation", "code-review", "nl2al")] + [ValidateSet("bug-fix", "test-generation", "code-review", "nl2al", "hello-world")] [string] $Category ) @@ -499,6 +499,7 @@ function Get-BCBenchDatasetPath { "test-generation" { $DatasetName = "bcbench.jsonl" } "code-review" { $DatasetName = "codereview.jsonl" } "nl2al" { $DatasetName = "nl2al.jsonl" } + "hello-world" { $DatasetName = "hello_world.jsonl" } } [string] $projectRoot = Split-Path $PSScriptRoot -Parent diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml index b5dd27f64..8dd34450c 100644 --- a/src/bcbench/agent/shared/config.yaml +++ b/src/bcbench/agent/shared/config.yaml @@ -66,6 +66,15 @@ prompt: If there are no findings, write an empty array. Write only valid JSON to review.json, with no surrounding markdown or commentary. + hello-world-template: | + You are working with an AL project at {{repo_path}}. + + Task: {{task}} + + Important constraints: + - Create a single new .al file for the codeunit + - Do NOT commit any changes to the repository + # controls: # 1. 
whether to copy custom instructions from `src/bcbench/agent/shared/instructions//` # - Copilot: copies to repo/.github/ and renames AGENTS.md to copilot-instructions.md diff --git a/src/bcbench/commands/evaluate.py b/src/bcbench/commands/evaluate.py index 43a43da6f..0cd46bead 100644 --- a/src/bcbench/commands/evaluate.py +++ b/src/bcbench/commands/evaluate.py @@ -313,6 +313,8 @@ def evaluate(self, context: EvaluationContext[BaseDatasetEntry]) -> None: scenarios = ["invalid", "valid"] case EvaluationCategory.NL2AL: scenarios = ["raw", "empty"] + case EvaluationCategory.HELLO_WORLD: + scenarios = ["raw", "empty"] case _: raise ValueError(f"Unsupported category for mock evaluation: {context.category}") diff --git a/src/bcbench/dataset/__init__.py b/src/bcbench/dataset/__init__.py index d975e152f..ae6ed475b 100644 --- a/src/bcbench/dataset/__init__.py +++ b/src/bcbench/dataset/__init__.py @@ -1,12 +1,13 @@ """Dataset module for querying, validating and analyze dataset entries.""" from bcbench.dataset.codereview import CodeReviewEntry, ReviewComment, Severity -from bcbench.dataset.dataset_entry import BaseDatasetEntry, BugFixEntry, NL2ALEntry, TestEntry, TestGenEntry +from bcbench.dataset.dataset_entry import BaseDatasetEntry, BugFixEntry, HelloWorldEntry, NL2ALEntry, TestEntry, TestGenEntry __all__ = [ "BaseDatasetEntry", "BugFixEntry", "CodeReviewEntry", + "HelloWorldEntry", "NL2ALEntry", "ReviewComment", "Severity", diff --git a/src/bcbench/dataset/dataset_entry.py b/src/bcbench/dataset/dataset_entry.py index 71fa94516..ec49b27f5 100644 --- a/src/bcbench/dataset/dataset_entry.py +++ b/src/bcbench/dataset/dataset_entry.py @@ -14,7 +14,7 @@ _config = get_config() -__all__ = ["BaseDatasetEntry", "BugFixEntry", "NL2ALEntry", "TestEntry", "TestGenEntry"] +__all__ = ["BaseDatasetEntry", "BugFixEntry", "HelloWorldEntry", "NL2ALEntry", "TestEntry", "TestGenEntry"] class TestEntry(BaseModel): @@ -168,3 +168,22 @@ def get_task(self) -> str: def get_expected_output(self) -> 
Checklist: return {"assertions": self.expected} + + +class HelloWorldEntry(BaseDatasetEntry): + """Dataset entry for the imaginary hello-world demo category. + + A deliberately tiny, self-contained category used to demonstrate how to add a new + category to BC-Bench: the agent writes a small AL codeunit that returns a greeting, + and the output is scored by an lm_checklist judge. + """ + + base_commit: str | None = None + language: str + expected: Annotated[list[ChecklistAssertion], Field(min_length=1)] + + def get_task(self) -> str: + return f"Create an AL codeunit named Greeting with a procedure that returns a friendly 'Hello, World!' greeting written in {self.language}." + + def get_expected_output(self) -> Checklist: + return {"assertions": self.expected} diff --git a/src/bcbench/evaluate/__init__.py b/src/bcbench/evaluate/__init__.py index c96892d07..d68ba8896 100644 --- a/src/bcbench/evaluate/__init__.py +++ b/src/bcbench/evaluate/__init__.py @@ -3,7 +3,8 @@ from bcbench.evaluate.base import EvaluationPipeline from bcbench.evaluate.bugfix import BugFixPipeline from bcbench.evaluate.codereview import CodeReviewPipeline +from bcbench.evaluate.hello_world import HelloWorldPipeline from bcbench.evaluate.nl2al import NL2ALPipeline from bcbench.evaluate.testgeneration import TestGenerationPipeline -__all__ = ["BugFixPipeline", "CodeReviewPipeline", "EvaluationPipeline", "NL2ALPipeline", "TestGenerationPipeline"] +__all__ = ["BugFixPipeline", "CodeReviewPipeline", "EvaluationPipeline", "HelloWorldPipeline", "NL2ALPipeline", "TestGenerationPipeline"] diff --git a/src/bcbench/evaluate/hello_world.py b/src/bcbench/evaluate/hello_world.py new file mode 100644 index 000000000..158f4bf52 --- /dev/null +++ b/src/bcbench/evaluate/hello_world.py @@ -0,0 +1,68 @@ +import os +import shutil +import subprocess +from collections.abc import Callable +from pathlib import Path + +from bcbench.dataset import HelloWorldEntry +from bcbench.evaluate.base import EvaluationPipeline +from 
bcbench.exceptions import EmptyDiffError +from bcbench.github_actions import github_log_group +from bcbench.logger import get_logger +from bcbench.operations import stage_and_get_diff +from bcbench.results.base import JudgeBasedEvaluationResult +from bcbench.types import EvaluationContext + +logger = get_logger(__name__) + +__all__ = ["HelloWorldPipeline"] + + +def _force_remove_readonly(func: Callable, path: str, _: object) -> None: + Path(path).chmod(0o666) + func(path) + + +def _reset_repo_path(repo_path: Path) -> None: + if repo_path.exists(): + shutil.rmtree(repo_path, onexc=_force_remove_readonly) + repo_path.mkdir(parents=True, exist_ok=True) + + +def _git_init_and_commit(repo_path: Path) -> None: + env = {**os.environ, "GIT_AUTHOR_NAME": "bcbench", "GIT_AUTHOR_EMAIL": "bcbench@localhost", "GIT_COMMITTER_NAME": "bcbench", "GIT_COMMITTER_EMAIL": "bcbench@localhost"} + subprocess.run(["git", "init"], cwd=repo_path, capture_output=True, check=True) + subprocess.run(["git", "add", "."], cwd=repo_path, capture_output=True, check=True) + subprocess.run(["git", "commit", "-m", "Initial hello-world scaffold"], cwd=repo_path, capture_output=True, check=True, env=env) + + +class HelloWorldPipeline(EvaluationPipeline[HelloWorldEntry]): + """Smallest possible example category: the agent writes a tiny AL greeting codeunit. + + Self-contained (no BC container, no symbols); scoring is judge-based downstream, so + evaluate() only captures the agent's diff as the raw output. 
+ """ + + def setup_workspace(self, entry: HelloWorldEntry, repo_path: Path) -> None: + _reset_repo_path(repo_path) + (repo_path / "README.md").write_text(f"# {entry.instance_id}\n\n{entry.get_task()}\n", encoding="utf-8") + _git_init_and_commit(repo_path) + + def setup(self, context: EvaluationContext[HelloWorldEntry]) -> None: + self.setup_workspace(context.entry, context.repo_path) + + def run_agent(self, context: EvaluationContext[HelloWorldEntry], agent_runner: Callable) -> None: + with github_log_group(f"{context.agent_name} -- Entry: {context.entry.instance_id}"): + context.metrics, context.experiment = agent_runner(context) + + def evaluate(self, context: EvaluationContext[HelloWorldEntry]) -> None: + try: + generated_patch = stage_and_get_diff(context.repo_path) + except EmptyDiffError: + result = JudgeBasedEvaluationResult.create_empty_output(context) + logger.warning(f"Agent produced no changes for {context.entry.instance_id}") + else: + result = JudgeBasedEvaluationResult.create_raw(context, output=generated_patch) + logger.info(f"Saved raw hello-world result for {context.entry.instance_id} (scoring pending)") + + self.save_result(context, result) diff --git a/src/bcbench/types.py b/src/bcbench/types.py index b70821d05..0d1da8c76 100644 --- a/src/bcbench/types.py +++ b/src/bcbench/types.py @@ -133,6 +133,8 @@ class EvaluationCategory(StrEnum): TEST_GENERATION = "test-generation" CODE_REVIEW = "code-review" NL2AL = "nl2al" + # An imaginary, self-contained sample category used to demonstrate how to add a new category. 
+ HELLO_WORLD = "hello-world" # EVENT_REQUEST = "event-request" @property @@ -148,12 +150,14 @@ def dataset_path(self) -> Path: return get_config().paths.dataset_dir / "codereview.jsonl" case EvaluationCategory.NL2AL: return get_config().paths.dataset_dir / "nl2al.jsonl" + case EvaluationCategory.HELLO_WORLD: + return get_config().paths.dataset_dir / "hello_world.jsonl" raise ValueError(f"Unknown evaluation category: {self}") @property def entry_class(self) -> type[BaseDatasetEntry]: - from bcbench.dataset import BugFixEntry, CodeReviewEntry, NL2ALEntry, TestGenEntry + from bcbench.dataset import BugFixEntry, CodeReviewEntry, HelloWorldEntry, NL2ALEntry, TestGenEntry match self: case EvaluationCategory.BUG_FIX: @@ -164,6 +168,8 @@ def entry_class(self) -> type[BaseDatasetEntry]: return CodeReviewEntry case EvaluationCategory.NL2AL: return NL2ALEntry + case EvaluationCategory.HELLO_WORLD: + return HelloWorldEntry raise ValueError(f"Unknown evaluation category: {self}") @@ -183,6 +189,8 @@ def result_class(self) -> type[BaseEvaluationResult]: return CodeReviewResult case EvaluationCategory.NL2AL: return JudgeBasedEvaluationResult + case EvaluationCategory.HELLO_WORLD: + return JudgeBasedEvaluationResult raise ValueError(f"Unknown evaluation category: {self}") @@ -201,6 +209,8 @@ def summary_class(self) -> type[EvaluationResultSummary]: return CodeReviewResultSummary case EvaluationCategory.NL2AL: return JudgeBasedEvaluationResultSummary + case EvaluationCategory.HELLO_WORLD: + return JudgeBasedEvaluationResultSummary raise ValueError(f"Unknown evaluation category: {self}") @@ -218,12 +228,14 @@ def aggregate_class(self) -> type[LeaderboardAggregate]: return CodeReviewLeaderboardAggregate case EvaluationCategory.NL2AL: return JudgeBasedLeaderboardAggregate + case EvaluationCategory.HELLO_WORLD: + return JudgeBasedLeaderboardAggregate raise ValueError(f"Unknown evaluation category: {self}") @property def pipeline(self) -> EvaluationPipeline: - from bcbench.evaluate 
import BugFixPipeline, CodeReviewPipeline, NL2ALPipeline, TestGenerationPipeline + from bcbench.evaluate import BugFixPipeline, CodeReviewPipeline, HelloWorldPipeline, NL2ALPipeline, TestGenerationPipeline match self: case EvaluationCategory.BUG_FIX: @@ -234,6 +246,8 @@ def pipeline(self) -> EvaluationPipeline: return CodeReviewPipeline() case EvaluationCategory.NL2AL: return NL2ALPipeline() + case EvaluationCategory.HELLO_WORLD: + return HelloWorldPipeline() raise ValueError(f"Unknown evaluation category: {self}") @@ -253,6 +267,8 @@ def evaluators(self) -> list[str]: return ["precision_score", "recall_score", "f1_score", "valid_review_output"] case EvaluationCategory.NL2AL: return ["lm_checklist"] + case EvaluationCategory.HELLO_WORLD: + return ["lm_checklist"] raise ValueError(f"Unknown evaluation category: {self}") @@ -264,7 +280,7 @@ def core_score(self) -> str: return "ResolutionRate" case EvaluationCategory.CODE_REVIEW: return "F1Score" - case EvaluationCategory.NL2AL: + case EvaluationCategory.NL2AL | EvaluationCategory.HELLO_WORLD: return "test_passed" raise ValueError(f"Unknown evaluation category: {self}") @@ -275,7 +291,7 @@ def requires_container(self) -> bool: match self: case EvaluationCategory.BUG_FIX | EvaluationCategory.TEST_GENERATION: return True - case EvaluationCategory.CODE_REVIEW | EvaluationCategory.NL2AL: + case EvaluationCategory.CODE_REVIEW | EvaluationCategory.NL2AL | EvaluationCategory.HELLO_WORLD: return False raise ValueError(f"Unknown evaluation category: {self}") @@ -293,6 +309,8 @@ def runner(self) -> str: return "ubuntu-latest" case EvaluationCategory.NL2AL: return "windows-latest" + case EvaluationCategory.HELLO_WORLD: + return "ubuntu-latest" raise ValueError(f"Unknown evaluation category: {self}") diff --git a/tests/conftest.py b/tests/conftest.py index 789dca6a2..ab0697ec7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,7 +13,7 @@ import pytest -from bcbench.dataset import BaseDatasetEntry, BugFixEntry, 
NL2ALEntry, TestEntry +from bcbench.dataset import BaseDatasetEntry, BugFixEntry, HelloWorldEntry, NL2ALEntry, TestEntry from bcbench.dataset.codereview import CodeReviewEntry, ReviewComment, Severity from bcbench.dataset.dataset_entry import EntryMetadata, _BugFixTestGenBase from bcbench.evaluate.review_parsing import parse_review_output @@ -340,3 +340,37 @@ def create_nl2al_entry( @pytest.fixture def sample_nl2al_entry() -> NL2ALEntry: return create_nl2al_entry() + + +def create_hello_world_entry( + instance_id: str = "helloworld__greeting-english-1", + repo: str = "helloworld/template", + environment_setup_version: str = VALID_ENVIRONMENT_VERSION, + project_paths: list[str] | None = None, + patch: str = VALID_PATCH, + language: str = "English", + created_at: str = VALID_CREATED_AT, + expected: list[ChecklistAssertion] | None = None, +) -> HelloWorldEntry: + if project_paths is None: + project_paths = ["HelloWorldGreeting"] + + if expected is None: + expected = [ChecklistAssertion(text="The output defines an AL codeunit named Greeting.", level="critical")] + + return HelloWorldEntry( + instance_id=instance_id, + repo=repo, + base_commit=None, + environment_setup_version=environment_setup_version, + project_paths=project_paths, + patch=patch, + language=language, + created_at=created_at, + expected=expected, + ) + + +@pytest.fixture +def sample_hello_world_entry() -> HelloWorldEntry: + return create_hello_world_entry() diff --git a/tests/test_evaluate_pipeline.py b/tests/test_evaluate_pipeline.py index f6a1db97b..bc10f6627 100644 --- a/tests/test_evaluate_pipeline.py +++ b/tests/test_evaluate_pipeline.py @@ -14,7 +14,7 @@ from bcbench.exceptions import AgentTimeoutError from bcbench.results.base import BaseEvaluationResult from bcbench.types import AgentMetrics, EvaluationCategory, EvaluationContext, ExperimentConfiguration -from tests.conftest import create_codereview_entry, create_dataset_entry, create_evaluation_context, create_nl2al_entry +from tests.conftest 
import create_codereview_entry, create_dataset_entry, create_evaluation_context, create_hello_world_entry, create_nl2al_entry class _StubPipeline(EvaluationPipeline[BugFixEntry]): @@ -99,6 +99,8 @@ def _entry_for_category(category: EvaluationCategory) -> BaseDatasetEntry: return create_codereview_entry() case EvaluationCategory.NL2AL: return create_nl2al_entry() + case EvaluationCategory.HELLO_WORLD: + return create_hello_world_entry() case _: return create_dataset_entry() diff --git a/tests/test_hello_world_pipeline.py b/tests/test_hello_world_pipeline.py new file mode 100644 index 000000000..f7d6e3a95 --- /dev/null +++ b/tests/test_hello_world_pipeline.py @@ -0,0 +1,66 @@ +"""Tests for the imaginary hello-world demo category.""" + +from bcbench.dataset import HelloWorldEntry +from bcbench.evaluate.hello_world import HelloWorldPipeline +from bcbench.exceptions import EmptyDiffError +from bcbench.results.base import JudgeBasedEvaluationResult +from bcbench.types import EvaluationCategory +from tests.conftest import create_evaluation_context, create_hello_world_entry + + +def _hello_world_context(tmp_path): + entry = create_hello_world_entry() + return create_evaluation_context(tmp_path, entry=entry, category=EvaluationCategory.HELLO_WORLD) # ty: ignore[invalid-argument-type] + + +def test_category_wires_hello_world_pieces(): + assert EvaluationCategory.HELLO_WORLD.entry_class is HelloWorldEntry + assert isinstance(EvaluationCategory.HELLO_WORLD.pipeline, HelloWorldPipeline) + assert EvaluationCategory.HELLO_WORLD.requires_container is False + + +def test_dataset_file_loads(): + entries = HelloWorldEntry.load(EvaluationCategory.HELLO_WORLD.dataset_path) + assert entries + assert all(isinstance(e.get_expected_output()["assertions"], list) for e in entries) + + +def test_setup_workspace_creates_git_repo(tmp_path): + entry = create_hello_world_entry() + repo_path = tmp_path / "repo" + + HelloWorldPipeline().setup_workspace(entry, repo_path) + + assert (repo_path / 
".git").is_dir() + assert (repo_path / "README.md").exists() + + +def test_empty_diff_persists_empty_output(tmp_path, monkeypatch): + ctx = _hello_world_context(tmp_path) + monkeypatch.setattr("bcbench.evaluate.hello_world.stage_and_get_diff", lambda _repo_path: (_ for _ in ()).throw(EmptyDiffError())) + + HelloWorldPipeline().evaluate(ctx) + + result = _read_only_result(ctx) + assert result.output == "" + assert result.error_message is None + + +def test_non_empty_diff_persists_raw_output(tmp_path, monkeypatch): + ctx = _hello_world_context(tmp_path) + monkeypatch.setattr("bcbench.evaluate.hello_world.stage_and_get_diff", lambda _repo_path: "diff --git a/Greeting.al b/Greeting.al\n+codeunit") + + HelloWorldPipeline().evaluate(ctx) + + result = _read_only_result(ctx) + assert "codeunit" in result.output + assert result.error_message is None + + +def _read_only_result(ctx) -> JudgeBasedEvaluationResult: + from bcbench.config import get_config + + result_file = ctx.result_dir / f"{ctx.entry.instance_id}{get_config().file_patterns.result_pattern}" + lines = result_file.read_text(encoding="utf-8").strip().splitlines() + assert len(lines) == 1 + return JudgeBasedEvaluationResult.model_validate_json(lines[0]) diff --git a/tests/test_type_exhaustiveness.py b/tests/test_type_exhaustiveness.py index 06169ba29..73a9e50e2 100644 --- a/tests/test_type_exhaustiveness.py +++ b/tests/test_type_exhaustiveness.py @@ -1,6 +1,6 @@ from pathlib import Path -from bcbench.dataset import BugFixEntry, CodeReviewEntry, NL2ALEntry +from bcbench.dataset import BugFixEntry, CodeReviewEntry, HelloWorldEntry, NL2ALEntry from bcbench.dataset.codereview import ReviewComment, Severity from bcbench.types import AgentType, EvaluationCategory @@ -40,7 +40,7 @@ def test_all_categories_have_aggregate_classes(): assert issubclass(aggregate_cls, LeaderboardAggregate) -def test_all_categories_handled_in_get_expected_output(sample_dataset_entry_with_problem_statement: BugFixEntry, sample_nl2al_entry: 
NL2ALEntry): +def test_all_categories_handled_in_get_expected_output(sample_dataset_entry_with_problem_statement: BugFixEntry, sample_nl2al_entry: NL2ALEntry, sample_hello_world_entry: HelloWorldEntry): for category in EvaluationCategory: entry_cls = category.entry_class if entry_cls == CodeReviewEntry: @@ -56,6 +56,8 @@ def test_all_categories_handled_in_get_expected_output(sample_dataset_entry_with ) elif entry_cls is NL2ALEntry: entry = sample_nl2al_entry + elif entry_cls is HelloWorldEntry: + entry = sample_hello_world_entry else: # Reconstruct entry as the category-specific type so get_expected_output() works entry = entry_cls.model_validate(sample_dataset_entry_with_problem_statement.model_dump(by_alias=True))