From bdd79f4bdec376175a0b2593402773bff64861b8 Mon Sep 17 00:00:00 2001
From: Kilian Lieret <kilian.lieret@posteo.de>
Date: Sat, 23 Aug 2025 21:40:38 -0400
Subject: [PATCH 1/8] Single player mode initial version

---
 codeclash/agents/abstract.py           | 108 +++++++++++++----
 codeclash/agents/dummy.py              |   3 +-
 codeclash/agents/minisweagent.py       |   2 +-
 codeclash/games/abstract.py            |   5 +-
 codeclash/utils/environment.py         |  20 ++++
 configs/battlesnake_single_player.yaml |  23 ++++
 configs/mini/default.yaml              |  11 +-
 main.py                                |   4 +-
 main_single_player.py                  | 158 +++++++++++++++++++++++++
 9 files changed, 301 insertions(+), 33 deletions(-)
 create mode 100644 configs/battlesnake_single_player.yaml
 create mode 100644 main_single_player.py

diff --git a/codeclash/agents/abstract.py b/codeclash/agents/abstract.py
index ec2c1a72..f54f461e 100644
--- a/codeclash/agents/abstract.py
+++ b/codeclash/agents/abstract.py
@@ -1,4 +1,5 @@
 import os
+import uuid
 from abc import ABC, abstractmethod
 
 from dotenv import load_dotenv
@@ -18,9 +19,11 @@ def __init__(
         config: dict,
         environment: Environment,
         game_context: GameContext,
-    ):
+    ) -> None:
         self.config = config
         self.name = config["name"]
+        self._player_unique_id = uuid.uuid4()
+        """Unique ID that doesn't clash even across multiple games. Used for git tags."""
         self.environment = environment
         self.game_context = game_context
         self.game_context.render_and_set_prompts()
@@ -29,28 +32,39 @@ def __init__(
             log_path=self.game_context.log_local / f"{self.name}.log",
             emoji="👤",
         )
+        self._metadata = {
+            "name": self.name,
+            "player_unique_id": self._player_unique_id,
+            "diff": {},  # mapping round -> diff
+            "incremental_diff": {},  # mapping round -> diff
+        }
 
-    @property
-    def branch_name(self):
-        """Get the branch name for the agent's codebase."""
-        return f"{self.game_context.id}.{self.name}"
-
-    def commit(self):
-        """Commit changes to the agent's codebase."""
-        r, rounds = self.game_context.round, self.game_context.rounds
-        for cmd in [
-            "git add -A",
-            f"git commit --allow-empty -m 'Round {r}/{rounds} Update'",
-        ]:
-            assert_zero_exit_code(self.environment.execute(cmd), logger=self.logger)
-        self.logger.info(f"Committed changes for {self.name} for round {r}/{rounds}")
+    # --- Main methods ---
 
-    def on_round_update(self, new_round: int):
-        """Update the agent's round to match the game round."""
+    def pre_run_hook(self, *, new_round: int) -> None:
+        """Should be called before we call the run method."""
+        if new_round == 1:
+            self._tag_round(0)
         self.game_context.round = new_round
         self.game_context.render_and_set_prompts()
 
-    def push(self):
+    def post_run_hook(self, *, round: int) -> None:
+        """Should be called after we called the run method."""
+        self._commit()
+        self._metadata["diff"][round] = self._get_round_diff(round)
+        self._metadata["incremental_diff"][round] = self._get_round_diff(
+            round, incremental=True
+        )
+
+    @abstractmethod
+    def run(self) -> None:
+        """Given the observation / recap, update the codebase"""
+
+    def get_metadata(self) -> dict:
+        """Get metadata for the agent."""
+        return self._metadata
+
+    def push(self) -> None:
         """Push codebase to a branch on the game's remote repository."""
         token = os.getenv("GITHUB_TOKEN")
         if not token:
@@ -59,13 +73,61 @@ def push(self):
         for cmd in [
             "git remote remove origin",
             f"git remote add origin https://x-access-token:{token}@github.com/{GH_ORG}/{self.game_context.name}.git",
-            f"git push origin {self.branch_name}",
+            f"git push origin {self._branch_name}",
+            "git push origin --tags",
         ]:
             assert_zero_exit_code(self.environment.execute(cmd), logger=self.logger)
         self.logger.info(
-            f"Pushed {self.name} commit history to remote repository (branch {self.branch_name})"
+            f"Pushed {self.name} commit history to remote repository (branch {self._branch_name})"
         )
 
-    @abstractmethod
-    def run(self):
-        """Given the observation / recap, update the codebase"""
+    # --- Helper methods ---
+
+    def _tag_round(self, round: int) -> None:
+        """Git tag the codebase at the given round."""
+        assert_zero_exit_code(
+            self.environment.execute(
+                f"git tag -a {self._get_round_tag_name(round)} -m 'Round {round} Update'"
+            ),
+            logger=self.logger,
+        )
+
+    @property
+    def _branch_name(self) -> str:
+        """Get the branch name for the agent's codebase."""
+        return f"{self.game_context.id}.{self.name}"
+
+    def _get_round_tag_name(self, round: int) -> str:
+        """Get git tag name for the version of the codebase at the given round."""
+        return f"{self._player_unique_id}-round-{round}"
+
+    def _commit(self) -> None:
+        """Commit changes to the agent's codebase."""
+        r = self.game_context.round
+        for cmd in [
+            "git add -A",
+            f"git commit --allow-empty -m 'Round {r} Update'",
+        ]:
+            assert_zero_exit_code(self.environment.execute(cmd), logger=self.logger)
+        self._tag_round(r)
+        self.logger.info(f"Committed changes for {self.name} for round {r}")
+
+    def _get_round_diff(self, round: int, *, incremental: bool = False) -> str:
+        """Get the diff between the round and initial version (round 0).
+        If incremental is True, get the diff between the round and the previous round.
+        Returns empty string if round is 0.
+        """
+        if round == 0:
+            return ""
+        if incremental:
+            previous_round_tag = self._get_round_tag_name(round - 1)
+        else:
+            previous_round_tag = self._get_round_tag_name(0)
+        current_round_tag = self._get_round_tag_name(round)
+        out = assert_zero_exit_code(
+            self.environment.execute(
+                f"git diff {previous_round_tag}..{current_round_tag}"
+            ),
+            logger=self.logger,
+        )
+        return out["output"]
diff --git a/codeclash/agents/dummy.py b/codeclash/agents/dummy.py
index 514341c0..ffa882fc 100644
--- a/codeclash/agents/dummy.py
+++ b/codeclash/agents/dummy.py
@@ -5,4 +5,5 @@ class Dummy(Player):
     """A dummy player that does nothing. Mainly for testing purposes."""
 
     def run(self):
-        self.commit()
+        pass
+        # self.commit()  # now called in post_run_hook
diff --git a/codeclash/agents/minisweagent.py b/codeclash/agents/minisweagent.py
index 645131f2..ec5d7cb8 100644
--- a/codeclash/agents/minisweagent.py
+++ b/codeclash/agents/minisweagent.py
@@ -110,4 +110,4 @@ def run(self):
                 traj_path,
                 self.game_context.log_env / traj_path.name,
             )
-            self.commit()
+            # self.commit()  # now called in post_run_hook
diff --git a/codeclash/games/abstract.py b/codeclash/games/abstract.py
index ac20956a..3b9a7c2c 100644
--- a/codeclash/games/abstract.py
+++ b/codeclash/games/abstract.py
@@ -41,7 +41,7 @@ def __init__(self, config: dict):
             self.name, log_path=self.log_local / "game.log", emoji="🏓"
         )
         self.environment: DockerEnvironment = self.get_environment()
-        assert len(config["players"]) >= 2, "At least two players are required"
+        # assert len(config["players"]) >= 2, "At least two players are required"
 
     @property
     def image_name(self) -> str:
@@ -124,9 +124,6 @@ def _pre_round_setup(self, agents: list[Player]):
         """Copy agent codebases into game's container and make round log file"""
         self.round += 1
         # Notify agents of round update
-        for agent in agents:
-            if hasattr(agent, "on_round_update"):
-                agent.on_round_update(self.round)
         self.logger.info(f"▶️ Running {self.name} round {self.round}...")
 
         # Copy agent codebases into game's container
diff --git a/codeclash/utils/environment.py b/codeclash/utils/environment.py
index 3525547f..1d6f6aa1 100644
--- a/codeclash/utils/environment.py
+++ b/codeclash/utils/environment.py
@@ -113,3 +113,23 @@ def copy_file_from_container(
             f"Failed to copy {container.container_id}:{src_path} to {dest_path}: {result.stdout}{result.stderr}"
         )
     return result
+
+
+def create_file_on_container(
+    container: DockerEnvironment,
+    *,
+    content: str,
+    dest_path: str | Path,
+):
+    """
+    Create a file with given content on a Docker container.
+    Uses a temporary file on the local filesystem for the transfer.
+    """
+    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
+        tmp_file.write(content)
+        tmp_file_path = Path(tmp_file.name)
+
+    try:
+        copy_file_to_container(container, tmp_file_path, dest_path)
+    finally:
+        tmp_file_path.unlink()  # Clean up the temporary file
diff --git a/configs/battlesnake_single_player.yaml b/configs/battlesnake_single_player.yaml
new file mode 100644
index 00000000..64985c40
--- /dev/null
+++ b/configs/battlesnake_single_player.yaml
@@ -0,0 +1,23 @@
+game:
+  name: BattleSnake
+  rounds: 5
+  args:
+    width: 11
+    height: 11
+    browser: false
+player:
+  agent: mini
+  config: configs/mini/default.yaml
+  model: openai/gpt-5-mini
+  name: main
+prompts:
+  game_description: |
+    You are a software developer ({{player_id}}) competing in a coding game called BattleSnake.
+    Your bot (`main.py`) controls a snake on a grid-based board.
+    Snakes collect food, avoid collisions, and try to outlast their opponents.
+
+    The game is played in {{rounds}} rounds. For every round, you (and your competitor) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `main.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
diff --git a/configs/mini/default.yaml b/configs/mini/default.yaml
index 7450d86f..c6c762e0 100644
--- a/configs/mini/default.yaml
+++ b/configs/mini/default.yaml
@@ -1,6 +1,11 @@
 agent:
   system_template: |
-    You are a helpful assistant editing a codebase to play a programming game.
+    You are a helpful assistant interacting continuously with a computer by submitting commands.
+    You'll be editing a codebase to play a programming game.
+
+    <important>
+    This is an interactive process where you will think and issue ONE command, see its result, then think and issue your next command.
+    </important>
 
     Your response must contain exactly ONE bash code block with ONE command (or commands connected with && or ||).
     Include a THOUGHT section before your command where you explain your reasoning process.
@@ -142,8 +147,8 @@ agent:
 
     Note: In rare cases, if you need to reference a similar format in your command, you might have
     to proceed in two steps, first writing TRIPLEBACKTICKSBASH, then replacing them with ```bash.
-  step_limit: 0.
-  cost_limit: 0.
+  step_limit: 30
+  cost_limit: 1.
 environment:
   env:
     PAGER: cat
diff --git a/main.py b/main.py
index c9678719..7c965c0c 100644
--- a/main.py
+++ b/main.py
@@ -17,10 +17,12 @@ def main(config_path: str, cleanup: bool = False, push_agent: bool = False):
         agents.append(get_agent(agent_conf, config["prompts"], game))
 
     try:
-        for _ in range(game.rounds):
+        for round in range(1, game.rounds + 1):
             game.run_round(agents)
             for agent in agents:
+                agent.pre_run_hook(new_round=round)
                 agent.run()
+                agent.post_run_hook(round=round)
     finally:
         game.end(cleanup)
         if push_agent:
diff --git a/main_single_player.py b/main_single_player.py
new file mode 100644
index 00000000..e5b6087c
--- /dev/null
+++ b/main_single_player.py
@@ -0,0 +1,158 @@
+"""
+In single player mode, the agent runs always against its previous version.
+"""
+
+import argparse
+import copy
+
+import yaml
+
+from codeclash.agents import get_agent
+from codeclash.agents.abstract import Player
+from codeclash.games import get_game
+from codeclash.games.abstract import CodeGame
+from codeclash.utils.environment import assert_zero_exit_code, create_file_on_container
+from codeclash.utils.log import get_logger
+
+
+def filter_git_diff(text: str) -> str:
+    """Return a git diff with any file sections mentioning binary content removed."""
+    lines = text.splitlines(keepends=True)
+    out: list[str] = []
+    block: list[str] = []
+    in_block = False
+    prelude_copied = False
+
+    def is_binary_block(bl: list[str]) -> bool:
+        for ln in bl:
+            s = ln.strip()
+            if ln.startswith("Binary files "):
+                return True
+            if s == "GIT binary patch":
+                return True
+        return False
+
+    for ln in lines:
+        if ln.startswith("diff --git "):
+            if in_block:
+                if not is_binary_block(block):
+                    out.extend(block)
+                block = []
+            else:
+                if not prelude_copied:
+                    prelude_copied = True
+            in_block = True
+        if in_block:
+            block.append(ln)
+        else:
+            out.append(ln)
+
+    if in_block and block:
+        if not is_binary_block(block):
+            out.extend(block)
+
+    return "".join(out)
+
+
+class SinglePlayerTraining:
+    def __init__(self, config: dict, cleanup: bool = False):
+        self.config = config
+        self.cleanup_on_end = cleanup
+        self.game: CodeGame = get_game(self.config)
+        self.agent: Player = get_agent(
+            self.config["player"], self.config["prompts"], self.game
+        )
+        mirror_agent_config = copy.deepcopy(self.config["player"])
+        mirror_agent_config["name"] = "mirror"
+        self.mirror_agent: Player = get_agent(
+            mirror_agent_config, self.config["prompts"], self.game
+        )
+        self.logger = get_logger(self.game.name)
+
+    def run(self):
+        """Main execution function that runs all rounds."""
+        try:
+            for round_num in range(1, self.game.rounds + 1):
+                self.run_round(round_num)
+        finally:
+            self.cleanup()
+
+    def run_round(self, round_num: int):
+        """Execute a single training round."""
+        self.game.run_round([self.agent, self.mirror_agent])
+        self.run_main_agent(round_num)
+        self.run_mirror_agent(round_num)
+
+    def run_main_agent(self, round_num: int):
+        """Run the main agent for the current round."""
+        self.agent.pre_run_hook(new_round=round_num)
+        self.agent.run()
+        self.agent.post_run_hook(round=round_num)
+
+    def run_mirror_agent(self, round_num: int):
+        """Update mirror agent's codebase with the main agent's changes."""
+        if round_num == 1:
+            self.logger.info("Skipping updating mirror agent for round 1")
+            return
+
+        # Set mirror agent's codebase to the main agent's codebase of the previous round
+        full_diff = self.agent.get_metadata()["diff"][round_num - 1]
+
+        full_diff = filter_git_diff(full_diff)
+
+        if full_diff.strip():
+            self.logger.debug(
+                assert_zero_exit_code(
+                    self.mirror_agent.environment.execute(
+                        "git reset --hard && git clean -fd"
+                    )
+                )
+            )
+
+            create_file_on_container(
+                container=self.mirror_agent.environment,  # type: ignore
+                content=full_diff,
+                dest_path="tmp_patch.txt",
+            )
+
+            self.logger.info("Applying patch to mirror agent's codebase")
+            self.logger.debug(f"Full diff: {full_diff}")
+
+            commands = ["git status", "git apply tmp_patch.txt", "rm -f tmp_patch.txt"]
+            for cmd in commands:
+                self.logger.debug(f"Executing command: {cmd}")
+                out = assert_zero_exit_code(
+                    self.mirror_agent.environment.execute(cmd), logger=self.logger
+                )
+                self.logger.debug(out)
+        else:
+            self.logger.info("No diff found for mirror agent, skipping update")
+
+    def cleanup(self):
+        """Clean up game resources."""
+        self.game.end(self.cleanup_on_end)
+
+
+def main(config_path: str, cleanup: bool = False):
+    with open(config_path, "r") as f:
+        config = yaml.safe_load(f)
+    training = SinglePlayerTraining(config, cleanup)
+    training.run()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="CodeClash")
+    parser.add_argument(
+        "config_path",
+        type=str,
+        default="configs/battlesnake.yaml",
+        help="Path to the config file.",
+    )
+    parser.add_argument(
+        "-c",
+        "--cleanup",
+        action="store_true",
+        help="If set, clean up the game environment after running.",
+    )
+    args = parser.parse_args()
+    main(**vars(args))

From b8427d410d4f675a722cdf2717e6e04280dc8d1f Mon Sep 17 00:00:00 2001
From: Kilian Lieret <kilian.lieret@posteo.de>
Date: Sun, 24 Aug 2025 11:25:12 -0400
Subject: [PATCH 2/8] Ref: Create new Tournament class. Move main.py code

---
 codeclash/tournaments/abstract.py             |   2 +
 codeclash/tournaments/pvp_training.py         |  51 +++++++
 .../tournaments/single_player_training.py     |  93 +++++++++++++
 codeclash/tournaments/utils/git_utils.py      |  37 +++++
 main.py                                       |  26 +---
 main_single_player.py                         | 130 +-----------------
 6 files changed, 188 insertions(+), 151 deletions(-)
 create mode 100644 codeclash/tournaments/abstract.py
 create mode 100644 codeclash/tournaments/pvp_training.py
 create mode 100644 codeclash/tournaments/single_player_training.py
 create mode 100644 codeclash/tournaments/utils/git_utils.py

diff --git a/codeclash/tournaments/abstract.py b/codeclash/tournaments/abstract.py
new file mode 100644
index 00000000..59b0e753
--- /dev/null
+++ b/codeclash/tournaments/abstract.py
@@ -0,0 +1,2 @@
+class AbstractTournament:
+    pass
diff --git a/codeclash/tournaments/pvp_training.py b/codeclash/tournaments/pvp_training.py
new file mode 100644
index 00000000..07ab57c6
--- /dev/null
+++ b/codeclash/tournaments/pvp_training.py
@@ -0,0 +1,51 @@
+"""
+PvP training mode where multiple agents compete against each other.
+"""
+
+from codeclash.agents import get_agent
+from codeclash.agents.abstract import Player
+from codeclash.games import get_game
+from codeclash.games.abstract import CodeGame
+from codeclash.tournaments.abstract import AbstractTournament
+from codeclash.utils.log import get_logger
+
+
+class PvpTraining(AbstractTournament):
+    def __init__(
+        self, config: dict, *, cleanup: bool = False, push_agent: bool = False
+    ):
+        self.config = config
+        self.cleanup_on_end = cleanup
+        self.push_agent = push_agent
+        self.game: CodeGame = get_game(self.config)
+        self.agents: list[Player] = []
+        for agent_conf in self.config["players"]:
+            self.agents.append(get_agent(agent_conf, self.config["prompts"], self.game))
+        self.logger = get_logger(self.game.name)
+
+    def run(self) -> None:
+        """Main execution function that runs all rounds."""
+        try:
+            for round_num in range(1, self.game.rounds + 1):
+                self.run_training_round(round_num)
+        finally:
+            self.cleanup()
+
+    def run_training_round(self, round_num: int) -> None:
+        """Execute a single training round."""
+        self.game.run_round(self.agents)
+        for agent in self.agents:
+            self.run_agent(agent, round_num)
+
+    def run_agent(self, agent: Player, round_num: int) -> None:
+        """Run a single agent for the current round."""
+        agent.pre_run_hook(new_round=round_num)
+        agent.run()
+        agent.post_run_hook(round=round_num)
+
+    def cleanup(self) -> None:
+        """Clean up game resources and push agents if requested."""
+        self.game.end(self.cleanup_on_end)
+        if self.push_agent:
+            for agent in self.agents:
+                agent.push()
diff --git a/codeclash/tournaments/single_player_training.py b/codeclash/tournaments/single_player_training.py
new file mode 100644
index 00000000..f2325640
--- /dev/null
+++ b/codeclash/tournaments/single_player_training.py
@@ -0,0 +1,93 @@
+"""
+In single player mode, the agent runs always against its previous version.
+"""
+
+import copy
+
+from codeclash.agents import get_agent
+from codeclash.agents.abstract import Player
+from codeclash.games import get_game
+from codeclash.games.abstract import CodeGame
+from codeclash.tournaments.abstract import AbstractTournament
+from codeclash.tournaments.utils.git_utils import filter_git_diff
+from codeclash.utils.environment import assert_zero_exit_code, create_file_on_container
+from codeclash.utils.log import get_logger
+
+
+class SinglePlayerTraining(AbstractTournament):
+    def __init__(self, config: dict, cleanup: bool = False):
+        self.config = config
+        self.cleanup_on_end = cleanup
+        self.game: CodeGame = get_game(self.config)
+        self.agent: Player = get_agent(
+            self.config["player"], self.config["prompts"], self.game
+        )
+        mirror_agent_config = copy.deepcopy(self.config["player"])
+        mirror_agent_config["name"] = "mirror"
+        self.mirror_agent: Player = get_agent(
+            mirror_agent_config, self.config["prompts"], self.game
+        )
+        self.logger = get_logger(self.game.name)
+
+    def run(self) -> None:
+        """Main execution function that runs all rounds."""
+        try:
+            for round_num in range(1, self.game.rounds + 1):
+                self.run_training_round(round_num)
+        finally:
+            self.cleanup()
+
+    def run_training_round(self, round_num: int) -> None:
+        """Execute a single training round."""
+        self.game.run_round([self.agent, self.mirror_agent])
+        self.run_main_agent(round_num)
+        self.run_mirror_agent(round_num)
+
+    def run_main_agent(self, round_num: int) -> None:
+        """Run the main agent for the current round."""
+        self.agent.pre_run_hook(new_round=round_num)
+        self.agent.run()
+        self.agent.post_run_hook(round=round_num)
+
+    def run_mirror_agent(self, round_num: int) -> None:
+        """Update mirror agent's codebase with the main agent's changes."""
+        if round_num == 1:
+            self.logger.info("Skipping updating mirror agent for round 1")
+            return
+
+        # Set mirror agent's codebase to the main agent's codebase of the previous round
+        full_diff = self.agent.get_metadata()["diff"][round_num - 1]
+
+        full_diff = filter_git_diff(full_diff)
+
+        if full_diff.strip():
+            self.logger.debug(
+                assert_zero_exit_code(
+                    self.mirror_agent.environment.execute(
+                        "git reset --hard && git clean -fd"
+                    )
+                )
+            )
+
+            create_file_on_container(
+                container=self.mirror_agent.environment,  # type: ignore
+                content=full_diff,
+                dest_path="tmp_patch.txt",
+            )
+
+            self.logger.info("Applying patch to mirror agent's codebase")
+            self.logger.debug(f"Full diff: {full_diff}")
+
+            commands = ["git status", "git apply tmp_patch.txt", "rm -f tmp_patch.txt"]
+            for cmd in commands:
+                self.logger.debug(f"Executing command: {cmd}")
+                out = assert_zero_exit_code(
+                    self.mirror_agent.environment.execute(cmd), logger=self.logger
+                )
+                self.logger.debug(out)
+        else:
+            self.logger.info("No diff found for mirror agent, skipping update")
+
+    def cleanup(self) -> None:
+        """Clean up game resources."""
+        self.game.end(self.cleanup_on_end)
diff --git a/codeclash/tournaments/utils/git_utils.py b/codeclash/tournaments/utils/git_utils.py
new file mode 100644
index 00000000..22bef669
--- /dev/null
+++ b/codeclash/tournaments/utils/git_utils.py
@@ -0,0 +1,37 @@
+def filter_git_diff(text: str) -> str:
+    """Return a git diff with any file sections mentioning binary content removed."""
+    lines = text.splitlines(keepends=True)
+    out: list[str] = []
+    block: list[str] = []
+    in_block = False
+    prelude_copied = False
+
+    def is_binary_block(bl: list[str]) -> bool:
+        for ln in bl:
+            s = ln.strip()
+            if ln.startswith("Binary files "):
+                return True
+            if s == "GIT binary patch":
+                return True
+        return False
+
+    for ln in lines:
+        if ln.startswith("diff --git "):
+            if in_block:
+                if not is_binary_block(block):
+                    out.extend(block)
+                block = []
+            else:
+                if not prelude_copied:
+                    prelude_copied = True
+            in_block = True
+        if in_block:
+            block.append(ln)
+        else:
+            out.append(ln)
+
+    if in_block and block:
+        if not is_binary_block(block):
+            out.extend(block)
+
+    return "".join(out)
diff --git a/main.py b/main.py
index 7c965c0c..14245d38 100644
--- a/main.py
+++ b/main.py
@@ -2,32 +2,14 @@
 
 import yaml
 
-from codeclash.agents import get_agent
-from codeclash.agents.abstract import Player
-from codeclash.games import get_game
-from codeclash.games.abstract import CodeGame
+from codeclash.tournaments.pvp_training import PvpTraining
 
 
-def main(config_path: str, cleanup: bool = False, push_agent: bool = False):
+def main(config_path: str, *, cleanup: bool = False, push_agent: bool = False):
     with open(config_path, "r") as f:
         config = yaml.safe_load(f)
-    game: CodeGame = get_game(config)
-    agents: list[Player] = []
-    for agent_conf in config["players"]:
-        agents.append(get_agent(agent_conf, config["prompts"], game))
-
-    try:
-        for round in range(1, game.rounds + 1):
-            game.run_round(agents)
-            for agent in agents:
-                agent.pre_run_hook(new_round=round)
-                agent.run()
-                agent.post_run_hook(round=round)
-    finally:
-        game.end(cleanup)
-        if push_agent:
-            for agent in agents:
-                agent.push()
+    training = PvpTraining(config, cleanup=cleanup, push_agent=push_agent)
+    training.run()
 
 
 if __name__ == "__main__":
diff --git a/main_single_player.py b/main_single_player.py
index e5b6087c..6c7ff720 100644
--- a/main_single_player.py
+++ b/main_single_player.py
@@ -1,136 +1,8 @@
-"""
-In single player mode, the agent runs always against its previous version.
-"""
-
 import argparse
-import copy
 
 import yaml
 
-from codeclash.agents import get_agent
-from codeclash.agents.abstract import Player
-from codeclash.games import get_game
-from codeclash.games.abstract import CodeGame
-from codeclash.utils.environment import assert_zero_exit_code, create_file_on_container
-from codeclash.utils.log import get_logger
-
-
-def filter_git_diff(text: str) -> str:
-    """Return a git diff with any file sections mentioning binary content removed."""
-    lines = text.splitlines(keepends=True)
-    out: list[str] = []
-    block: list[str] = []
-    in_block = False
-    prelude_copied = False
-
-    def is_binary_block(bl: list[str]) -> bool:
-        for ln in bl:
-            s = ln.strip()
-            if ln.startswith("Binary files "):
-                return True
-            if s == "GIT binary patch":
-                return True
-        return False
-
-    for ln in lines:
-        if ln.startswith("diff --git "):
-            if in_block:
-                if not is_binary_block(block):
-                    out.extend(block)
-                block = []
-            else:
-                if not prelude_copied:
-                    prelude_copied = True
-            in_block = True
-        if in_block:
-            block.append(ln)
-        else:
-            out.append(ln)
-
-    if in_block and block:
-        if not is_binary_block(block):
-            out.extend(block)
-
-    return "".join(out)
-
-
-class SinglePlayerTraining:
-    def __init__(self, config: dict, cleanup: bool = False):
-        self.config = config
-        self.cleanup_on_end = cleanup
-        self.game: CodeGame = get_game(self.config)
-        self.agent: Player = get_agent(
-            self.config["player"], self.config["prompts"], self.game
-        )
-        mirror_agent_config = copy.deepcopy(self.config["player"])
-        mirror_agent_config["name"] = "mirror"
-        self.mirror_agent: Player = get_agent(
-            mirror_agent_config, self.config["prompts"], self.game
-        )
-        self.logger = get_logger(self.game.name)
-
-    def run(self):
-        """Main execution function that runs all rounds."""
-        try:
-            for round_num in range(1, self.game.rounds + 1):
-                self.run_round(round_num)
-        finally:
-            self.cleanup()
-
-    def run_round(self, round_num: int):
-        """Execute a single training round."""
-        self.game.run_round([self.agent, self.mirror_agent])
-        self.run_main_agent(round_num)
-        self.run_mirror_agent(round_num)
-
-    def run_main_agent(self, round_num: int):
-        """Run the main agent for the current round."""
-        self.agent.pre_run_hook(new_round=round_num)
-        self.agent.run()
-        self.agent.post_run_hook(round=round_num)
-
-    def run_mirror_agent(self, round_num: int):
-        """Update mirror agent's codebase with the main agent's changes."""
-        if round_num == 1:
-            self.logger.info("Skipping updating mirror agent for round 1")
-            return
-
-        # Set mirror agent's codebase to the main agent's codebase of the previous round
-        full_diff = self.agent.get_metadata()["diff"][round_num - 1]
-
-        full_diff = filter_git_diff(full_diff)
-
-        if full_diff.strip():
-            self.logger.debug(
-                assert_zero_exit_code(
-                    self.mirror_agent.environment.execute(
-                        "git reset --hard && git clean -fd"
-                    )
-                )
-            )
-
-            create_file_on_container(
-                container=self.mirror_agent.environment,  # type: ignore
-                content=full_diff,
-                dest_path="tmp_patch.txt",
-            )
-
-            self.logger.info("Applying patch to mirror agent's codebase")
-            self.logger.debug(f"Full diff: {full_diff}")
-
-            commands = ["git status", "git apply tmp_patch.txt", "rm -f tmp_patch.txt"]
-            for cmd in commands:
-                self.logger.debug(f"Executing command: {cmd}")
-                out = assert_zero_exit_code(
-                    self.mirror_agent.environment.execute(cmd), logger=self.logger
-                )
-                self.logger.debug(out)
-        else:
-            self.logger.info("No diff found for mirror agent, skipping update")
-
-    def cleanup(self):
-        """Clean up game resources."""
-        self.game.end(self.cleanup_on_end)
+from codeclash.tournaments.single_player_training import SinglePlayerTraining
 
 
 def main(config_path: str, cleanup: bool = False):

From 20f8cb25f8f4076ebfa78aeae4b2644b4f07e43d Mon Sep 17 00:00:00 2001
From: Kilian Lieret <kilian.lieret@posteo.de>
Date: Sun, 24 Aug 2025 13:33:11 -0400
Subject: [PATCH 3/8] WIP: Factored out tournament class from game class

---
 codeclash/agents/abstract.py                  |   4 +-
 codeclash/games/__init__.py                   |  11 +-
 codeclash/games/abstract.py                   | 131 +++++++-----------
 codeclash/games/battlecode/main.py            |  38 +++--
 codeclash/games/battlesnake/main.py           |  47 +++++--
 codeclash/games/corewar/main.py               |  45 ++++--
 codeclash/games/robocode/main.py              |  48 +++++--
 codeclash/games/robotrumble/main.py           |  47 +++++--
 codeclash/tournaments/abstract.py             |  26 +++-
 codeclash/tournaments/pvp_training.py         |  47 ++++++-
 .../tournaments/single_player_training.py     |  63 +++++++--
 11 files changed, 352 insertions(+), 155 deletions(-)

diff --git a/codeclash/agents/abstract.py b/codeclash/agents/abstract.py
index f54f461e..1cfc39c3 100644
--- a/codeclash/agents/abstract.py
+++ b/codeclash/agents/abstract.py
@@ -3,7 +3,7 @@
 from abc import ABC, abstractmethod
 
 from dotenv import load_dotenv
-from minisweagent import Environment
+from minisweagent.environments.docker import DockerEnvironment
 
 from codeclash.agents.utils import GameContext
 from codeclash.constants import GH_ORG
@@ -17,7 +17,7 @@ class Player(ABC):
     def __init__(
         self,
         config: dict,
-        environment: Environment,
+        environment: DockerEnvironment,
         game_context: GameContext,
     ) -> None:
         self.config = config
diff --git a/codeclash/games/__init__.py b/codeclash/games/__init__.py
index d996b8bf..781c3557 100644
--- a/codeclash/games/__init__.py
+++ b/codeclash/games/__init__.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from codeclash.games.abstract import CodeGame
 from codeclash.games.battlecode.main import BattleCodeGame
 from codeclash.games.battlesnake.main import BattleSnakeGame
@@ -7,16 +9,17 @@
 
 
 # might consider postponing imports to avoid loading things we don't need
-def get_game(config: dict) -> CodeGame:
+def get_game(config: dict, *, tournament_id: str, local_output_dir: Path) -> CodeGame:
     game = {
-        x.name: x for x in [
+        x.name: x
+        for x in [
             BattleCodeGame,
             BattleSnakeGame,
             CoreWarGame,
             RoboCodeGame,
-            RobotRumbleGame
+            RobotRumbleGame,
         ]
     }.get(config["game"]["name"])
     if game is None:
         raise ValueError(f"Unknown game: {config['game']['name']}")
-    return game(config)
+    return game(config, tournament_id=tournament_id, local_output_dir=local_output_dir)
diff --git a/codeclash/games/abstract.py b/codeclash/games/abstract.py
index 3b9a7c2c..9c45e205 100644
--- a/codeclash/games/abstract.py
+++ b/codeclash/games/abstract.py
@@ -1,30 +1,30 @@
-import getpass
 import json
 import os
 import subprocess
-import time
-import traceback
 from abc import ABC, abstractmethod
 from collections import Counter
 from pathlib import Path
-from typing import Any
 
 from minisweagent.environments.docker import DockerEnvironment
 
 from codeclash.agents.abstract import Player
 from codeclash.constants import DIR_LOGS, DIR_WORK, GH_ORG
-from codeclash.utils.environment import (
-    assert_zero_exit_code,
-    copy_between_containers,
-    copy_file_from_container,
-)
+from codeclash.utils.environment import assert_zero_exit_code, copy_between_containers
 from codeclash.utils.log import get_logger
 
 
 class CodeGame(ABC):
     name: str
 
-    def __init__(self, config: dict):
+    def __init__(self, config: dict, *, tournament_id: str, local_output_dir: Path):
+        """The CodeGame class is responsible for running games, i.e., taking a list of code
+        from different agents/players and running them against each other.
+        It also provides the environments for the game and agents to run in.
+
+        The central method is `run_round`, which takes a list of agents and returns the winner of the round.
+
+        At the end of the tournament, run the `end` method to clean up the game and agents and write the metadata.
+        """
         self.url_gh: str = f"git@github.com:{GH_ORG}/{self.name}.git"
         self.artifacts: list[Path] = []
         """Artifact objects that we might want to clean up after the game."""
@@ -32,16 +32,21 @@ def __init__(self, config: dict):
         """List of (round number, winner (player id))"""
         self.game_config: dict = config["game"]
         self.config: dict = config
-        self.rounds: int = self.game_config.get("rounds", 1)
-        self.round: int = 0
-        self.game_id: str = f"{self.name}{time.strftime('%y%m%d%H%M%S')}"
+        self.game_id: str = tournament_id
         self.log_env: Path = (DIR_WORK / DIR_LOGS / self.game_id).resolve()
-        self.log_local: Path = (DIR_LOGS / getpass.getuser() / self.game_id).resolve()
+        self.log_local: Path = local_output_dir
         self.logger = get_logger(
             self.name, log_path=self.log_local / "game.log", emoji="🏓"
         )
         self.environment: DockerEnvironment = self.get_environment()
+        """The running docker environment for executing the game"""
         # assert len(config["players"]) >= 2, "At least two players are required"
+        # Metadata returned by get_metadata(), i.e. what gets written to metadata.json
+        self._metadata: dict = {
+            "name": self.name,
+            "config": self.config,
+            "game_id": self.game_id,
+        }
 
     @property
     def image_name(self) -> str:
@@ -84,12 +89,7 @@ def get_metadata(self) -> dict:
         """This is what we write to metadata.json.
         You can subclass extend this to add more details for specific games.
         """
-        return {
-            "name": self.name,
-            "scoreboard": self.scoreboard,
-            "config": self.config,
-            "game_id": self.game_id,
-        }
+        return self._metadata
 
     def end(self, cleanup: bool = False):
         self.logger.info("Overall score: %s", Counter([x[1] for x in self.scoreboard]))
@@ -121,11 +121,7 @@ def get_environment(self, branch_name: str | None = None) -> DockerEnvironment:
         return environment
 
     def _pre_round_setup(self, agents: list[Player]):
-        """Copy agent codebases into game's container and make round log file"""
-        self.round += 1
-        # Notify agents of round update
-        self.logger.info(f"▶️ Running {self.name} round {self.round}...")
-
+        """Copy agent codebases into game's container"""
         # Copy agent codebases into game's container
         for agent in agents:
             self.logger.debug(f"Copying {agent.name}'s codebase")
@@ -136,78 +132,55 @@ def _pre_round_setup(self, agents: list[Player]):
                 dest_path=f"/{agent.name}",
             )
 
-        # Ensure the log path + file exists
+        # Ensure the log directory exists
         assert_zero_exit_code(
             self.environment.execute(f"mkdir -p {self.log_env}"),
             logger=self.logger,
         )
-        assert_zero_exit_code(
-            self.environment.execute(f"touch {self.round_log_path}"), logger=self.logger
-        )
 
     @abstractmethod
-    def determine_winner(self, agents: list[Player]) -> Any:
-        """Determine the winner of the game based on the round results,
-        Should update self.scoreboard
+    def determine_winner(
+        self, result_output: str, agents: list[Player]
+    ) -> dict[str, str]:
+        """Determine the winner of the game based on the result output.
+
+        Args:
+            result_output: The specific output containing winning information
+            agents: List of agents participating in the round
+
+        Returns:
+            Dictionary with key "winner" containing the winner's name
         """
         pass
 
     @abstractmethod
-    def execute_round(self, agents: list[Player]):
-        """Subclasses implement their game-specific logic here, must write results to round_log_path.
+    def execute_round(self, agents: list[Player]) -> dict[str, str]:
+        """Subclasses implement their game-specific logic here.
         This is the low level implementation, you probably want to use run_round instead, which
         includes the pre-round setup, post-round setup, and winner determination.
+
+        Returns:
+            Dictionary with keys "log_output" and "result_output"
         """
         pass
 
-    def _post_round_setup(self, agents: list[Player]):
-        for agent in agents:
-            try:
-                copy_between_containers(
-                    self.environment,
-                    agent.environment,
-                    self.round_log_path,
-                    f"{agent.environment.config.cwd}/logs/round_{self.round}.log",
-                )
-            except Exception:
-                self.logger.error(
-                    f"Error copying round log to {agent.name}'s container: {traceback.format_exc()}"
-                )
-            else:
-                self.logger.info(f"Copied round log to {agent.name}'s container.")
-
-            try:
-                copy_file_from_container(
-                    self.environment,
-                    self.round_log_path,
-                    self.log_local / self.round_log_path.name,
-                )
-            except Exception:
-                self.logger.error(
-                    f"Error copying round log to {agent.name}'s container: {traceback.format_exc()}"
-                )
-            else:
-                self.logger.info(
-                    f"Copied round log from {agent.name}'s container to local log dir."
-                )
-        self.logger.info(f"Round {self.round} completed.")
-
-    def run_round(self, agents: list[Player]):
+    def run_round(self, agents: list[Player]) -> dict[str, str]:
         """
         Run a single round of the game with the given agents.
 
-        Writes to directory containing logs and results of the round(s).
+        Returns the log output, result output, and winner name. All bookkeeping should be
+        handled by the tournament class.
         """
         self._pre_round_setup(agents)
-        self.execute_round(agents)
-        self.determine_winner(agents)
-        last_winner = self.scoreboard[-1][1]
-        self.logger.info(f"Round {self.round} winner: {last_winner}")
-        self._post_round_setup(agents)
+        result = self.execute_round(agents)
+        log_output = result["log_output"]
+        result_output = result["result_output"]
 
-    @property
-    def round_log_path(self) -> Path:
-        """
-        Get the path to the current round's log file.
-        """
-        return self.log_env / f"round_{self.round}.log"
+        winner_result = self.determine_winner(result_output, agents)
+        winner_name = winner_result["winner"]
+
+        return {
+            "log_output": log_output,
+            "result_output": result_output,
+            "winner": winner_name,
+        }
diff --git a/codeclash/games/battlecode/main.py b/codeclash/games/battlecode/main.py
index 07df6252..59faa13f 100644
--- a/codeclash/games/battlecode/main.py
+++ b/codeclash/games/battlecode/main.py
@@ -1,4 +1,5 @@
 import re
+from pathlib import Path
 from typing import Any
 
 from codeclash.constants import DIR_WORK, RESULT_TIE
@@ -8,8 +9,10 @@
 class BattleCodeGame(CodeGame):
     name: str = "BattleCode"
 
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, config, *, tournament_id: str, local_output_dir: Path):
+        super().__init__(
+            config, tournament_id=tournament_id, local_output_dir=local_output_dir
+        )
         assert len(config["players"]) == 2, "BattleCode is a two-player game"
         self.run_cmd_round: str = "python run.py run"
         for arg, val in self.game_config.get("args", {}).items():
@@ -19,13 +22,27 @@ def __init__(self, config):
             else:
                 self.run_cmd_round += f" --{arg} {val}"
 
-    def determine_winner(self, agents: list[Any]):
-        response = self.environment.execute(f"tail -3 {self.round_log_path} | head -1")
-        winner = re.search(r"\s\((.*)\)\swins\s\(", response["output"]).group(1)
-        winner = {"A": agents[0].name, "B": agents[1].name}.get(winner, RESULT_TIE)
-        self.scoreboard.append((self.round, winner))
+    def determine_winner(self, result_output: str, agents: list[Any]) -> dict[str, str]:
+        self.logger.debug(f"Determining winner from result output: {result_output}")
+        lines = result_output.strip().split("\n")
+        # Get the third-to-last line which contains the winner info
+        winner_line = lines[-3] if len(lines) >= 3 else ""
+        self.logger.debug(f"Winner line: {winner_line}")
+        match = re.search(r"\s\((.*)\)\swins\s\(", winner_line)
+        if match:
+            winner_key = match.group(1)
+            self.logger.debug(f"Winner key from match: {winner_key}")
+            # Map A/B to actual agent names; anything else counts as a tie
+            winner = {"A": agents[0].name, "B": agents[1].name}.get(
+                winner_key, RESULT_TIE
+            )
+            self.logger.debug(f"Concluding winner: {winner}")
+            return {"winner": winner}
+        else:
+            self.logger.debug("No winner match found, returning tie")
+            return {"winner": RESULT_TIE}
 
-    def execute_round(self, agents: list[Any]):
+    def execute_round(self, agents: list[Any]) -> dict[str, str]:
         for agent in agents:
             src, dest = f"/{agent.name}/src/mysubmission/", str(
                 DIR_WORK / "src" / agent.name
@@ -35,7 +52,10 @@ def execute_round(self, agents: list[Any]):
             f"--p{idx+1}-dir src --p{idx+1} {agent.name}"
             for idx, agent in enumerate(agents)
         ]
-        cmd = f"{self.run_cmd_round} {' '.join(args)} > {self.round_log_path}"
+        cmd = f"{self.run_cmd_round} {' '.join(args)}"
         self.logger.info(f"Running command: {cmd}")
         response = self.environment.execute(cmd)
         assert response["returncode"] == 0, response
+        # For BattleCode, log_output and result_output are the same
+        output = response["output"]
+        return {"log_output": output, "result_output": output}
diff --git a/codeclash/games/battlesnake/main.py b/codeclash/games/battlesnake/main.py
index 206f6fed..d3a8187e 100644
--- a/codeclash/games/battlesnake/main.py
+++ b/codeclash/games/battlesnake/main.py
@@ -1,5 +1,6 @@
 import json
 import time
+from pathlib import Path
 
 from codeclash.agents.abstract import Player
 from codeclash.games.abstract import CodeGame
@@ -9,8 +10,10 @@
 class BattleSnakeGame(CodeGame):
     name: str = "BattleSnake"
 
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, config, *, tournament_id: str, local_output_dir: Path):
+        super().__init__(
+            config, tournament_id=tournament_id, local_output_dir=local_output_dir
+        )
         self.run_cmd_round: str = "./battlesnake play"
         for arg, val in self.game_config.get("args", {}).items():
             if isinstance(val, bool):
@@ -19,14 +22,19 @@ def __init__(self, config):
             else:
                 self.run_cmd_round += f" --{arg} {val}"
 
-    def determine_winner(self, agents: list[Player]):
-        response = assert_zero_exit_code(
-            self.environment.execute(f"tail -1 {self.round_log_path}")
-        )
-        winner = json.loads(response["output"].strip("\n"))["winnerName"]
-        self.scoreboard.append((self.round, winner))
+    def determine_winner(
+        self, result_output: str, agents: list[Player]
+    ) -> dict[str, str]:
+        self.logger.debug(f"Determining winner from result output: {result_output}")
+        lines = result_output.strip().split("\n")
+        # Get the last line which contains the game result
+        last_line = lines[-1] if lines else ""
+        self.logger.debug(f"Last line: {last_line}")
+        winner = json.loads(last_line)["winnerName"]
+        self.logger.debug(f"Concluding winner: {winner}")
+        return {"winner": winner}
 
-    def execute_round(self, agents: list[Player]):
+    def execute_round(self, agents: list[Player]) -> dict[str, str]:
         cmd = []
         for idx, agent in enumerate(agents):
             port = 8001 + idx
@@ -38,18 +46,27 @@ def execute_round(self, agents: list[Player]):
 
         time.sleep(3)  # Give servers time to start
 
-        cmd.append(f"-o {self.round_log_path}")
-        cmd = " ".join(cmd)
-        self.logger.info(f"Running command: {cmd}")
+        # Create temporary output file for results
+        output_file = f"battlesnake_output_{int(time.time())}.json"
+        cmd_str = " ".join(cmd) + f" -o {output_file}"
+        self.logger.info(f"Running command: {self.run_cmd_round} {cmd_str}")
 
-        # todo: should probably keep output somewhere?
         try:
-            assert_zero_exit_code(
+            response = assert_zero_exit_code(
                 self.environment.execute(
-                    f"{self.run_cmd_round} {cmd}",
+                    f"{self.run_cmd_round} {cmd_str}",
                     cwd=f"{self.environment.config.cwd}/game",
                 )
             )
+
+            # Read the output file for result information
+            result_response = self.environment.execute(f"cat game/{output_file}")
+            result_output = result_response["output"]
+
+            # Clean up the output file
+            self.environment.execute(f"rm -f game/{output_file}")
+
+            return {"log_output": response["output"], "result_output": result_output}
         finally:
             # Kill all python servers when done
             self.environment.execute("pkill -f 'python main.py' || true")
diff --git a/codeclash/games/corewar/main.py b/codeclash/games/corewar/main.py
index 46d5eafc..55e2960a 100644
--- a/codeclash/games/corewar/main.py
+++ b/codeclash/games/corewar/main.py
@@ -1,4 +1,5 @@
 import re
+from pathlib import Path
 
 from codeclash.agents.abstract import Player
 from codeclash.games.abstract import CodeGame
@@ -7,8 +8,10 @@
 class CoreWarGame(CodeGame):
     name: str = "CoreWar"
 
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, config, *, tournament_id: str, local_output_dir: Path):
+        super().__init__(
+            config, tournament_id=tournament_id, local_output_dir=local_output_dir
+        )
         self.run_cmd_round: str = "./src/pmars"
         for arg, val in self.game_config.get("args", {}).items():
             if isinstance(val, bool):
@@ -17,20 +20,42 @@ def __init__(self, config):
             else:
                 self.run_cmd_round += f" -{arg} {val}"
 
-    def determine_winner(self, agents: list[Player]):
+    def determine_winner(
+        self, result_output: str, agents: list[Player]
+    ) -> dict[str, str]:
+        self.logger.debug(f"Determining winner from result output: {result_output}")
         scores = []
         n = len(agents) * 2
-        response = self.environment.execute(f"tail -{n} {self.round_log_path}")
-        for line in response["output"].splitlines():
+        lines = result_output.strip().split("\n")
+        # Get the last n lines which contain the scores
+        relevant_lines = lines[-n:] if len(lines) >= n else lines
+        self.logger.debug(f"Relevant lines for scoring: {relevant_lines}")
+
+        for line in relevant_lines:
             match = re.search(r".*\sby\s.*\sscores\s(\d+)", line)
             if match:
-                scores.append(int(match.group(1)))
-        winner = agents[scores.index(max(scores))].name
-        self.scoreboard.append((self.round, winner))
+                score = int(match.group(1))
+                scores.append(score)
+                self.logger.debug(f"Found score: {score} from line: {line}")
+
+        self.logger.debug(f"All scores: {scores}")
+        if scores:
+            max_score_index = scores.index(max(scores))
+            winner = agents[max_score_index].name
+            self.logger.debug(
+                f"Concluding winner: {winner} with index {max_score_index}"
+            )
+            return {"winner": winner}
+        else:
+            self.logger.debug("No scores found, returning unknown")
+            return {"winner": "unknown"}
 
-    def execute_round(self, agents: list[Player]):
+    def execute_round(self, agents: list[Player]) -> dict[str, str]:
         args = [f"/{agent.name}/warriors/warrior.red" for agent in agents]
-        cmd = f"{self.run_cmd_round} {' '.join(args)} > {self.round_log_path}"
+        cmd = f"{self.run_cmd_round} {' '.join(args)}"
         self.logger.info(f"Running command: {cmd}")
         response = self.environment.execute(cmd)
         assert response["returncode"] == 0, response
+        # For CoreWar, log_output and result_output are the same
+        output = response["output"]
+        return {"log_output": output, "result_output": output}
diff --git a/codeclash/games/robocode/main.py b/codeclash/games/robocode/main.py
index fdcf5435..514e7ab2 100644
--- a/codeclash/games/robocode/main.py
+++ b/codeclash/games/robocode/main.py
@@ -1,4 +1,6 @@
 import subprocess
+import time
+from pathlib import Path
 
 from codeclash.agents.abstract import Player
 from codeclash.games.abstract import CodeGame
@@ -8,8 +10,10 @@
 class RoboCodeGame(CodeGame):
     name: str = "RoboCode"
 
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, config, *, tournament_id: str, local_output_dir: Path):
+        super().__init__(
+            config, tournament_id=tournament_id, local_output_dir=local_output_dir
+        )
         self.run_cmd_round: str = "./robocode.sh"
         for arg, val in self.game_config.get("args", {}).items():
             if isinstance(val, bool):
@@ -51,12 +55,23 @@ def dict_to_lines(d, prefix=""):
         dict_to_lines(default_battle_config)
         return "\n".join(battle_lines)
 
-    def determine_winner(self, agents: list[Player]):
-        response = self.environment.execute(f"head -3 {self.round_log_path} | tail -1")
-        winner = response["output"].split()[1].rsplit(".", 1)[0]
-        self.scoreboard.append((self.round, winner))
+    def determine_winner(
+        self, result_output: str, agents: list[Player]
+    ) -> dict[str, str]:
+        """Return {"winner": name} parsed from the third line of the results."""
+        self.logger.debug(f"Determining winner from result output: {result_output}")
+        lines = result_output.strip().split("\n")
+        # The winner row is the third line, as in the former `head -3 | tail -1`
+        winner_line = lines[2] if len(lines) >= 3 else ""
+        self.logger.debug(f"Winner line: {winner_line}")
+        if winner_line:
+            winner = winner_line.split()[1].rsplit(".", 1)[0]
+            self.logger.debug(f"Concluding winner: {winner}")
+            return {"winner": winner}
+        self.logger.debug("No winner line found, returning unknown")
+        return {"winner": "unknown"}
 
-    def execute_round(self, agents: list[Player]):
+    def execute_round(self, agents: list[Player]) -> dict[str, str]:
         for agent in agents:
             # Copy the agent codebase into the game codebase and compile it
             for cmd in [
@@ -69,7 +84,8 @@ def execute_round(self, agents: list[Player]):
 
         # Create .battle file
         selected_robots = ",".join([f"{agent.name}.MyTank*" for agent in agents])
-        battle_file = f"{self.game_id}-round{self.round}.battle"
+        # Use timestamp for unique battle file name since rounds are managed by tournament
+        battle_file = f"{self.game_id}-battle{int(time.time())}.battle"
         with open(battle_file, "w") as f:
             f.write(
                 f"""#Battle Properties
@@ -80,10 +96,18 @@ def execute_round(self, agents: list[Player]):
         copy_file_to_container(self.environment, battle_file, f"battles/{battle_file}")
         subprocess.run(f"rm -f {battle_file}", shell=True)
 
-        # Run battle
-        cmd = (
-            f"{self.run_cmd_round} -battle {battle_file} -results {self.round_log_path}"
-        )
+        # Run battle with results output to file
+        results_file = f"results_{int(time.time())}.txt"
+        cmd = f"{self.run_cmd_round} -battle {battle_file} -results {results_file}"
         self.logger.info(f"Running command: {cmd}")
         response = self.environment.execute(cmd)
         assert response["returncode"] == 0, response
+
+        # Read the results file to get result output
+        cat_response = self.environment.execute(f"cat {results_file}")
+        result_output = cat_response["output"]
+
+        # Clean up the results file
+        self.environment.execute(f"rm -f {results_file}")
+
+        return {"log_output": response["output"], "result_output": result_output}
diff --git a/codeclash/games/robotrumble/main.py b/codeclash/games/robotrumble/main.py
index 495d10e9..e66d3a93 100644
--- a/codeclash/games/robotrumble/main.py
+++ b/codeclash/games/robotrumble/main.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from codeclash.agents.abstract import Player
 from codeclash.constants import RESULT_TIE
 from codeclash.games.abstract import CodeGame
@@ -6,23 +8,44 @@
 class RobotRumbleGame(CodeGame):
     name: str = "RobotRumble"
 
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, config, *, tournament_id: str, local_output_dir: Path):
+        super().__init__(
+            config, tournament_id=tournament_id, local_output_dir=local_output_dir
+        )
         assert len(config["players"]) == 2, "RobotRumble is a two-player game"
         self.run_cmd_round: str = "./rumblebot run term"
 
-    def determine_winner(self, agents: list[Player]):
-        response = self.environment.execute(f"tail -2 {self.round_log_path}")
-        if "Blue won" in response["output"]:
-            self.scoreboard.append((self.round, agents[0].name))
-        elif "Red won" in response["output"]:
-            self.scoreboard.append((self.round, agents[1].name))
-        elif "it was a tie" in response["output"]:
-            self.scoreboard.append((self.round, RESULT_TIE))
+    def determine_winner(
+        self, result_output: str, agents: list[Player]
+    ) -> dict[str, str]:
+        self.logger.debug(f"Determining winner from result output: {result_output}")
+        lines = result_output.strip().split("\n")
+        # Get the last 2 lines which contain the game result
+        relevant_lines = lines[-2:] if len(lines) >= 2 else lines
+        log_text = "\n".join(relevant_lines)
+        self.logger.debug(f"Relevant lines: {log_text}")
+
+        if "Blue won" in log_text:
+            winner = agents[0].name
+            self.logger.debug(f"Blue won - Concluding winner: {winner}")
+            return {"winner": winner}
+        elif "Red won" in log_text:
+            winner = agents[1].name
+            self.logger.debug(f"Red won - Concluding winner: {winner}")
+            return {"winner": winner}
+        elif "it was a tie" in log_text:
+            self.logger.debug("Game was a tie")
+            return {"winner": RESULT_TIE}
+        else:
+            self.logger.debug("No clear result found, treating as tie")
+            return {"winner": RESULT_TIE}
 
-    def execute_round(self, agents: list[Player]):
+    def execute_round(self, agents: list[Player]) -> dict[str, str]:
         args = [f"/{agent.name}/robot.py" for agent in agents]
-        cmd = f"{self.run_cmd_round} {' '.join(args)} > {self.round_log_path}"
+        cmd = f"{self.run_cmd_round} {' '.join(args)}"
         self.logger.info(f"Running command: {cmd}")
         response = self.environment.execute(cmd)
         assert response["returncode"] == 0, response
+        # For RobotRumble, log_output and result_output are the same
+        output = response["output"]
+        return {"log_output": output, "result_output": output}
diff --git a/codeclash/tournaments/abstract.py b/codeclash/tournaments/abstract.py
index 59b0e753..4a865181 100644
--- a/codeclash/tournaments/abstract.py
+++ b/codeclash/tournaments/abstract.py
@@ -1,2 +1,26 @@
+import getpass
+import time
+from pathlib import Path
+
+from codeclash.constants import DIR_LOGS
+from codeclash.utils.log import get_logger
+
+
 class AbstractTournament:
-    pass
+    def __init__(self, config: dict, *, name: str, **kwargs):
+        self.config: dict = config
+        self.name: str = name
+        self.tournament_id: str = f"{self.name}{time.strftime('%y%m%d%H%M%S')}"
+        self.local_output_dir: Path = (
+            DIR_LOGS / getpass.getuser() / self.tournament_id
+        ).resolve()
+        self._metadata: dict = {
+            "name": self.name,
+            "tournament_id": self.tournament_id,
+        }
+        self.logger = get_logger(
+            self.name, log_path=self.local_output_dir / "tournament.log", emoji="🏆"
+        )
+
+    def get_metadata(self) -> dict:
+        return self._metadata
diff --git a/codeclash/tournaments/pvp_training.py b/codeclash/tournaments/pvp_training.py
index 07ab57c6..9528fb51 100644
--- a/codeclash/tournaments/pvp_training.py
+++ b/codeclash/tournaments/pvp_training.py
@@ -2,11 +2,14 @@
 PvP training mode where multiple agents compete against each other.
 """
 
+import traceback
+
 from codeclash.agents import get_agent
 from codeclash.agents.abstract import Player
 from codeclash.games import get_game
 from codeclash.games.abstract import CodeGame
 from codeclash.tournaments.abstract import AbstractTournament
+from codeclash.utils.environment import create_file_on_container
 from codeclash.utils.log import get_logger
 
 
@@ -14,10 +17,14 @@ class PvpTraining(AbstractTournament):
     def __init__(
         self, config: dict, *, cleanup: bool = False, push_agent: bool = False
     ):
-        self.config = config
+        super().__init__(config, name="PvpTraining")
         self.cleanup_on_end = cleanup
         self.push_agent = push_agent
-        self.game: CodeGame = get_game(self.config)
+        self.game: CodeGame = get_game(
+            self.config,
+            tournament_id=self.tournament_id,
+            local_output_dir=self.local_output_dir,
+        )
         self.agents: list[Player] = []
         for agent_conf in self.config["players"]:
             self.agents.append(get_agent(agent_conf, self.config["prompts"], self.game))
@@ -33,7 +40,23 @@ def run(self) -> None:
 
     def run_training_round(self, round_num: int) -> None:
         """Execute a single training round."""
-        self.game.run_round(self.agents)
+        # Run the game round and get results
+        result = self.game.run_round(self.agents)
+        log_output = result["log_output"]
+        result_output = result["result_output"]
+        winner = result["winner"]
+
+        # Handle bookkeeping that was previously in the game
+        self.game.scoreboard.append((round_num, winner))
+        self.logger.info(f"Round {round_num} winner: {winner}")
+
+        # Write log to file
+        round_log_path = self.game.log_local / f"round_{round_num}.log"
+        round_log_path.write_text(log_output)
+
+        # Copy log to agent environments
+        self._post_round_setup(self.agents, round_num, log_output)
+
         for agent in self.agents:
             self.run_agent(agent, round_num)
 
@@ -43,6 +66,24 @@ def run_agent(self, agent: Player, round_num: int) -> None:
         agent.run()
         agent.post_run_hook(round=round_num)
 
+    def _post_round_setup(self, agents: list, round_num: int, log_output: str) -> None:
+        """Write the round log into each agent's container; failures are only logged."""
+        for agent in agents:
+            try:
+                create_file_on_container(
+                    container=agent.environment,
+                    content=log_output,
+                    dest_path=f"logs/round_{round_num}.log",
+                )
+            except Exception:
+                self.logger.error(
+                    f"Error creating round log in {agent.name}'s container: {traceback.format_exc()}"
+                )
+            else:
+                self.logger.info(f"Created round log in {agent.name}'s container.")
+
+        self.logger.info("Round completed.")
+
     def cleanup(self) -> None:
         """Clean up game resources and push agents if requested."""
         self.game.end(self.cleanup_on_end)
diff --git a/codeclash/tournaments/single_player_training.py b/codeclash/tournaments/single_player_training.py
index f2325640..784a7749 100644
--- a/codeclash/tournaments/single_player_training.py
+++ b/codeclash/tournaments/single_player_training.py
@@ -3,6 +3,7 @@
 """
 
 import copy
+import traceback
 
 from codeclash.agents import get_agent
 from codeclash.agents.abstract import Player
@@ -16,9 +17,15 @@
 
 class SinglePlayerTraining(AbstractTournament):
     def __init__(self, config: dict, cleanup: bool = False):
-        self.config = config
+        super().__init__(config, name="SinglePlayerTraining")
         self.cleanup_on_end = cleanup
-        self.game: CodeGame = get_game(self.config)
+        self.game: CodeGame = get_game(
+            self.config,
+            tournament_id=self.tournament_id,
+            local_output_dir=self.local_output_dir,
+        )
+        # fixme: hack
+        self.game.rounds = self.config["game"]["rounds"]
         self.agent: Player = get_agent(
             self.config["player"], self.config["prompts"], self.game
         )
@@ -29,27 +36,47 @@ def __init__(self, config: dict, cleanup: bool = False):
         )
         self.logger = get_logger(self.game.name)
 
-    def run(self) -> None:
+    @property
+    def rounds(self) -> int:
+        return self.config["game"]["rounds"]
+
+    def run(self):
         """Main execution function that runs all rounds."""
         try:
-            for round_num in range(1, self.game.rounds + 1):
+            for round_num in range(1, self.rounds + 1):
                 self.run_training_round(round_num)
         finally:
             self.cleanup()
 
     def run_training_round(self, round_num: int) -> None:
         """Execute a single training round."""
-        self.game.run_round([self.agent, self.mirror_agent])
+        # Run the game round and get results
+        result = self.game.run_round([self.agent, self.mirror_agent])
+        log_output = result["log_output"]
+        result_output = result["result_output"]
+        winner = result["winner"]
+
+        # Handle bookkeeping that was previously in the game
+        self.game.scoreboard.append((round_num, winner))
+        self.logger.info(f"Round {round_num} winner: {winner}")
+
+        # Write log to file
+        round_log_path = self.game.log_local / f"round_{round_num}.log"
+        round_log_path.write_text(log_output)
+
+        # Copy log to main agent environment only
+        self._copy_game_log_to_agent([self.agent], round_num, log_output)
+
         self.run_main_agent(round_num)
         self.run_mirror_agent(round_num)
 
-    def run_main_agent(self, round_num: int) -> None:
+    def run_main_agent(self, round_num: int):
         """Run the main agent for the current round."""
         self.agent.pre_run_hook(new_round=round_num)
         self.agent.run()
         self.agent.post_run_hook(round=round_num)
 
-    def run_mirror_agent(self, round_num: int) -> None:
+    def run_mirror_agent(self, round_num: int):
         """Update mirror agent's codebase with the main agent's changes."""
         if round_num == 1:
             self.logger.info("Skipping updating mirror agent for round 1")
@@ -88,6 +115,26 @@ def run_mirror_agent(self, round_num: int) -> None:
         else:
             self.logger.info("No diff found for mirror agent, skipping update")
 
-    def cleanup(self) -> None:
+    def _copy_game_log_to_agent(
+        self, agents: list, round_num: int, log_output: str
+    ) -> None:
+        """Copy round logs to agent environments and local directory."""
+        for agent in agents:
+            try:
+                create_file_on_container(
+                    container=agent.environment,
+                    content=log_output,
+                    dest_path=f"logs/round_{round_num}.log",
+                )
+            except Exception:
+                self.logger.error(
+                    f"Error creating round log in {agent.name}'s container: {traceback.format_exc()}"
+                )
+            else:
+                self.logger.info(f"Created round log in {agent.name}'s container.")
+
+        self.logger.info("Round completed.")
+
+    def cleanup(self):
         """Clean up game resources."""
         self.game.end(self.cleanup_on_end)

From 548b6b424c843091e52a8bf6520dace1cd88def1 Mon Sep 17 00:00:00 2001
From: Kilian Lieret <kilian.lieret@posteo.de>
Date: Sun, 24 Aug 2025 16:46:43 -0400
Subject: [PATCH 4/8] Working evaluation for single player mode

---
 codeclash/agents/abstract.py                  |  40 ++++++-
 codeclash/tournaments/abstract.py             |  17 +++
 codeclash/tournaments/pvp_training.py         |  26 +----
 .../tournaments/single_player_training.py     | 106 ++++++++----------
 configs/battlesnake_single_player.yaml        |   2 +-
 5 files changed, 108 insertions(+), 83 deletions(-)

diff --git a/codeclash/agents/abstract.py b/codeclash/agents/abstract.py
index 1cfc39c3..b6f6c2ff 100644
--- a/codeclash/agents/abstract.py
+++ b/codeclash/agents/abstract.py
@@ -7,7 +7,8 @@
 
 from codeclash.agents.utils import GameContext
 from codeclash.constants import GH_ORG
-from codeclash.utils.environment import assert_zero_exit_code
+from codeclash.tournaments.utils.git_utils import filter_git_diff
+from codeclash.utils.environment import assert_zero_exit_code, create_file_on_container
 from codeclash.utils.log import get_logger
 
 load_dotenv()
@@ -81,6 +82,43 @@ def push(self) -> None:
             f"Pushed {self.name} commit history to remote repository (branch {self._branch_name})"
         )
 
+    def reset_and_apply_patch(
+        self, patch: str, *, base_commit: str = "", filter_patch: bool = True
+    ) -> None:
+        """Clean all uncommitted changes. If base_commit is provided, reset to that commit.
+        Then apply the patch to the codebase.
+        """
+        # Need to clean before we copy over the patch (else it's gonna be removed by git clean)
+        self.logger.debug(
+            assert_zero_exit_code(
+                self.environment.execute(
+                    f"git reset --hard {base_commit} && git clean -fd"
+                )
+            )
+        )
+
+        patch = filter_git_diff(patch) if filter_patch else patch
+
+        if not patch.strip():
+            self.logger.debug("No patch to apply, skipping")
+            return
+
+        create_file_on_container(
+            container=self.environment,  # type: ignore
+            content=patch,
+            dest_path="tmp_patch.txt",
+        )
+
+        self.logger.debug(f"Applying patch to agent's codebase: {patch}")
+
+        commands = ["git status", "git apply tmp_patch.txt", "rm -f tmp_patch.txt"]
+        for cmd in commands:
+            self.logger.debug(f"Executing command: {cmd}")
+            out = assert_zero_exit_code(
+                self.environment.execute(cmd), logger=self.logger
+            )
+            self.logger.debug(out)
+
     # --- Helper methods ---
 
     def _tag_round(self, round: int) -> None:
diff --git a/codeclash/tournaments/abstract.py b/codeclash/tournaments/abstract.py
index 4a865181..fe1a7fd0 100644
--- a/codeclash/tournaments/abstract.py
+++ b/codeclash/tournaments/abstract.py
@@ -1,8 +1,10 @@
 import getpass
 import time
+import traceback
 from pathlib import Path
 
 from codeclash.constants import DIR_LOGS
+from codeclash.utils.environment import create_file_on_container
 from codeclash.utils.log import get_logger
 
 
@@ -24,3 +26,18 @@ def __init__(self, config: dict, *, name: str, **kwargs):
 
     def get_metadata(self) -> dict:
         return self._metadata
+
+    def _copy_game_log_to_agent(self, agent, round_num: int, log_output: str) -> None:
+        """Copy round log to agent environment."""
+        try:
+            create_file_on_container(
+                container=agent.environment,
+                content=log_output,
+                dest_path=f"logs/round_{round_num}.log",
+            )
+        except Exception:
+            self.logger.error(
+                f"Error creating round log in {agent.name}'s container: {traceback.format_exc()}"
+            )
+        else:
+            self.logger.info(f"Created round log in {agent.name}'s container.")
diff --git a/codeclash/tournaments/pvp_training.py b/codeclash/tournaments/pvp_training.py
index 9528fb51..0e910606 100644
--- a/codeclash/tournaments/pvp_training.py
+++ b/codeclash/tournaments/pvp_training.py
@@ -2,14 +2,11 @@
 PvP training mode where multiple agents compete against each other.
 """
 
-import traceback
-
 from codeclash.agents import get_agent
 from codeclash.agents.abstract import Player
 from codeclash.games import get_game
 from codeclash.games.abstract import CodeGame
 from codeclash.tournaments.abstract import AbstractTournament
-from codeclash.utils.environment import create_file_on_container
 from codeclash.utils.log import get_logger
 
 
@@ -55,35 +52,20 @@ def run_training_round(self, round_num: int) -> None:
         round_log_path.write_text(log_output)
 
         # Copy log to agent environments
-        self._post_round_setup(self.agents, round_num, log_output)
+        for agent in self.agents:
+            self._copy_game_log_to_agent(agent, round_num, log_output)
 
         for agent in self.agents:
             self.run_agent(agent, round_num)
 
+        self.logger.info("Round completed.")
+
     def run_agent(self, agent: Player, round_num: int) -> None:
         """Run a single agent for the current round."""
         agent.pre_run_hook(new_round=round_num)
         agent.run()
         agent.post_run_hook(round=round_num)
 
-    def _post_round_setup(self, agents: list, round_num: int, log_output: str) -> None:
-        """Copy round logs to agent environments and local directory."""
-        for agent in agents:
-            try:
-                create_file_on_container(
-                    container=agent.environment,
-                    content=log_output,
-                    dest_path=f"logs/round_{round_num}.log",
-                )
-            except Exception:
-                self.logger.error(
-                    f"Error creating round log in {agent.name}'s container: {traceback.format_exc()}"
-                )
-            else:
-                self.logger.info(f"Created round log in {agent.name}'s container.")
-
-        self.logger.info("Round completed.")
-
     def cleanup(self) -> None:
         """Clean up game resources and push agents if requested."""
         self.game.end(self.cleanup_on_end)
diff --git a/codeclash/tournaments/single_player_training.py b/codeclash/tournaments/single_player_training.py
index 784a7749..693ea915 100644
--- a/codeclash/tournaments/single_player_training.py
+++ b/codeclash/tournaments/single_player_training.py
@@ -3,7 +3,6 @@
 """
 
 import copy
-import traceback
 
 from codeclash.agents import get_agent
 from codeclash.agents.abstract import Player
@@ -11,7 +10,6 @@
 from codeclash.games.abstract import CodeGame
 from codeclash.tournaments.abstract import AbstractTournament
 from codeclash.tournaments.utils.git_utils import filter_git_diff
-from codeclash.utils.environment import assert_zero_exit_code, create_file_on_container
 from codeclash.utils.log import get_logger
 
 
@@ -45,6 +43,7 @@ def run(self):
         try:
             for round_num in range(1, self.rounds + 1):
                 self.run_training_round(round_num)
+            self.evaluate()
         finally:
             self.cleanup()
 
@@ -65,10 +64,13 @@ def run_training_round(self, round_num: int) -> None:
         round_log_path.write_text(log_output)
 
         # Copy log to main agent environment only
-        self._copy_game_log_to_agent([self.agent], round_num, log_output)
+        self._copy_game_log_to_agent(self.agent, round_num, log_output)
 
         self.run_main_agent(round_num)
-        self.run_mirror_agent(round_num)
+        mirror_agent_state = round_num - 1 if round_num > 1 else 0
+        self.set_mirror_state_to_round(mirror_agent_state)
+
+        self.logger.info("Round completed.")
 
     def run_main_agent(self, round_num: int):
         """Run the main agent for the current round."""
@@ -76,65 +78,51 @@ def run_main_agent(self, round_num: int):
         self.agent.run()
         self.agent.post_run_hook(round=round_num)
 
-    def run_mirror_agent(self, round_num: int):
+    def set_mirror_state_to_round(self, round_num: int):
         """Update mirror agent's codebase with the main agent's changes."""
-        if round_num == 1:
-            self.logger.info("Skipping updating mirror agent for round 1")
-            return
-
-        # Set mirror agent's codebase to the main agent's codebase of the previous round
-        full_diff = self.agent.get_metadata()["diff"][round_num - 1]
-
-        full_diff = filter_git_diff(full_diff)
-
-        if full_diff.strip():
-            self.logger.debug(
-                assert_zero_exit_code(
-                    self.mirror_agent.environment.execute(
-                        "git reset --hard && git clean -fd"
-                    )
-                )
-            )
-
-            create_file_on_container(
-                container=self.mirror_agent.environment,  # type: ignore
-                content=full_diff,
-                dest_path="tmp_patch.txt",
-            )
-
-            self.logger.info("Applying patch to mirror agent's codebase")
-            self.logger.debug(f"Full diff: {full_diff}")
-
-            commands = ["git status", "git apply tmp_patch.txt", "rm -f tmp_patch.txt"]
-            for cmd in commands:
-                self.logger.debug(f"Executing command: {cmd}")
-                out = assert_zero_exit_code(
-                    self.mirror_agent.environment.execute(cmd), logger=self.logger
-                )
-                self.logger.debug(out)
+        if round_num == 0:
+            full_diff = ""
         else:
-            self.logger.info("No diff found for mirror agent, skipping update")
-
-    def _copy_game_log_to_agent(
-        self, agents: list, round_num: int, log_output: str
-    ) -> None:
-        """Copy round logs to agent environments and local directory."""
-        for agent in agents:
-            try:
-                create_file_on_container(
-                    container=agent.environment,
-                    content=log_output,
-                    dest_path=f"logs/round_{round_num}.log",
-                )
-            except Exception:
-                self.logger.error(
-                    f"Error creating round log in {agent.name}'s container: {traceback.format_exc()}"
-                )
-            else:
-                self.logger.info(f"Created round log in {agent.name}'s container.")
+            full_diff = self.agent.get_metadata()["diff"][round_num]
+            full_diff = filter_git_diff(full_diff)
 
-        self.logger.info("Round completed.")
+        self.mirror_agent.reset_and_apply_patch(full_diff)
 
     def cleanup(self):
         """Clean up game resources."""
         self.game.end(self.cleanup_on_end)
+
+    def evaluate(self, n_repetitions: int = 3):
+        """Evaluate the agent's performance by
+        calculating the matrix of every round against each other.
+        """
+        p1 = get_agent(self.config["player"], self.config["prompts"], self.game)
+        p1.name = "p1"
+        p2 = get_agent(self.config["player"], self.config["prompts"], self.game)
+        p2.name = "p2"
+        matrix = {
+            p1_round: {p2_round: [] for p2_round in range(0, self.rounds + 1)}
+            for p1_round in range(0, self.rounds + 1)
+        }
+        for p1_round in range(0, self.rounds + 1):
+            for p2_round in range(0, self.rounds + 1):
+                self.logger.info(
+                    f"Evaluating agent at round {p1_round} against agent at round {p2_round}"
+                )
+                p1_patch = (
+                    self.agent.get_metadata()["diff"][p1_round] if p1_round > 0 else ""
+                )
+                p2_patch = (
+                    self.agent.get_metadata()["diff"][p2_round] if p2_round > 0 else ""
+                )
+                p1.reset_and_apply_patch(p1_patch)
+                p2.reset_and_apply_patch(p2_patch)
+                for i_repetition in range(n_repetitions):
+                    result = self.game.run_round([p1, p2])
+                    winner = result["winner"]
+                    self.logger.info(
+                        f"Round {p1_round} vs {p2_round} repetition {i_repetition} winner: {winner}"
+                    )
+                    matrix[p1_round][p2_round].append(winner)
+        self.logger.info(f"Evaluation matrix: {matrix}")
+        return matrix
diff --git a/configs/battlesnake_single_player.yaml b/configs/battlesnake_single_player.yaml
index 64985c40..984e7890 100644
--- a/configs/battlesnake_single_player.yaml
+++ b/configs/battlesnake_single_player.yaml
@@ -1,6 +1,6 @@
 game:
   name: BattleSnake
-  rounds: 5
+  rounds: 4
   args:
     width: 11
     height: 11

From cbeaf1fb66928029c66c1255a181e5e9ee3eafa2 Mon Sep 17 00:00:00 2001
From: Kilian Lieret <kilian.lieret@posteo.de>
Date: Mon, 25 Aug 2025 12:57:48 -0400
Subject: [PATCH 5/8] Ref/simplify: Add empty patch for round 0

---
 codeclash/agents/abstract.py                    | 4 ++--
 codeclash/tournaments/single_player_training.py | 8 ++------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/codeclash/agents/abstract.py b/codeclash/agents/abstract.py
index b6f6c2ff..3e7e85bb 100644
--- a/codeclash/agents/abstract.py
+++ b/codeclash/agents/abstract.py
@@ -36,8 +36,8 @@ def __init__(
         self._metadata = {
             "name": self.name,
             "player_unique_id": self._player_unique_id,
-            "diff": {},  # mapping round -> diff
-            "incremental_diff": {},  # mapping round -> diff
+            "diff": {0: ""},  # mapping round -> diff
+            "incremental_diff": {0: ""},  # mapping round -> diff
         }
 
     # --- Main methods ---
diff --git a/codeclash/tournaments/single_player_training.py b/codeclash/tournaments/single_player_training.py
index 693ea915..1ea81d86 100644
--- a/codeclash/tournaments/single_player_training.py
+++ b/codeclash/tournaments/single_player_training.py
@@ -80,12 +80,8 @@ def run_main_agent(self, round_num: int):
 
     def set_mirror_state_to_round(self, round_num: int):
         """Update mirror agent's codebase with the main agent's changes."""
-        if round_num == 0:
-            full_diff = ""
-        else:
-            full_diff = self.agent.get_metadata()["diff"][round_num]
-            full_diff = filter_git_diff(full_diff)
-
+        full_diff = self.agent.get_metadata()["diff"][round_num]
+        full_diff = filter_git_diff(full_diff)
         self.mirror_agent.reset_and_apply_patch(full_diff)
 
     def cleanup(self):

From 16f811789202fdc253b6b51c2dd399604ff57483 Mon Sep 17 00:00:00 2001
From: Kilian Lieret <kilian.lieret@posteo.de>
Date: Mon, 25 Aug 2025 12:58:02 -0400
Subject: [PATCH 6/8] Enh(viewer): Improve game session dropdown

Add a warning sign next to runs that probably failed; show the number of rounds; sort sessions alphabetically
---
 codeclash/viewer/app.py               | 37 +++++++++++++++++++++++++--
 codeclash/viewer/templates/index.html |  8 +++---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/codeclash/viewer/app.py b/codeclash/viewer/app.py
index 25dcd0ee..a8e62920 100644
--- a/codeclash/viewer/app.py
+++ b/codeclash/viewer/app.py
@@ -22,6 +22,25 @@ def set_log_base_directory(directory: str | Path):
     LOG_BASE_DIR = Path(directory).resolve()
 
 
+def is_probably_failed_run(log_dir: Path) -> bool:
+    """Check if a run probably failed by checking if metadata.json is missing"""
+    metadata_file = log_dir / "metadata.json"
+    return not metadata_file.exists()
+
+
+def get_round_count_from_metadata(log_dir: Path) -> Optional[int]:
+    """Extract round count from metadata.json if it exists"""
+    metadata_file = log_dir / "metadata.json"
+    if not metadata_file.exists():
+        return None
+
+    try:
+        metadata = json.loads(metadata_file.read_text())
+        return metadata.get("config", {}).get("game", {}).get("rounds")
+    except (json.JSONDecodeError, KeyError):
+        return None
+
+
 @dataclass
 class GameMetadata:
     """Metadata about a game session"""
@@ -150,9 +169,22 @@ def index():
     """Main viewer page"""
     # Get available log directories
     logs_dir = LOG_BASE_DIR
-    log_folders = []
+    log_folders_info = []
     if logs_dir.exists():
-        log_folders = [d.name for d in logs_dir.iterdir() if d.is_dir()]
+        for d in logs_dir.iterdir():
+            if d.is_dir():
+                folder_info = {
+                    "name": d.name,
+                    "is_failed": is_probably_failed_run(d),
+                    "round_count": get_round_count_from_metadata(d),
+                }
+                log_folders_info.append(folder_info)
+
+        # Sort folders alphabetically by name
+        log_folders_info.sort(key=lambda x: x["name"])
+
+    # Extract just the names for backwards compatibility
+    log_folders = [folder["name"] for folder in log_folders_info]
 
     selected_folder = request.args.get(
         "folder", log_folders[0] if log_folders else None
@@ -178,6 +210,7 @@ def index():
     return render_template(
         "index.html",
         log_folders=log_folders,
+        log_folders_info=log_folders_info,
         selected_folder=selected_folder,
         metadata=metadata,
         trajectories_by_round=trajectories_by_round,
diff --git a/codeclash/viewer/templates/index.html b/codeclash/viewer/templates/index.html
index c59c7382..40641d3d 100644
--- a/codeclash/viewer/templates/index.html
+++ b/codeclash/viewer/templates/index.html
@@ -16,10 +16,10 @@ <h1>🎮 CodeClash Trajectory Viewer</h1>
                 <div class="control-group">
                     <label for="folder-select">Game Session:</label>
                     <select id="folder-select" onchange="changeFolder()">
-                        {% for folder in log_folders %}
-                        <option value="{{ folder }}"
-                                {% if folder == selected_folder %}selected{% endif %}>
-                            {{ folder }}
+                        {% for folder_info in log_folders_info %}
+                        <option value="{{ folder_info.name }}"
+                                {% if folder_info.name == selected_folder %}selected{% endif %}>
+                            {{ folder_info.name }}{% if folder_info.is_failed %} ⚠️{% endif %}{% if folder_info.round_count %} ({{ folder_info.round_count }} rounds){% endif %}
                         </option>
                         {% endfor %}
                     </select>

From 7f4cbb439dc941da77de4d0f1bb13796763f205e Mon Sep 17 00:00:00 2001
From: Kilian Lieret <kilian.lieret@posteo.de>
Date: Mon, 25 Aug 2025 14:00:05 -0400
Subject: [PATCH 7/8] Ref: Change get_agent and prompt rendering (needs to use
 tournament)

---
 codeclash/agents/__init__.py                  | 27 ++-------
 codeclash/agents/abstract.py                  |  2 -
 codeclash/agents/minisweagent.py              |  9 +--
 codeclash/agents/utils.py                     | 31 ++++------
 codeclash/games/abstract.py                   | 11 ++--
 codeclash/games/battlecode/main.py            |  3 +-
 codeclash/games/corewar/main.py               |  3 +-
 codeclash/games/robotrumble/main.py           |  3 +-
 codeclash/tournaments/abstract.py             |  5 +-
 codeclash/tournaments/pvp_training.py         | 33 +++++++++-
 .../tournaments/single_player_training.py     | 60 ++++++++++++++-----
 configs/battlesnake_single_player.yaml        |  2 +-
 tests/test_integration.py                     | 11 ++--
 13 files changed, 120 insertions(+), 80 deletions(-)

diff --git a/codeclash/agents/__init__.py b/codeclash/agents/__init__.py
index 15c8b6f9..5dd9d0a6 100644
--- a/codeclash/agents/__init__.py
+++ b/codeclash/agents/__init__.py
@@ -1,33 +1,18 @@
+from minisweagent.environments.docker import DockerEnvironment
+
 from codeclash.agents.abstract import Player
 from codeclash.agents.dummy import Dummy
 from codeclash.agents.minisweagent import MiniSWEAgent
 from codeclash.agents.utils import GameContext
-from codeclash.constants import DIR_WORK
-from codeclash.games.abstract import CodeGame
 
 
-def get_agent(config: dict, prompts: dict, game: CodeGame) -> Player:
+def get_agent(
+    config: dict, game_context: GameContext, environment: DockerEnvironment
+) -> Player:
     agents = {
         "dummy": Dummy,
         "mini": MiniSWEAgent,
     }.get(config["agent"])
     if agents is None:
         raise ValueError(f"Unknown agent type: {config['agent']}")
-    environment = game.get_environment(
-        f"{game.game_id}.{config['name']}"
-    )  # NOTE: MUST be branch_name (defined in agents/abstract.py)
-    return agents(
-        config,
-        environment,
-        GameContext(
-            id=game.game_id,
-            log_env=game.log_env,
-            log_local=game.log_local,
-            name=game.name,
-            player_id=config["name"],
-            prompts=prompts,
-            round=1,
-            rounds=game.rounds,
-            working_dir=str(DIR_WORK),
-        ),
-    )
+    return agents(config, environment, game_context)
diff --git a/codeclash/agents/abstract.py b/codeclash/agents/abstract.py
index 3e7e85bb..ed47b8da 100644
--- a/codeclash/agents/abstract.py
+++ b/codeclash/agents/abstract.py
@@ -27,7 +27,6 @@ def __init__(
         """Unique ID that doesn't clash even accross multiple games. Used for git tags."""
         self.environment = environment
         self.game_context = game_context
-        self.game_context.render_and_set_prompts()
         self.logger = get_logger(
             self.name,
             log_path=self.game_context.log_local / f"{self.name}.log",
@@ -47,7 +46,6 @@ def pre_run_hook(self, *, new_round: int) -> None:
         if new_round == 1:
             self._tag_round(0)
         self.game_context.round = new_round
-        self.game_context.render_and_set_prompts()
 
     def post_run_hook(self, *, round: int) -> None:
         """Should be called after we called the run method."""
diff --git a/codeclash/agents/minisweagent.py b/codeclash/agents/minisweagent.py
index ec5d7cb8..34665433 100644
--- a/codeclash/agents/minisweagent.py
+++ b/codeclash/agents/minisweagent.py
@@ -8,8 +8,9 @@
 
 import yaml
 from jinja2 import Template
-from minisweagent import Environment, Model
+from minisweagent import Model
 from minisweagent.agents.default import AgentConfig, DefaultAgent
+from minisweagent.environments.docker import DockerEnvironment
 from minisweagent.models.litellm_model import LitellmModel
 from minisweagent.run.utils.save import save_traj
 from rich.console import Console
@@ -28,7 +29,7 @@ class ClashAgent(DefaultAgent):
     def __init__(
         self,
         model: Model,
-        env: Environment,
+        env: DockerEnvironment,
         name: str,
         game_context: GameContext,
         *,
@@ -56,7 +57,7 @@ def render_template(self, template: str, **kwargs) -> str:
             | asdict(self.env.config)
             | asdict(self.model.config)
             | platform.uname()._asdict()
-            | self.game_context.to_dict()
+            | self.game_context.to_template_vars()
         )
         return Template(template).render(**kwargs, **cs, **os.environ)
 
@@ -69,7 +70,7 @@ class MiniSWEAgent(Player):
     """Player with agentic code editing capabilities"""
 
     def __init__(
-        self, config: dict, environment: Environment, game_context: GameContext
+        self, config: dict, environment: DockerEnvironment, game_context: GameContext
     ):
         super().__init__(config, environment=environment, game_context=game_context)
 
diff --git a/codeclash/agents/utils.py b/codeclash/agents/utils.py
index a5de6856..16ecd13a 100644
--- a/codeclash/agents/utils.py
+++ b/codeclash/agents/utils.py
@@ -36,24 +36,15 @@ class GameContext:
     rounds: int
     working_dir: str
 
-    def render_and_set_prompts(self):
-        """Render and set prompts using the current game context."""
+    def _render_prompt_templates(self) -> dict:
         context = asdict(self)
-        del context["prompts"]
-        for key, template_str in self.prompts.items():
-            rendered = Template(template_str).render(**context)
-            setattr(self, key, rendered)
-
-    def to_dict(self):
-        """Convert the GameContext to a dictionary, including dynamically added attributes."""
-        result = asdict(self)
-        declared = set(self.__dataclass_fields__)
-        for attr in dir(self):
-            if (
-                not attr.startswith("_")
-                and attr not in declared
-                and not callable(getattr(self, attr))
-            ):
-                result[attr] = getattr(self, attr)
-        del result["prompts"]
-        return result
+        return {
+            key: Template(template_str).render(**context)
+            for key, template_str in self.prompts.items()
+        }
+
+    def to_template_vars(self) -> dict[str, str]:
+        """Convert the GameContext to a dictionary for rendering prompts in the agent"""
+        out = asdict(self) | self._render_prompt_templates()
+        out.pop("prompts")
+        return out
diff --git a/codeclash/games/abstract.py b/codeclash/games/abstract.py
index 9c45e205..ddbb316f 100644
--- a/codeclash/games/abstract.py
+++ b/codeclash/games/abstract.py
@@ -2,7 +2,6 @@
 import os
 import subprocess
 from abc import ABC, abstractmethod
-from collections import Counter
 from pathlib import Path
 
 from minisweagent.environments.docker import DockerEnvironment
@@ -24,12 +23,15 @@ def __init__(self, config: dict, *, tournament_id: str, local_output_dir: Path):
         The central method is `run_round`, which takes a list of agents and returns the winner of the round.
 
         At the end of the the tournament, run the `end` method to clean up the game and agents and write the metadata.
+
+        Args:
+            config: The overall config for the tournament.
+            tournament_id: The id of the tournament.
+            local_output_dir: The host/local directory to write logs to.
         """
         self.url_gh: str = f"git@github.com:{GH_ORG}/{self.name}.git"
         self.artifacts: list[Path] = []
         """Artifact objects that we might want to clean up after the game."""
-        self.scoreboard: list[tuple[int, str]] = []
-        """List of (round number, winner (player id))"""
         self.game_config: dict = config["game"]
         self.config: dict = config
         self.game_id: str = tournament_id
@@ -40,8 +42,6 @@ def __init__(self, config: dict, *, tournament_id: str, local_output_dir: Path):
         )
         self.environment: DockerEnvironment = self.get_environment()
         """The running docker environment for executing the game"""
-        # assert len(config["players"]) >= 2, "At least two players are required"
-        """Total number of rounds to play"""
         self._metadata: dict = {
             "name": self.name,
             "config": self.config,
@@ -92,7 +92,6 @@ def get_metadata(self) -> dict:
         return self._metadata
 
     def end(self, cleanup: bool = False):
-        self.logger.info("Overall score: %s", Counter([x[1] for x in self.scoreboard]))
         (self.log_local / "metadata.json").write_text(json.dumps(self.get_metadata()))
         if cleanup:
             for artifact in self.artifacts:
diff --git a/codeclash/games/battlecode/main.py b/codeclash/games/battlecode/main.py
index 59faa13f..43c26295 100644
--- a/codeclash/games/battlecode/main.py
+++ b/codeclash/games/battlecode/main.py
@@ -1,4 +1,5 @@
 import re
+import shlex
 from pathlib import Path
 from typing import Any
 
@@ -52,7 +53,7 @@ def execute_round(self, agents: list[Any]) -> dict[str, str]:
             f"--p{idx+1}-dir src --p{idx+1} {agent.name}"
             for idx, agent in enumerate(agents)
         ]
-        cmd = f"{self.run_cmd_round} {' '.join(args)}"
+        cmd = f"{self.run_cmd_round} {shlex.join(args)}"
         self.logger.info(f"Running command: {cmd}")
         response = self.environment.execute(cmd)
         assert response["returncode"] == 0, response
diff --git a/codeclash/games/corewar/main.py b/codeclash/games/corewar/main.py
index 55e2960a..7e6e5617 100644
--- a/codeclash/games/corewar/main.py
+++ b/codeclash/games/corewar/main.py
@@ -1,4 +1,5 @@
 import re
+import shlex
 from pathlib import Path
 
 from codeclash.agents.abstract import Player
@@ -52,7 +53,7 @@ def determine_winner(
 
     def execute_round(self, agents: list[Player]) -> dict[str, str]:
         args = [f"/{agent.name}/warriors/warrior.red" for agent in agents]
-        cmd = f"{self.run_cmd_round} {' '.join(args)}"
+        cmd = f"{self.run_cmd_round} {shlex.join(args)}"
         self.logger.info(f"Running command: {cmd}")
         response = self.environment.execute(cmd)
         assert response["returncode"] == 0, response
diff --git a/codeclash/games/robotrumble/main.py b/codeclash/games/robotrumble/main.py
index e66d3a93..8f893ee4 100644
--- a/codeclash/games/robotrumble/main.py
+++ b/codeclash/games/robotrumble/main.py
@@ -1,3 +1,4 @@
+import shlex
 from pathlib import Path
 
 from codeclash.agents.abstract import Player
@@ -42,7 +43,7 @@ def determine_winner(
 
     def execute_round(self, agents: list[Player]) -> dict[str, str]:
         args = [f"/{agent.name}/robot.py" for agent in agents]
-        cmd = f"{self.run_cmd_round} {' '.join(args)}"
+        cmd = f"{self.run_cmd_round} {shlex.join(args)}"
         self.logger.info(f"Running command: {cmd}")
         response = self.environment.execute(cmd)
         assert response["returncode"] == 0, response
diff --git a/codeclash/tournaments/abstract.py b/codeclash/tournaments/abstract.py
index fe1a7fd0..d98a9d6c 100644
--- a/codeclash/tournaments/abstract.py
+++ b/codeclash/tournaments/abstract.py
@@ -3,7 +3,10 @@
 import traceback
 from pathlib import Path
 
-from codeclash.constants import DIR_LOGS
+from codeclash.agents import get_agent
+from codeclash.agents.abstract import Player
+from codeclash.agents.utils import GameContext
+from codeclash.constants import DIR_LOGS, DIR_WORK
 from codeclash.utils.environment import create_file_on_container
 from codeclash.utils.log import get_logger
 
diff --git a/codeclash/tournaments/pvp_training.py b/codeclash/tournaments/pvp_training.py
index 0e910606..80b74b81 100644
--- a/codeclash/tournaments/pvp_training.py
+++ b/codeclash/tournaments/pvp_training.py
@@ -4,6 +4,8 @@
 
 from codeclash.agents import get_agent
 from codeclash.agents.abstract import Player
+from codeclash.agents.utils import GameContext
+from codeclash.constants import DIR_WORK
 from codeclash.games import get_game
 from codeclash.games.abstract import CodeGame
 from codeclash.tournaments.abstract import AbstractTournament
@@ -24,13 +26,38 @@ def __init__(
         )
         self.agents: list[Player] = []
         for agent_conf in self.config["players"]:
-            self.agents.append(get_agent(agent_conf, self.config["prompts"], self.game))
+            self.agents.append(self.get_agent(agent_conf, self.config["prompts"]))
         self.logger = get_logger(self.game.name)
+        self.scoreboard: list[tuple[int, str]] = []
+
+    @property
+    def rounds(self) -> int:
+        return self.config["game"]["rounds"]
+
+    def get_agent(self, agent_config: dict, prompts: dict) -> Player:
+        """Create an agent with environment and game context."""
+        environment = self.game.get_environment(
+            f"{self.game.game_id}.{agent_config['name']}"
+        )
+
+        game_context = GameContext(
+            id=self.game.game_id,
+            log_env=self.game.log_env,
+            log_local=self.game.log_local,
+            name=self.game.name,
+            player_id=agent_config["name"],
+            prompts=prompts,
+            round=1,
+            rounds=self.rounds,
+            working_dir=str(DIR_WORK),
+        )
+
+        return get_agent(agent_config, game_context, environment)
 
     def run(self) -> None:
         """Main execution function that runs all rounds."""
         try:
-            for round_num in range(1, self.game.rounds + 1):
+            for round_num in range(1, self.rounds + 1):
                 self.run_training_round(round_num)
         finally:
             self.cleanup()
@@ -44,7 +71,7 @@ def run_training_round(self, round_num: int) -> None:
         winner = result["winner"]
 
         # Handle bookkeeping that was previously in the game
-        self.game.scoreboard.append((round_num, winner))
+        self.scoreboard.append((round_num, winner))
         self.logger.info(f"Round {round_num} winner: {winner}")
 
         # Write log to file
diff --git a/codeclash/tournaments/single_player_training.py b/codeclash/tournaments/single_player_training.py
index 1ea81d86..42c02b33 100644
--- a/codeclash/tournaments/single_player_training.py
+++ b/codeclash/tournaments/single_player_training.py
@@ -6,6 +6,9 @@
 
 from codeclash.agents import get_agent
 from codeclash.agents.abstract import Player
+from codeclash.agents.dummy import Dummy
+from codeclash.agents.utils import GameContext
+from codeclash.constants import DIR_WORK
 from codeclash.games import get_game
 from codeclash.games.abstract import CodeGame
 from codeclash.tournaments.abstract import AbstractTournament
@@ -22,22 +25,47 @@ def __init__(self, config: dict, cleanup: bool = False):
             tournament_id=self.tournament_id,
             local_output_dir=self.local_output_dir,
         )
-        # fixme: hack
-        self.game.rounds = self.config["game"]["rounds"]
-        self.agent: Player = get_agent(
-            self.config["player"], self.config["prompts"], self.game
-        )
+        self.agent: Player = self.get_agent(self.config["player"], round=1)
         mirror_agent_config = copy.deepcopy(self.config["player"])
         mirror_agent_config["name"] = "mirror"
-        self.mirror_agent: Player = get_agent(
-            mirror_agent_config, self.config["prompts"], self.game
-        )
+        self.mirror_agent: Player = self.get_agent(mirror_agent_config, round=0)
         self.logger = get_logger(self.game.name)
+        self.scoreboard: list[tuple[int, str]] = []
 
     @property
     def rounds(self) -> int:
         return self.config["game"]["rounds"]
 
+    def get_game_context(self, agent_config: dict, *, round: int) -> GameContext:
+        """Create a game context for an agent."""
+        return GameContext(
+            id=self.game.game_id,
+            log_env=self.game.log_env,
+            log_local=self.game.log_local,
+            name=self.game.name,
+            player_id=agent_config["name"],
+            prompts=self.config["prompts"],
+            round=round,
+            rounds=self.rounds,
+            working_dir=str(DIR_WORK),
+        )
+
+    def get_agent(self, agent_config: dict, round: int) -> Player:
+        """Create an agent with environment and game context."""
+        environment = self.game.get_environment(
+            f"{self.game.game_id}.{agent_config['name']}"
+        )
+        game_context = self.get_game_context(agent_config, round=round)
+        return get_agent(agent_config, game_context, environment)
+
+    def get_dummy_agent(self) -> Player:
+        """Create a dummy agent that does nothing."""
+        return Dummy(
+            self.config["player"],
+            environment=self.game.get_environment(f"{self.game.game_id}.dummy"),
+            game_context=self.get_game_context(self.config["player"], round=0),
+        )
+
     def run(self):
         """Main execution function that runs all rounds."""
         try:
@@ -48,15 +76,14 @@ def run(self):
             self.cleanup()
 
     def run_training_round(self, round_num: int) -> None:
-        """Execute a single training round."""
+        """Execute a single training round, i.e., run the game, then run the agent."""
         # Run the game round and get results
         result = self.game.run_round([self.agent, self.mirror_agent])
         log_output = result["log_output"]
-        result_output = result["result_output"]
         winner = result["winner"]
 
         # Handle bookkeeping that was previously in the game
-        self.game.scoreboard.append((round_num, winner))
+        self.scoreboard.append((round_num, winner))
         self.logger.info(f"Round {round_num} winner: {winner}")
 
         # Write log to file
@@ -92,10 +119,13 @@ def evaluate(self, n_repetitions: int = 3):
         """Evaluate the agent's performance by
         calculating the matrix of every round against each other.
         """
-        p1 = get_agent(self.config["player"], self.config["prompts"], self.game)
-        p1.name = "p1"
-        p2 = get_agent(self.config["player"], self.config["prompts"], self.game)
-        p2.name = "p2"
+        # Evaluation uses do-nothing dummy agents; they are renamed after
+        # construction (as before this change) so they stay distinct as p1/p2.
+        p1 = self.get_dummy_agent()
+        p1.name = "p1"
+
+        p2 = self.get_dummy_agent()
+        p2.name = "p2"
         matrix = {
             p1_round: {p2_round: [] for p2_round in range(0, self.rounds + 1)}
             for p1_round in range(0, self.rounds + 1)
diff --git a/configs/battlesnake_single_player.yaml b/configs/battlesnake_single_player.yaml
index 984e7890..af74b703 100644
--- a/configs/battlesnake_single_player.yaml
+++ b/configs/battlesnake_single_player.yaml
@@ -1,6 +1,6 @@
 game:
   name: BattleSnake
-  rounds: 4
+  rounds: 1
   args:
     width: 11
     height: 11
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 4ef70c26..1278571e 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -42,8 +42,8 @@ def test_main_battlesnake_integration():
         def mock_get_agent(original_get_agent):
             """Wrapper to replace agent models with DeterministicModel"""
 
-            def wrapper(config, prompts, game):
-                agent = original_get_agent(config, prompts, game)
+            def wrapper(config, game_context, environment):
+                agent = original_get_agent(config, game_context, environment)
                 print("In wrapper, got agent of type ", type(agent))
 
                 # Replace model if the agent has one (specifically for MiniSWEAgent)
@@ -61,10 +61,13 @@ def wrapper(config, prompts, game):
 
             return wrapper
 
-        # Import the get_agent function and patch it where it's used in main
+        # Import the get_agent function and patch it where it's used in the tournaments
         from codeclash.agents import get_agent
 
         # Run the main function with cleanup enabled
-        with patch("main.get_agent", side_effect=mock_get_agent(get_agent)):
+        with patch(
+            "codeclash.tournaments.pvp_training.get_agent",
+            side_effect=mock_get_agent(get_agent),
+        ):
             # This should complete without raising any exceptions
             main(temp_config_path, cleanup=True, push_agent=False)

From 73f64f2c5e4f282ef71859bee557d19699869968 Mon Sep 17 00:00:00 2001
From: Kilian Lieret <kilian.lieret@posteo.de>
Date: Mon, 25 Aug 2025 15:42:10 -0400
Subject: [PATCH 8/8] Change: Move round setting to tournament section of
 config

---
 codeclash/agents/minisweagent.py                | 1 +
 codeclash/tournaments/pvp_training.py           | 2 +-
 codeclash/tournaments/single_player_training.py | 6 +++---
 configs/battlecode.yaml                         | 3 ++-
 configs/battlesnake.yaml                        | 5 +++--
 configs/battlesnake_dummy.yaml                  | 3 ++-
 configs/battlesnake_single_player.yaml          | 4 +++-
 configs/corewar.yaml                            | 3 ++-
 configs/robocode.yaml                           | 3 ++-
 configs/robotrumble.yaml                        | 1 +
 tests/test_integration.py                       | 2 +-
 11 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/codeclash/agents/minisweagent.py b/codeclash/agents/minisweagent.py
index 34665433..f3fec5d4 100644
--- a/codeclash/agents/minisweagent.py
+++ b/codeclash/agents/minisweagent.py
@@ -105,6 +105,7 @@ def run(self):
                 traj_path,
                 exit_status=exit_status,
                 result=result,
+                print_fct=self.logger.debug,
             )
             copy_file_to_container(
                 self.environment,
diff --git a/codeclash/tournaments/pvp_training.py b/codeclash/tournaments/pvp_training.py
index 80b74b81..ffc223cc 100644
--- a/codeclash/tournaments/pvp_training.py
+++ b/codeclash/tournaments/pvp_training.py
@@ -32,7 +32,7 @@ def __init__(
 
     @property
     def rounds(self) -> int:
-        return self.config["game"]["rounds"]
+        return self.config["tournament"]["rounds"]
 
     def get_agent(self, agent_config: dict, prompts: dict) -> Player:
         """Create an agent with environment and game context."""
diff --git a/codeclash/tournaments/single_player_training.py b/codeclash/tournaments/single_player_training.py
index 42c02b33..832d36b4 100644
--- a/codeclash/tournaments/single_player_training.py
+++ b/codeclash/tournaments/single_player_training.py
@@ -29,12 +29,11 @@ def __init__(self, config: dict, cleanup: bool = False):
         mirror_agent_config = copy.deepcopy(self.config["player"])
         mirror_agent_config["name"] = "mirror"
         self.mirror_agent: Player = self.get_agent(mirror_agent_config, round=0)
-        self.logger = get_logger(self.game.name)
         self.scoreboard: list[tuple[int, str]] = []
 
     @property
     def rounds(self) -> int:
-        return self.config["game"]["rounds"]
+        return self.config["tournament"]["rounds"]
 
     def get_game_context(self, agent_config: dict, *, round: int) -> GameContext:
         """Create a game context for an agent."""
@@ -71,7 +70,8 @@ def run(self):
         try:
             for round_num in range(1, self.rounds + 1):
                 self.run_training_round(round_num)
-            self.evaluate()
+            if self.config["tournament"].get("evaluate_matrix", True):
+                self.evaluate()
         finally:
             self.cleanup()
 
diff --git a/configs/battlecode.yaml b/configs/battlecode.yaml
index ec692e5f..ed6386cf 100644
--- a/configs/battlecode.yaml
+++ b/configs/battlecode.yaml
@@ -1,8 +1,9 @@
 game:
   name: BattleCode
-  rounds: 2
   args:
     maps: quack
+tournament:
+  rounds: 2
 players:
 - agent: dummy
   name: p1
diff --git a/configs/battlesnake.yaml b/configs/battlesnake.yaml
index 8e57c630..9016df1e 100644
--- a/configs/battlesnake.yaml
+++ b/configs/battlesnake.yaml
@@ -1,15 +1,16 @@
 game:
   name: BattleSnake
-  rounds: 2
   args:
     width: 11
     height: 11
     browser: false
+tournament:
+  rounds: 2
 players:
 - agent: mini
   name: p1
   config: configs/mini/default.yaml
-  model: claude-sonnet-4-20250514
+  model: openai/gpt-5-mini
 - agent: dummy
   name: p2
 prompts:
diff --git a/configs/battlesnake_dummy.yaml b/configs/battlesnake_dummy.yaml
index 1610ae33..05a8d4b4 100644
--- a/configs/battlesnake_dummy.yaml
+++ b/configs/battlesnake_dummy.yaml
@@ -1,10 +1,11 @@
 game:
   name: BattleSnake
-  rounds: 2
   args:
     width: 11
     height: 11
     browser: false
+tournament:
+  rounds: 2
 players:
 - agent: dummy
   name: p1
diff --git a/configs/battlesnake_single_player.yaml b/configs/battlesnake_single_player.yaml
index af74b703..e5f6f370 100644
--- a/configs/battlesnake_single_player.yaml
+++ b/configs/battlesnake_single_player.yaml
@@ -1,10 +1,12 @@
 game:
   name: BattleSnake
-  rounds: 1
   args:
     width: 11
     height: 11
     browser: false
+tournament:
+  rounds: 1
+  evaluate_matrix: true
 player:
   agent: mini
   config: configs/mini/default.yaml
diff --git a/configs/corewar.yaml b/configs/corewar.yaml
index a244cbe4..daaf3e2e 100644
--- a/configs/corewar.yaml
+++ b/configs/corewar.yaml
@@ -1,8 +1,9 @@
 game:
   name: CoreWar
-  rounds: 3
   args:
     r: 100
+tournament:
+  rounds: 3
 players:
 - agent: dummy
   name: p1
diff --git a/configs/robocode.yaml b/configs/robocode.yaml
index 77e05313..c401cdfd 100644
--- a/configs/robocode.yaml
+++ b/configs/robocode.yaml
@@ -1,6 +1,5 @@
 game:
   name: RoboCode
-  rounds: 3
   battle:
     battle:
       numRounds: 10
@@ -14,6 +13,8 @@ game:
   args:
     nodisplay: true
     nosound: true
+tournament:
+  rounds: 3
 players:
 - agent: dummy
   name: p1
diff --git a/configs/robotrumble.yaml b/configs/robotrumble.yaml
index 2f94d7d6..84e9e06f 100644
--- a/configs/robotrumble.yaml
+++ b/configs/robotrumble.yaml
@@ -1,5 +1,6 @@
 game:
   name: RobotRumble
+tournament:
   rounds: 3
 players:
 - agent: dummy
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 1278571e..52fa4676 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -33,7 +33,7 @@ def test_main_battlesnake_integration():
         temp_config_path = os.path.join(temp_dir, "test_battlesnake.yaml")
 
         # Reduce rounds to 1 for faster testing
-        config["game"]["rounds"] = 1
+        config["tournament"]["rounds"] = 1
 
         # Write the modified config
         with open(temp_config_path, "w") as f: