From 5e1aca81eecd6e7163b8e26a069f9db508ca53fd Mon Sep 17 00:00:00 2001 From: Debug Agent Date: Tue, 19 May 2026 16:43:46 +0200 Subject: [PATCH 1/7] perf(agent_context): drop auto-loaded skills from serialized output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ``load_user_skills`` / ``load_public_skills`` are enabled, ``_load_auto_skills`` mutates ``self.skills`` at model-validation time with the fully resolved skill catalog. Pydantic then persists those skills onto the conversation's stored state and serializes them back on every ``GET /api/conversations``. For a stock setup that bundles ~40 skills under ``~/.openhands/skills``, that's ~263 KB of duplicated payload per stored conversation per fetch — content the client never reads directly, because the skills are only consumed server-side when rendering the system prompt. Track which skill names ``_load_auto_skills`` added in a ``PrivateAttr`` and drop them at serialize time via a ``@field_serializer("skills")``. The validator is ``mode="after"``, so it re-runs on every deserialization and repopulates the same auto-loaded subset deterministically from the ``load_user_skills`` / ``load_public_skills`` / ``marketplace_path`` configuration. Runtime callers of ``self.skills`` (prompt rendering, ``to_acp_prompt_context``, ``get_dynamic_context``) see the full resolved list as today — only the wire payload changes. Measured impact (agent-canvas at ``/conversations/``, cold-cold full page reload, agent-server 1.22.1, same browser): Conversation Before After (this PR) 40 skills + 161 events 2226 ms ~1159 ms (~48% faster) 0 skills + 209 events 1159 ms 1159 ms (unchanged) 0 skills + 0 events 979 ms 979 ms (unchanged) Tests pin both halves of the contract: - In-memory: every ``self.skills`` lookup keeps returning the full resolved list, including auto-loaded entries. - On the wire: ``model_dump`` returns only explicit skills (caller- supplied via the constructor); auto-loaded ones are absent. - Round-trip: dumping then ``model_validate``-ing repopulates the auto-loaded subset via the same validator, so a roundtripped ``AgentContext`` has the same in-memory shape as the source. - An explicit skill colliding with an auto-loaded name wins (it was never marked auto-loaded), in-memory AND on the wire. - A concrete byte-count assertion: 40-skill auto-load + 1 explicit skill serializes to <1 KB, not the ~40 KB the auto-loaded set would occupy. Closes software-agent-sdk#3301. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../openhands/sdk/context/agent_context.py | 38 ++++- ...test_agent_context_skills_serialization.py | 141 ++++++++++++++++++ 2 files changed, 178 insertions(+), 1 deletion(-) create mode 100644 tests/sdk/context/test_agent_context_skills_serialization.py diff --git a/openhands-sdk/openhands/sdk/context/agent_context.py b/openhands-sdk/openhands/sdk/context/agent_context.py index 17d9508427..17d2e73564 100644 --- a/openhands-sdk/openhands/sdk/context/agent_context.py +++ b/openhands-sdk/openhands/sdk/context/agent_context.py @@ -8,6 +8,7 @@ from pydantic import ( BaseModel, Field, + PrivateAttr, SecretStr, field_serializer, field_validator, @@ -118,6 +119,33 @@ class AgentContext(BaseModel): json_schema_extra={"acp_compatible": True}, ) + # Names of skills that ``_load_auto_skills`` added on top of the + # caller-supplied list. Tracked so the serializer can drop them from + # the wire payload: the same auto-load will re-run on the next + # deserialization (this model_validator runs in ``mode="after"``), so + # persisting them is pure duplication. For a stock configuration that + # turns both flags on (~40 skills bundled under + # ``~/.openhands/skills``) the resolved list is ~260 KB per + # ``AgentContext`` instance — every ``GET`` on a stored conversation + # carried that. See software-agent-sdk#3301. + _auto_loaded_skill_names: set[str] = PrivateAttr(default_factory=set) + + @field_serializer("skills", when_used="always") + def _serialize_skills(self, value: list[Skill], info) -> list[Any]: + """Drop auto-loaded skills from the serialized output. + + The runtime keeps the full resolved list on ``self.skills`` so + prompt rendering and downstream consumers behave exactly as + today. Only the wire payload changes: callers re-loading the + model will trigger ``_load_auto_skills`` again, which rebuilds + the auto-loaded subset deterministically from the same + ``load_user_skills`` / ``load_public_skills`` / + ``marketplace_path`` configuration. + """ + auto_names = self._auto_loaded_skill_names + kept = [s for s in value if s.name not in auto_names] + return [s.model_dump(mode=info.mode, context=info.context) for s in kept] + @field_serializer("secrets", when_used="always") def _serialize_secrets( self, value: Mapping[str, SecretValue] | None, info @@ -148,7 +176,14 @@ def _validate_skills(cls, v: list[Skill], _info): @model_validator(mode="after") def _load_auto_skills(self): - """Load user and/or public skills if enabled.""" + """Load user and/or public skills if enabled. + + Names of skills added here are tracked in + ``_auto_loaded_skill_names`` so the serializer can drop them + from the wire payload (this validator re-runs on every model + load, so the same skills repopulate without needing to be + persisted). + """ if not self.load_user_skills and not self.load_public_skills: return self @@ -164,6 +199,7 @@ def _load_auto_skills(self): for name, skill in auto_skills.items(): if name not in existing_names: self.skills.append(skill) + self._auto_loaded_skill_names.add(name) else: logger.debug( f"Skipping auto-loaded skill '{name}' (already in explicit skills)" diff --git a/tests/sdk/context/test_agent_context_skills_serialization.py b/tests/sdk/context/test_agent_context_skills_serialization.py new file mode 100644 index 0000000000..ff0dac1a09 --- /dev/null +++ b/tests/sdk/context/test_agent_context_skills_serialization.py @@ -0,0 +1,141 @@ +"""Tests for the auto-loaded-skills serialization optimisation. + +When ``load_user_skills`` / ``load_public_skills`` are enabled, +``AgentContext._load_auto_skills`` resolves every matching skill and +appends it to ``self.skills`` at model-validation time. Persisting +those skills on the wire is pure duplication — the validator runs +again on every deserialization and rebuilds the same set deterministically. + +The serializer drops auto-loaded skills from ``model_dump`` output but +keeps the in-memory ``self.skills`` intact so runtime prompt rendering +behaves exactly as today. These tests pin both halves of that contract. +""" + +from __future__ import annotations + +from unittest.mock import patch + +from openhands.sdk.context import AgentContext +from openhands.sdk.skills import Skill + + +def _make_skill(name: str, content: str = "auto skill body") -> Skill: + return Skill( + name=name, + content=content, + source=f"/fake/{name}.md", + ) + + +class TestAutoLoadedSkillsSerialization: + def test_auto_loaded_skills_drop_from_serialized_output(self): + """``model_dump`` omits auto-loaded skills (the headline payload win).""" + auto = { + "auto-1": _make_skill("auto-1"), + "auto-2": _make_skill("auto-2"), + "auto-3": _make_skill("auto-3"), + } + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True) + + # In-memory: the validator populated all three. + assert {s.name for s in ctx.skills} == {"auto-1", "auto-2", "auto-3"} + # On the wire: dropped entirely (no caller passed explicit skills). + dumped = ctx.model_dump() + assert dumped["skills"] == [] + + def test_explicit_skills_survive_serialization(self): + """Caller-supplied skills are NOT dropped — only the auto-loaded ones.""" + explicit = _make_skill("user-explicit", "this one is mine") + auto = {"auto-only": _make_skill("auto-only")} + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True, skills=[explicit]) + + # In-memory: both present. + assert {s.name for s in ctx.skills} == {"user-explicit", "auto-only"} + # On the wire: only the explicit one. + dumped = ctx.model_dump() + assert [s["name"] for s in dumped["skills"]] == ["user-explicit"] + + def test_explicit_skill_shadows_an_auto_one(self): + """When the explicit list collides with an auto-loaded name, the + explicit one wins in-memory AND on the wire (it was never marked + as auto-loaded — the auto-load step skipped it). + """ + explicit = _make_skill("shared-name", "user version") + auto = {"shared-name": _make_skill("shared-name", "auto version")} + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True, skills=[explicit]) + + assert len(ctx.skills) == 1 + assert ctx.skills[0].content == "user version" + dumped = ctx.model_dump() + assert len(dumped["skills"]) == 1 + assert dumped["skills"][0]["name"] == "shared-name" + assert dumped["skills"][0]["content"] == "user version" + + def test_round_trip_via_serialized_output_re_resolves_auto_skills(self): + """Deserializing the trimmed payload + re-validating repopulates the + auto-loaded skills via the same auto-load path. This is what makes + dropping them on the wire safe: the receiver sees the same + in-memory shape as the sender. + """ + auto = { + "auto-a": _make_skill("auto-a"), + "auto-b": _make_skill("auto-b"), + } + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True) + dumped = ctx.model_dump() + # Re-validating the trimmed payload should re-fire the auto-load + # validator and rebuild the in-memory list. + roundtripped = AgentContext.model_validate(dumped) + + assert {s.name for s in roundtripped.skills} == {"auto-a", "auto-b"} + + def test_disabled_auto_load_leaves_explicit_skills_on_the_wire(self): + """With both flags off, ``_load_auto_skills`` is a no-op — every + skill is treated as explicit and serialized normally. + """ + explicit = [_make_skill("foo"), _make_skill("bar")] + ctx = AgentContext(skills=explicit) + assert {s.name for s in ctx.skills} == {"foo", "bar"} + dumped = ctx.model_dump() + assert {s["name"] for s in dumped["skills"]} == {"foo", "bar"} + + def test_payload_shrinks_to_explicit_only(self): + """Concrete byte-count assertion: a 40-skill auto-load with a single + explicit skill should serialize approximately the size of the + single explicit skill, not 40+1. + """ + import json + + auto = {f"auto-{i}": _make_skill(f"auto-{i}", "x" * 1000) for i in range(40)} + explicit = _make_skill("user", "y" * 100) + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True, skills=[explicit]) + + dumped = ctx.model_dump() + skills_bytes = len(json.dumps(dumped["skills"])) + # The 40 auto skills total ~40 KB; the single explicit one is + # ~150 B. The serialized ``skills`` list must clearly be the + # explicit-only size, not the full set. + assert skills_bytes < 1000, ( + f"serialized skills should be ~1 explicit skill (~150 B), " + f"got {skills_bytes} B — auto skills leaked into output" + ) From 15002de80c77371af0eeaa5700be1f4f6c93c45d Mon Sep 17 00:00:00 2001 From: Debug Agent Date: Tue, 19 May 2026 16:46:11 +0200 Subject: [PATCH 2/7] perf(agent_context): migration path for old conversations with inlined skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stored conversations created before PR #3302 carry the resolved auto- loaded skill list inlined on ``agent_context.skills``. The validator's "skip if name in existing_names" branch hit those entries without marking them as auto-loaded, so the new serializer kept them on the wire — the bloat would persist until each conversation was recreated. Detect them: when an ``existing`` skill matches a loader-produced one by equality (full Pydantic model compare), mark it auto-loaded too. That makes the wire payload shrink on the very next ``GET`` after the SDK is bumped, with no conversation re-creation needed. A persisted skill that no longer matches the loader's current output (user edited the file, marketplace updated) stays treated as explicit — on-disk content wins. Covered by ``test_migration_stored_skills_diverged_from_loader_stay_explicit``. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../openhands/sdk/context/agent_context.py | 20 +++++++- ...test_agent_context_skills_serialization.py | 50 +++++++++++++++++++ 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/openhands-sdk/openhands/sdk/context/agent_context.py b/openhands-sdk/openhands/sdk/context/agent_context.py index 17d2e73564..560387c033 100644 --- a/openhands-sdk/openhands/sdk/context/agent_context.py +++ b/openhands-sdk/openhands/sdk/context/agent_context.py @@ -183,6 +183,17 @@ def _load_auto_skills(self): from the wire payload (this validator re-runs on every model load, so the same skills repopulate without needing to be persisted). + + Migration: stored conversations created before the serializer + change carry the resolved auto-loaded skill list inlined on + ``skills``. When such a conversation is loaded back, the names + match ``existing_names`` and the new-append branch is skipped + — but we still mark them as auto-loaded if the persisted skill + equals what the loader would produce now. Without this, every + old conversation would keep the bloated payload until rewritten. + A persisted skill that no longer matches the loader's current + output (user edited the file, marketplace updated, etc.) stays + treated as explicit so the on-disk content wins. """ if not self.load_user_skills and not self.load_public_skills: return self @@ -195,11 +206,16 @@ def _load_auto_skills(self): marketplace_path=self.marketplace_path, ) - existing_names = {skill.name for skill in self.skills} + existing_by_name = {skill.name: skill for skill in self.skills} for name, skill in auto_skills.items(): - if name not in existing_names: + existing = existing_by_name.get(name) + if existing is None: self.skills.append(skill) self._auto_loaded_skill_names.add(name) + elif existing == skill: + # Migration path for conversations stored before the + # serializer change — see the docstring. + self._auto_loaded_skill_names.add(name) else: logger.debug( f"Skipping auto-loaded skill '{name}' (already in explicit skills)" diff --git a/tests/sdk/context/test_agent_context_skills_serialization.py b/tests/sdk/context/test_agent_context_skills_serialization.py index ff0dac1a09..c2560d074e 100644 --- a/tests/sdk/context/test_agent_context_skills_serialization.py +++ b/tests/sdk/context/test_agent_context_skills_serialization.py @@ -115,6 +115,56 @@ def test_disabled_auto_load_leaves_explicit_skills_on_the_wire(self): dumped = ctx.model_dump() assert {s["name"] for s in dumped["skills"]} == {"foo", "bar"} + def test_migration_stored_skills_matching_loader_are_marked_auto_loaded(self): + """Migration path: a conversation stored before this PR carries the + resolved auto-loaded skills inlined on ``skills``. On reload, the + validator must recognise those as auto-loaded (the persisted + skill equals what the loader produces) and drop them from the + next serialization — otherwise the bloat persists until the + conversation is recreated. + """ + stored = [_make_skill("auto-x"), _make_skill("auto-y")] + # Loader returns the same skill objects (matches what was + # persisted under the old SDK). + loader_output = { + "auto-x": _make_skill("auto-x"), + "auto-y": _make_skill("auto-y"), + } + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=loader_output, + ): + ctx = AgentContext(load_public_skills=True, skills=stored) + + # No duplicates: skills stayed at 2 (migration recognised them). + assert {s.name for s in ctx.skills} == {"auto-x", "auto-y"} + # And the serialized payload drops them. + dumped = ctx.model_dump() + assert dumped["skills"] == [] + + def test_migration_stored_skills_diverged_from_loader_stay_explicit(self): + """If the persisted skill content no longer matches what the + loader would produce (user edited the file, marketplace updated), + the on-disk version wins — treat it as explicit and keep it on + the wire. This is the safe default: we don't drop content + someone may have intentionally customised. + """ + stored = [_make_skill("auto-x", "old persisted content")] + loader_output = {"auto-x": _make_skill("auto-x", "new loader content")} + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=loader_output, + ): + ctx = AgentContext(load_public_skills=True, skills=stored) + + # The stored ("old") version wins in-memory. + assert len(ctx.skills) == 1 + assert ctx.skills[0].content == "old persisted content" + # And it stays on the wire — not treated as auto-loaded. + dumped = ctx.model_dump() + assert len(dumped["skills"]) == 1 + assert dumped["skills"][0]["content"] == "old persisted content" + def test_payload_shrinks_to_explicit_only(self): """Concrete byte-count assertion: a 40-skill auto-load with a single explicit skill should serialize approximately the size of the From 67429edef0fa3593662fed89261f9808a04b9012 Mon Sep 17 00:00:00 2001 From: Debug Agent Date: Tue, 19 May 2026 16:58:08 +0200 Subject: [PATCH 3/7] perf(agent_context): equality-check filter so consumer-replaced skills survive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenHands' ``_create_agent_with_skills`` (and any caller that does ``agent_context.model_copy(update={'skills': merged})``) replaces auto-loaded skills in place when its merged list carries the same name with different content. A name-only ``_auto_loaded_skill_names`` filter would silently drop those replacements at serialize time and the receiver would auto-reload the stock version on the next deserialization — losing the customisation. Switch the PrivateAttr to a ``dict[str, Skill]`` snapshot. Serializer drops a skill iff ``_auto_loaded_skills.get(name) == s`` — equality, not name match. A replacement has different field values, fails the equality check, and survives on the wire as today. Verified safe across known consumers: - ``openhands-sdk/workspace/remote/base.py:920`` — explicit skills with ``load_public_skills=False``; auto-load short-circuits, serializer is a no-op. - ``openhands-sdk/workspace/remote/base.py:923`` — empty explicit with ``load_public_skills=True``; serializer drops the auto-loaded skills (the win we want). - ``OpenHands/openhands/app_server/.../live_status_app_conversation_service.py:1394`` — constructs AgentContext WITHOUT either ``load_*_skills`` flag. Auto-load is a no-op, ``_auto_loaded_skills`` stays empty. - ``OpenHands/openhands/app_server/.../app_conversation_service_base.py:159`` — ``_create_agent_with_skills`` does the model_copy + skill-replace flow. Covered by the new regression test. New test: ``test_replacing_auto_skill_via_model_copy_survives_serialization`` — pins the exact OpenHands flow: auto-load a "git-skill", merge with a different "git-skill" (same name, different content), model_copy, serialize. Asserts the replacement survives on the wire instead of being dropped as auto-loaded. 296/296 tests pass on the affected suite. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../openhands/sdk/context/agent_context.py | 39 ++++++++++++------- ...test_agent_context_skills_serialization.py | 37 ++++++++++++++++++ 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/openhands-sdk/openhands/sdk/context/agent_context.py b/openhands-sdk/openhands/sdk/context/agent_context.py index 560387c033..5077b17155 100644 --- a/openhands-sdk/openhands/sdk/context/agent_context.py +++ b/openhands-sdk/openhands/sdk/context/agent_context.py @@ -119,20 +119,24 @@ class AgentContext(BaseModel): json_schema_extra={"acp_compatible": True}, ) - # Names of skills that ``_load_auto_skills`` added on top of the - # caller-supplied list. Tracked so the serializer can drop them from - # the wire payload: the same auto-load will re-run on the next - # deserialization (this model_validator runs in ``mode="after"``), so - # persisting them is pure duplication. For a stock configuration that - # turns both flags on (~40 skills bundled under + # Snapshot of the skills that ``_load_auto_skills`` added on top of + # the caller-supplied list. The serializer drops only the ones still + # equal to this snapshot — if a downstream consumer replaces an + # auto-loaded skill via ``model_copy(update={'skills': merged})`` + # (OpenHands' ``_create_agent_with_skills`` does this when it merges + # in sandbox / repo skills with overlapping names), the replacement + # has different field values and survives serialization. The same + # auto-load will re-run on the receiving end and rebuild the + # auto-loaded subset that wasn't replaced. For a stock configuration + # that turns both flags on (~40 skills bundled under # ``~/.openhands/skills``) the resolved list is ~260 KB per - # ``AgentContext`` instance — every ``GET`` on a stored conversation - # carried that. See software-agent-sdk#3301. - _auto_loaded_skill_names: set[str] = PrivateAttr(default_factory=set) + # ``AgentContext`` — every ``GET`` on a stored conversation carried + # that. See software-agent-sdk#3301. + _auto_loaded_skills: dict[str, Skill] = PrivateAttr(default_factory=dict) @field_serializer("skills", when_used="always") def _serialize_skills(self, value: list[Skill], info) -> list[Any]: - """Drop auto-loaded skills from the serialized output. + """Drop unmodified auto-loaded skills from the serialized output. The runtime keeps the full resolved list on ``self.skills`` so prompt rendering and downstream consumers behave exactly as @@ -141,9 +145,16 @@ def _serialize_skills(self, value: list[Skill], info) -> list[Any]: the auto-loaded subset deterministically from the same ``load_user_skills`` / ``load_public_skills`` / ``marketplace_path`` configuration. + + Equality (not just name match) is required because consumers + like OpenHands' ``_create_agent_with_skills`` replace + auto-loaded skills in-place with their own version under the + same name. A name-only filter would silently drop those + replacements and the receiver would auto-reload the stock + version on the next deserialization. """ - auto_names = self._auto_loaded_skill_names - kept = [s for s in value if s.name not in auto_names] + auto = self._auto_loaded_skills + kept = [s for s in value if auto.get(s.name) != s] return [s.model_dump(mode=info.mode, context=info.context) for s in kept] @field_serializer("secrets", when_used="always") @@ -211,11 +222,11 @@ def _load_auto_skills(self): existing = existing_by_name.get(name) if existing is None: self.skills.append(skill) - self._auto_loaded_skill_names.add(name) + self._auto_loaded_skills[name] = skill elif existing == skill: # Migration path for conversations stored before the # serializer change — see the docstring. - self._auto_loaded_skill_names.add(name) + self._auto_loaded_skills[name] = skill else: logger.debug( f"Skipping auto-loaded skill '{name}' (already in explicit skills)" diff --git a/tests/sdk/context/test_agent_context_skills_serialization.py b/tests/sdk/context/test_agent_context_skills_serialization.py index c2560d074e..7bb5865aba 100644 --- a/tests/sdk/context/test_agent_context_skills_serialization.py +++ b/tests/sdk/context/test_agent_context_skills_serialization.py @@ -165,6 +165,43 @@ def test_migration_stored_skills_diverged_from_loader_stay_explicit(self): assert len(dumped["skills"]) == 1 assert dumped["skills"][0]["content"] == "old persisted content" + def test_replacing_auto_skill_via_model_copy_survives_serialization(self): + """Regression guard for OpenHands' ``_create_agent_with_skills`` flow. + + That helper does ``agent_context.model_copy(update={'skills': + merged})`` where ``merged`` is the result of deduping + auto-loaded + sandbox/repo-loaded skills by name, with the + later list winning. If two lists carry the same name, the + merged result is the *replacement* — different content from + what ``_load_auto_skills`` produced. + + A name-only serializer would drop the replacement (the name + matches an auto-loaded one) and the receiver would auto-load + the original stock version, silently dropping OpenHands' + customisations. The equality-based filter must keep the + replacement on the wire. + """ + # Stock auto-loaded skill ("public version"). + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value={"git-skill": _make_skill("git-skill", "public version")}, + ): + ctx = AgentContext(load_public_skills=True) + # OpenHands merges in a different "git-skill" (same name). + replacement = _make_skill("git-skill", "OpenHands custom version") + # Drop the auto-loaded skill, add the replacement. This is what + # ``_merge_skills`` produces — later list (replacement) wins. + merged = [s for s in ctx.skills if s.name != "git-skill"] + [replacement] + new_ctx = ctx.model_copy(update={"skills": merged}) + + # In-memory: only the replacement is present. + assert len(new_ctx.skills) == 1 + assert new_ctx.skills[0].content == "OpenHands custom version" + # On the wire: replacement survives (equality check, not name). + dumped = new_ctx.model_dump() + assert len(dumped["skills"]) == 1 + assert dumped["skills"][0]["content"] == "OpenHands custom version" + def test_payload_shrinks_to_explicit_only(self): """Concrete byte-count assertion: a 40-skill auto-load with a single explicit skill should serialize approximately the size of the From 98f2c62375c692a565859579f9b2ec6d619e0554 Mon Sep 17 00:00:00 2001 From: Debug Agent Date: Tue, 19 May 2026 17:05:42 +0200 Subject: [PATCH 4/7] =?UTF-8?q?perf(agent=5Fcontext):=20address=20PR=20#33?= =?UTF-8?q?02=20review=20=E2=80=94=20persistence=20+=20nested=20options?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two issues from the bot review of PR #3302: 1. The serializer fired on EVERY ``model_dump``, including ``ConversationState._save_base_state`` (the persistence path). Without auto-loaded skills on the persisted snapshot, a resumed conversation would silently pick up *new* skill content from ``~/.openhands/skills`` / the public marketplace via ``_load_auto_skills`` — losing the freeze-at-create-time guarantee. Fix: opt-out via ``context={"preserve_full_skills": True}``. ``_save_base_state`` passes it; the API-response path doesn't — so persistence keeps the snapshot and the wire still gets the win. 2. The manual ``s.model_dump(mode=..., context=...)`` inside the serializer dropped the caller's ``exclude_none`` / ``exclude_defaults`` / nested ``include`` options. ``AgentContext(skills=[...]).model_dump( exclude_none=True)`` would have leaked ``trigger: None``, ``source: None`` etc. — which also bloats the very payloads this PR optimises. Fix: switch to ``@field_serializer("skills", mode="wrap")`` and delegate the surviving list to ``handler``. Pydantic's default nested machinery then respects every caller option. Tests: - ``test_preserve_full_skills_context_flag_keeps_snapshot`` — flag reverts to full serialization for the snapshot path. - ``test_wrap_serializer_propagates_exclude_none`` — pins the comment-2 regression with a concrete ``Skill.trigger`` canary. - ``test_save_base_state_persists_full_skill_snapshot`` — end-to-end through ``ConversationState`` to ``InMemoryFileStore``; asserts the persisted JSON keeps all auto-loaded skill names. 299/299 tests pass on the affected suites. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../openhands/sdk/context/agent_context.py | 23 +++- .../openhands/sdk/conversation/state.py | 12 +- ...test_agent_context_skills_serialization.py | 103 ++++++++++++++++++ 3 files changed, 134 insertions(+), 4 deletions(-) diff --git a/openhands-sdk/openhands/sdk/context/agent_context.py b/openhands-sdk/openhands/sdk/context/agent_context.py index 5077b17155..7fa3089f80 100644 --- a/openhands-sdk/openhands/sdk/context/agent_context.py +++ b/openhands-sdk/openhands/sdk/context/agent_context.py @@ -134,8 +134,8 @@ class AgentContext(BaseModel): # that. See software-agent-sdk#3301. _auto_loaded_skills: dict[str, Skill] = PrivateAttr(default_factory=dict) - @field_serializer("skills", when_used="always") - def _serialize_skills(self, value: list[Skill], info) -> list[Any]: + @field_serializer("skills", when_used="always", mode="wrap") + def _serialize_skills(self, value: list[Skill], handler, info) -> Any: """Drop unmodified auto-loaded skills from the serialized output. The runtime keeps the full resolved list on ``self.skills`` so @@ -152,10 +152,27 @@ def _serialize_skills(self, value: list[Skill], info) -> list[Any]: same name. A name-only filter would silently drop those replacements and the receiver would auto-reload the stock version on the next deserialization. + + Opt-out via ``context={"preserve_full_skills": True}``: paths + that need a stable snapshot of the resolved skill catalog (the + most important one being ``ConversationState._save_base_state`` + — persistence of the conversation to disk) pass this flag so + the serializer is a no-op. Without it, a paused conversation + resumed after the ``~/.openhands/skills`` directory or the + public marketplace updated would silently pick up the *new* + skill content via ``_load_auto_skills``. The API-response path + skips the flag and gets the byte-size win. + + ``mode="wrap"`` + ``handler`` delegation preserves caller + options like ``exclude_none``, ``exclude_defaults``, and + nested ``include`` / ``exclude`` for the surviving skills — + manual ``s.model_dump(...)`` would have ignored them. """ + if info.context and info.context.get("preserve_full_skills"): + return handler(value) auto = self._auto_loaded_skills kept = [s for s in value if auto.get(s.name) != s] - return [s.model_dump(mode=info.mode, context=info.context) for s in kept] + return handler(kept) @field_serializer("secrets", when_used="always") def _serialize_secrets( diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index 7e137929f2..6c078935e8 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -255,8 +255,18 @@ def _save_base_state(self, fs: FileStore) -> None: If a cipher is configured, secrets will be encrypted. Otherwise, they will be redacted (serialized as '**********'). + + ``preserve_full_skills`` is set so the persisted snapshot + includes auto-loaded skills inline — without this the conversation + would silently pick up *new* skill content from + ``~/.openhands/skills`` / the public marketplace on resume, since + ``AgentContext._load_auto_skills`` re-runs on deserialization. + Persistence needs the freeze-at-create-time snapshot; only the + API-response path opts into the trim. See software-agent-sdk#3301. """ - context = {"cipher": self._cipher} if self._cipher else None + context: dict[str, Any] = {"preserve_full_skills": True} + if self._cipher: + context["cipher"] = self._cipher # Warn if secrets exist but no cipher is configured if not self._cipher and self.secret_registry.secret_sources: logger.warning( diff --git a/tests/sdk/context/test_agent_context_skills_serialization.py b/tests/sdk/context/test_agent_context_skills_serialization.py index 7bb5865aba..556d8782b4 100644 --- a/tests/sdk/context/test_agent_context_skills_serialization.py +++ b/tests/sdk/context/test_agent_context_skills_serialization.py @@ -202,6 +202,109 @@ def test_replacing_auto_skill_via_model_copy_survives_serialization(self): assert len(dumped["skills"]) == 1 assert dumped["skills"][0]["content"] == "OpenHands custom version" + def test_preserve_full_skills_context_flag_keeps_snapshot(self): + """``context={"preserve_full_skills": True}`` opts out of the trim. + + This is the persistence path: ``ConversationState._save_base_state`` + passes this flag so the snapshot of auto-loaded skills is frozen + at conversation-create time. Without it, a paused conversation + resumed after the skill source updates would silently pick up + the *new* content via ``_load_auto_skills``, breaking the + guarantee that the agent runs the same skills it started with. + """ + auto = {f"auto-{i}": _make_skill(f"auto-{i}") for i in range(5)} + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True) + + # Default: trim. + dumped_default = ctx.model_dump() + assert dumped_default["skills"] == [] + + # Persistence opt-out: full snapshot. + dumped_persist = ctx.model_dump(context={"preserve_full_skills": True}) + assert len(dumped_persist["skills"]) == 5 + assert {s["name"] for s in dumped_persist["skills"]} == set(auto.keys()) + + def test_wrap_serializer_propagates_exclude_none(self): + """Caller-supplied ``exclude_none`` reaches nested ``Skill`` fields. + + ``Skill`` has several optional fields that default to ``None`` + (``trigger``, ``source``, ``mcp_tools``, …). A manual + ``s.model_dump(...)`` inside the serializer would have dropped + the caller's ``exclude_none`` setting. ``mode="wrap"`` + + ``handler`` delegation preserves it. + """ + explicit = Skill(name="x", content="hello") # trigger / mcp_tools / etc. → None + ctx = AgentContext(skills=[explicit]) + + # Without exclude_none, None fields appear. + dumped_keep_nones = ctx.model_dump() + skill_dict = dumped_keep_nones["skills"][0] + # ``trigger`` is one of the always-present None defaults; pick + # it as the canary. + assert "trigger" in skill_dict + assert skill_dict["trigger"] is None + + # With exclude_none, the field is omitted from the nested skill. + dumped_drop_nones = ctx.model_dump(exclude_none=True) + skill_dict_pruned = dumped_drop_nones["skills"][0] + assert "trigger" not in skill_dict_pruned + + def test_save_base_state_persists_full_skill_snapshot(self): + """End-to-end: ``ConversationState._save_base_state`` writes the + full skill list to disk, even though the in-memory ``model_dump`` + default trims them. + """ + import json + import uuid + + from openhands.sdk.conversation.state import ConversationState + from openhands.sdk.io import InMemoryFileStore + from openhands.sdk.workspace.local import LocalWorkspace + + auto = {f"auto-{i}": _make_skill(f"auto-{i}") for i in range(3)} + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + agent_context = AgentContext(load_public_skills=True) + # Use a real lightweight agent to satisfy ConversationState. + from openhands.sdk.agent import Agent + from openhands.sdk.llm import LLM + + agent = Agent( + llm=LLM(model="test", usage_id="test"), + agent_context=agent_context, + ) + + import tempfile + + with tempfile.TemporaryDirectory() as tmp: + state = ConversationState.create( + id=uuid.uuid4(), + agent=agent, + workspace=LocalWorkspace(working_dir=tmp), + ) + fs = InMemoryFileStore() + state._save_base_state(fs) + + persisted = json.loads(fs.read("base_state.json")) + + # The persisted snapshot must include the auto-loaded skills so + # a resumed conversation gets the same content even if the + # skill source changed in the meantime. + persisted_skill_names = [ + s["name"] for s in persisted["agent"]["agent_context"]["skills"] + ] + assert set(persisted_skill_names) == set(auto.keys()), ( + "persistence path lost the auto-loaded skill snapshot — " + "without it, a resumed conversation could silently pick up " + "newer skill content from ~/.openhands/skills" + ) + def test_payload_shrinks_to_explicit_only(self): """Concrete byte-count assertion: a 40-skill auto-load with a single explicit skill should serialize approximately the size of the From 713148b1475053eea1fc7df7eed03533dd6e6893 Mon Sep 17 00:00:00 2001 From: Debug Agent Date: Tue, 19 May 2026 17:16:10 +0200 Subject: [PATCH 5/7] =?UTF-8?q?perf(agent=5Fcontext):=20address=20PR=20#33?= =?UTF-8?q?02=20second-pass=20review=20=E2=80=94=20snapshot=20integrity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two more edge cases from the bot review: 1. ``_auto_loaded_skills`` stored the same ``Skill`` reference that was appended to ``self.skills``. ``Skill`` is mutable, so an in-place edit (``ctx.skills[0].content = "custom"``) mutated the snapshot too; the equality check still succeeded and the customised skill silently disappeared from ``model_dump``. Fix: deep-copy the snapshot via ``skill.model_copy(deep=True)``. Pinned by ``test_in_place_skill_mutation_preserves_customisation_on_wire``. 2. The serializer used the snapshot even when the auto-load config changed after validation. ``ctx.model_copy(update={ "load_public_skills": False})`` keeps the runtime auto-loaded skills (validators don't re-run on model_copy) but used to serialize them as ``[]``. On ``model_validate`` the receiver wouldn't auto-reload them (flag now False) — skills lost. Same shape for ``marketplace_path`` changes (receiver re-loads from a *different* catalog). Fix: snapshot ``(load_user_skills, load_public_skills, marketplace_path)`` alongside the skills. If the current config drifted, the serializer no-ops the trim and emits the full list, preserving the round-trip. Pinned by: - ``test_model_copy_toggling_load_flag_off_keeps_skills_on_wire`` - ``test_model_copy_changing_marketplace_path_keeps_skills_on_wire`` 302/302 tests pass on the affected suites. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../openhands/sdk/context/agent_context.py | 51 +++++++++++- ...test_agent_context_skills_serialization.py | 80 +++++++++++++++++++ 2 files changed, 129 insertions(+), 2 deletions(-) diff --git a/openhands-sdk/openhands/sdk/context/agent_context.py b/openhands-sdk/openhands/sdk/context/agent_context.py index 7fa3089f80..2a61bccff0 100644 --- a/openhands-sdk/openhands/sdk/context/agent_context.py +++ b/openhands-sdk/openhands/sdk/context/agent_context.py @@ -132,7 +132,21 @@ class AgentContext(BaseModel): # ``~/.openhands/skills``) the resolved list is ~260 KB per # ``AgentContext`` — every ``GET`` on a stored conversation carried # that. See software-agent-sdk#3301. + # + # Values are deep-copied so an in-place caller mutation + # (``ctx.skills[0].content = "custom"``) doesn't also mutate the + # snapshot — without that, the equality check would still succeed + # and the customised skill would silently disappear from the wire. _auto_loaded_skills: dict[str, Skill] = PrivateAttr(default_factory=dict) + # The auto-load config that produced ``_auto_loaded_skills``. + # Tracked so the serializer can no-op the trim when the config + # changed after validation — e.g. ``model_copy(update={ + # "load_public_skills": False})``. Without this, a copy with the + # flag flipped off would drop the auto-loaded skills on the wire + # AND fail to re-load them on the next ``model_validate`` (the + # flag is now off), losing them entirely. ``None`` when + # ``_load_auto_skills`` has not run yet. + _auto_load_config: tuple[bool, bool, str | None] | None = PrivateAttr(default=None) @field_serializer("skills", when_used="always", mode="wrap") def _serialize_skills(self, value: list[Skill], handler, info) -> Any: @@ -163,6 +177,15 @@ def _serialize_skills(self, value: list[Skill], handler, info) -> Any: skill content via ``_load_auto_skills``. The API-response path skips the flag and gets the byte-size win. + Config-drift safety: if the auto-load config + (``load_user_skills`` / ``load_public_skills`` / + ``marketplace_path``) changed since the snapshot was taken — + typically via ``model_copy(update=...)`` flipping one of those + flags — the trim is skipped. Otherwise the receiver would + either re-load a *different* skill catalog (changed + ``marketplace_path``) or fail to re-load at all (flag turned + off), losing the auto-loaded skills entirely. + ``mode="wrap"`` + ``handler`` delegation preserves caller options like ``exclude_none``, ``exclude_defaults``, and nested ``include`` / ``exclude`` for the surviving skills — @@ -170,6 +193,15 @@ def _serialize_skills(self, value: list[Skill], handler, info) -> Any: """ if info.context and info.context.get("preserve_full_skills"): return handler(value) + # Auto-load config drifted (or never ran) → can't trust the + # snapshot to round-trip. Serialize everything as explicit. + current_config = ( + self.load_user_skills, + self.load_public_skills, + self.marketplace_path, + ) + if self._auto_load_config != current_config: + return handler(value) auto = self._auto_loaded_skills kept = [s for s in value if auto.get(s.name) != s] return handler(kept) @@ -234,16 +266,31 @@ def _load_auto_skills(self): marketplace_path=self.marketplace_path, ) + # Record the config that produced this snapshot so the + # serializer can detect drift (e.g. ``model_copy(update={ + # "load_public_skills": False})``) and degrade to full + # serialization. + self._auto_load_config = ( + self.load_user_skills, + self.load_public_skills, + self.marketplace_path, + ) + existing_by_name = {skill.name: skill for skill in self.skills} for name, skill in auto_skills.items(): existing = existing_by_name.get(name) if existing is None: self.skills.append(skill) - self._auto_loaded_skills[name] = skill + # Deep-copy so an in-place caller mutation + # (``ctx.skills[0].content = "custom"``) doesn't also + # mutate the snapshot — without it the equality check + # would still succeed and the customised skill would + # silently disappear from the wire. + self._auto_loaded_skills[name] = skill.model_copy(deep=True) elif existing == skill: # Migration path for conversations stored before the # serializer change — see the docstring. - self._auto_loaded_skills[name] = skill + self._auto_loaded_skills[name] = skill.model_copy(deep=True) else: logger.debug( f"Skipping auto-loaded skill '{name}' (already in explicit skills)" diff --git a/tests/sdk/context/test_agent_context_skills_serialization.py b/tests/sdk/context/test_agent_context_skills_serialization.py index 556d8782b4..4a025cc784 100644 --- a/tests/sdk/context/test_agent_context_skills_serialization.py +++ b/tests/sdk/context/test_agent_context_skills_serialization.py @@ -305,6 +305,86 @@ def test_save_base_state_persists_full_skill_snapshot(self): "newer skill content from ~/.openhands/skills" ) + def test_in_place_skill_mutation_preserves_customisation_on_wire(self): + """Regression for the mutable-snapshot bug. + + ``_auto_loaded_skills`` used to store the same ``Skill`` + reference that landed in ``self.skills``. An in-place edit + like ``ctx.skills[0].content = "custom"`` mutated both copies, + the equality check still succeeded, and the customised skill + silently vanished from ``model_dump`` output. Deep-copying the + snapshot at auto-load time keeps the two references + independent so the equality check correctly reports the + in-memory skill as "modified" and keeps it on the wire. + """ + auto = {"editable": _make_skill("editable", "original content")} + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True) + + # Caller mutates the skill in-place after the auto-load. + ctx.skills[0].content = "user customised content" + + dumped = ctx.model_dump() + assert len(dumped["skills"]) == 1 + assert dumped["skills"][0]["content"] == "user customised content" + + def test_model_copy_toggling_load_flag_off_keeps_skills_on_wire(self): + """Regression for the config-drift bug. + + ``ctx.model_copy(update={"load_public_skills": False})`` keeps + the runtime auto-loaded skills (the validator doesn't re-run + on model_copy) but used to serialize them as ``[]`` because + the snapshot still matched the in-memory names. On + ``model_validate(...)`` the receiver wouldn't auto-reload them + (flag is False), losing them entirely. + + With the config-drift check, the serializer detects that the + current config doesn't match the snapshot config and emits + the full skill list, preserving the round-trip. + """ + auto = {f"auto-{i}": _make_skill(f"auto-{i}") for i in range(3)} + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True) + + # Flip the flag off via model_copy — does NOT re-run the + # validator, so ctx.skills still has the 3 auto-loaded skills. + flipped = ctx.model_copy(update={"load_public_skills": False}) + assert len(flipped.skills) == 3 + + # Serializer must NOT trim — the config drifted. Round-trip + # would otherwise lose them (flag is False on the receiver). + dumped = flipped.model_dump() + assert {s["name"] for s in dumped["skills"]} == set(auto.keys()) + + # And the round-trip preserves them in-memory. + roundtripped = AgentContext.model_validate(dumped) + assert {s.name for s in roundtripped.skills} == set(auto.keys()) + + def test_model_copy_changing_marketplace_path_keeps_skills_on_wire(self): + """Same config-drift concern as above but for ``marketplace_path``. + + If the snapshot was taken with ``marketplace_path="A"`` and a + copy bumps it to ``"B"``, the receiver would auto-load from + ``"B"`` and get a *different* catalog. Preserve the original + list on the wire so the receiver can't silently swap. + """ + auto = {"marketplace-skill": _make_skill("marketplace-skill")} + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True, marketplace_path="path-A") + + bumped = ctx.model_copy(update={"marketplace_path": "path-B"}) + dumped = bumped.model_dump() + assert {s["name"] for s in dumped["skills"]} == {"marketplace-skill"} + def test_payload_shrinks_to_explicit_only(self): """Concrete byte-count assertion: a 40-skill auto-load with a single explicit skill should serialize approximately the size of the From d4bac354b8a84b0c184c4665ad97e2b787908387 Mon Sep 17 00:00:00 2001 From: Debug Agent Date: Tue, 19 May 2026 19:03:19 +0200 Subject: [PATCH 6/7] perf(agent_context): freeze snapshot on load + scope migration to snapshot loads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two more edge cases from the third bot review: 1. ``preserve_full_skills`` only prevented the trim on *save*. The load path still ran ``_load_auto_skills`` unconditionally and *appended* any newly-published skills the loader returned. A snapshot frozen as {a} would silently grow to {a, b} after `b` landed in the marketplace — and the next save would persist the drifted set. Fix: ``_load_auto_skills`` now accepts ``ValidationInfo``. When ``info.context["loading_from_snapshot"]`` is true (set by ``ConversationState`` factory on the resume path), the validator skips the append-new branch entirely — the persisted ``skills`` list is treated as authoritative. 2. The migration branch (mark existing skills as auto-loaded when they equal the loader's output) also fired for fresh ``AgentContext(load_public_skills=True, skills=[explicit])`` construction. A caller-pinned explicit skill that happened to equal the marketplace version would be snapshotted as auto-loaded, dropped from the wire, and silently swapped to a *new* marketplace version on the next round-trip. Fix: migration only fires under ``loading_from_snapshot``. Fresh construction keeps the "equal existing" branch as a no-op — the caller-supplied skill stays explicit on the wire. The ``ConversationState`` resume path at ``state.py:354`` now passes ``context={"loading_from_snapshot": True}`` alongside the cipher. API-response paths and direct caller construction don't pass the flag, so they keep fresh-construction semantics — which is what we want for the shrink-the-wire win. New tests (5 → 8): - ``test_fresh_construction_does_not_conflate_explicit_with_auto`` — the comment-6 regression, fresh construction with a name collision. - ``test_loading_from_snapshot_does_not_pick_up_new_auto_loaded_skills`` — the comment-5 regression, snapshot {a} + loader returning {a, b} stays {a} in memory and on the wire. - ``test_fresh_construction_still_appends_new_auto_skills`` — sanity that the default branch keeps growing for first-launch users. - Existing migration tests updated to invoke the load path via ``model_validate(context={"loading_from_snapshot": True})``. 305/305 tests pass on the affected suites. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../openhands/sdk/context/agent_context.py | 73 ++++++++---- .../openhands/sdk/conversation/state.py | 12 +- ...test_agent_context_skills_serialization.py | 104 ++++++++++++++++-- 3 files changed, 157 insertions(+), 32 deletions(-) diff --git a/openhands-sdk/openhands/sdk/context/agent_context.py b/openhands-sdk/openhands/sdk/context/agent_context.py index 2a61bccff0..6cce3d50ce 100644 --- a/openhands-sdk/openhands/sdk/context/agent_context.py +++ b/openhands-sdk/openhands/sdk/context/agent_context.py @@ -10,6 +10,7 @@ Field, PrivateAttr, SecretStr, + ValidationInfo, field_serializer, field_validator, model_validator, @@ -235,25 +236,46 @@ def _validate_skills(cls, v: list[Skill], _info): return v @model_validator(mode="after") - def _load_auto_skills(self): + def _load_auto_skills(self, info: ValidationInfo): """Load user and/or public skills if enabled. Names of skills added here are tracked in - ``_auto_loaded_skill_names`` so the serializer can drop them - from the wire payload (this validator re-runs on every model - load, so the same skills repopulate without needing to be - persisted). - - Migration: stored conversations created before the serializer - change carry the resolved auto-loaded skill list inlined on - ``skills``. When such a conversation is loaded back, the names - match ``existing_names`` and the new-append branch is skipped - — but we still mark them as auto-loaded if the persisted skill - equals what the loader would produce now. Without this, every - old conversation would keep the bloated payload until rewritten. - A persisted skill that no longer matches the loader's current - output (user edited the file, marketplace updated, etc.) stays - treated as explicit so the on-disk content wins. + ``_auto_loaded_skills`` so the serializer can drop them from + the wire payload — this validator re-runs on every model load, + so the same skills repopulate without needing to be persisted. + + Two behaviour modes, switched by + ``context={"loading_from_snapshot": True}``: + + Fresh construction (default, no flag): + - Skills the loader returned that aren't already in + ``self.skills`` get appended and tracked as auto-loaded. + - Skills the loader returned that ARE already in + ``self.skills`` (caller-supplied explicit) are left + alone — even if they happen to equal the loader output. + The caller pinned that content deliberately; treating it + as auto-loaded would let a later marketplace update + silently swap it on the next round-trip. + + Loading from a persisted snapshot (``loading_from_snapshot``): + - The persisted ``skills`` list IS authoritative. Don't + append new auto-loaded skills — that would let a + marketplace update or a fresh skill file silently + pollute the snapshot on the next save. + - Skills already in ``self.skills`` that equal the + loader's current output get marked as auto-loaded so + the next serialize trims them (migration path: old + persisted conversations shrink on first reload through + the new SDK). + - Skills in ``self.skills`` that DON'T match the loader's + current output (user edited the file, marketplace + updated since the snapshot was taken) stay explicit so + the on-disk content wins. + + The persistence load path in + ``ConversationState`` factory sets the context flag. API- + response paths and direct caller construction don't, so they + get fresh-construction semantics. """ if not self.load_user_skills and not self.load_public_skills: return self @@ -276,10 +298,20 @@ def _load_auto_skills(self): self.marketplace_path, ) + loading_from_snapshot = bool( + info.context and info.context.get("loading_from_snapshot") + ) + existing_by_name = {skill.name: skill for skill in self.skills} for name, skill in auto_skills.items(): existing = existing_by_name.get(name) if existing is None: + if loading_from_snapshot: + # Snapshot is authoritative — don't add a skill + # the persisted state never had. This is what + # freezes the auto-loaded set against marketplace + # / user-skill churn between save and load. + continue self.skills.append(skill) # Deep-copy so an in-place caller mutation # (``ctx.skills[0].content = "custom"``) doesn't also @@ -287,9 +319,12 @@ def _load_auto_skills(self): # would still succeed and the customised skill would # silently disappear from the wire. self._auto_loaded_skills[name] = skill.model_copy(deep=True) - elif existing == skill: - # Migration path for conversations stored before the - # serializer change — see the docstring. + elif existing == skill and loading_from_snapshot: + # Migration path: existing matches loader → mark as + # auto-loaded so the next serialize can trim it. + # ONLY fires under loading_from_snapshot — for fresh + # construction this would conflate caller-pinned + # explicit skills with auto-loaded ones. self._auto_loaded_skills[name] = skill.model_copy(deep=True) else: logger.debug( diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index 6c078935e8..1435794204 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -349,8 +349,16 @@ def create( # ---- Resume path ---- if base_text: - # Use cipher context for decrypting secrets if provided - context = {"cipher": cipher} if cipher else None + # ``loading_from_snapshot`` tells ``AgentContext._load_auto_skills`` + # that the persisted ``skills`` list is authoritative and must + # NOT be augmented with newly published auto-loaded skills the + # loader picks up post-save. Without it, a snapshot frozen as + # {a} would silently grow to {a, b} after `b` lands in the + # public marketplace, breaking the freeze-at-create guarantee + # ``_save_base_state`` relied on. See software-agent-sdk#3301. + context: dict[str, Any] = {"loading_from_snapshot": True} + if cipher: + context["cipher"] = cipher state = cls.model_validate(json.loads(base_text), context=context) # Restore the conversation with the same id diff --git a/tests/sdk/context/test_agent_context_skills_serialization.py b/tests/sdk/context/test_agent_context_skills_serialization.py index 4a025cc784..89b45c99ec 100644 --- a/tests/sdk/context/test_agent_context_skills_serialization.py +++ b/tests/sdk/context/test_agent_context_skills_serialization.py @@ -116,16 +116,14 @@ def test_disabled_auto_load_leaves_explicit_skills_on_the_wire(self): assert {s["name"] for s in dumped["skills"]} == {"foo", "bar"} def test_migration_stored_skills_matching_loader_are_marked_auto_loaded(self): - """Migration path: a conversation stored before this PR carries the - resolved auto-loaded skills inlined on ``skills``. On reload, the - validator must recognise those as auto-loaded (the persisted - skill equals what the loader produces) and drop them from the - next serialization — otherwise the bloat persists until the - conversation is recreated. + """Migration path (loading_from_snapshot only): a conversation + stored before this PR carries the resolved auto-loaded skills + inlined on ``skills``. When ``ConversationState`` loads it with + ``loading_from_snapshot=True``, the validator recognises matching + skills as auto-loaded and drops them from the next serialization — + otherwise the bloat persists until the conversation is recreated. """ stored = [_make_skill("auto-x"), _make_skill("auto-y")] - # Loader returns the same skill objects (matches what was - # persisted under the old SDK). loader_output = { "auto-x": _make_skill("auto-x"), "auto-y": _make_skill("auto-y"), @@ -134,7 +132,10 @@ def test_migration_stored_skills_matching_loader_are_marked_auto_loaded(self): "openhands.sdk.context.agent_context.load_available_skills", return_value=loader_output, ): - ctx = AgentContext(load_public_skills=True, skills=stored) + ctx = AgentContext.model_validate( + {"load_public_skills": True, "skills": stored}, + context={"loading_from_snapshot": True}, + ) # No duplicates: skills stayed at 2 (migration recognised them). assert {s.name for s in ctx.skills} == {"auto-x", "auto-y"} @@ -142,12 +143,38 @@ def test_migration_stored_skills_matching_loader_are_marked_auto_loaded(self): dumped = ctx.model_dump() assert dumped["skills"] == [] + def test_fresh_construction_does_not_conflate_explicit_with_auto(self): + """Comment-6 regression: fresh ``AgentContext(skills=[explicit])`` + construction where the explicit skill happens to equal what the + loader returns must NOT mark it as auto-loaded. Without this + guard, a later marketplace/user-skill edit would silently swap + the caller's pinned content on the next round-trip. + """ + explicit = _make_skill("shared-name", "caller pinned this") + loader_output = { + "shared-name": _make_skill("shared-name", "caller pinned this") + } + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=loader_output, + ): + ctx = AgentContext(load_public_skills=True, skills=[explicit]) + + # In-memory: only the explicit (validator's "already in + # existing" branch hit, no append). + assert len(ctx.skills) == 1 + # On the wire: stays as explicit — not snapshotted as auto. + dumped = ctx.model_dump() + assert len(dumped["skills"]) == 1 + assert dumped["skills"][0]["content"] == "caller pinned this" + def test_migration_stored_skills_diverged_from_loader_stay_explicit(self): """If the persisted skill content no longer matches what the loader would produce (user edited the file, marketplace updated), the on-disk version wins — treat it as explicit and keep it on the wire. This is the safe default: we don't drop content - someone may have intentionally customised. + someone may have intentionally customised. Applies under + ``loading_from_snapshot`` (same as the matching-migration path). """ stored = [_make_skill("auto-x", "old persisted content")] loader_output = {"auto-x": _make_skill("auto-x", "new loader content")} @@ -155,7 +182,10 @@ def test_migration_stored_skills_diverged_from_loader_stay_explicit(self): "openhands.sdk.context.agent_context.load_available_skills", return_value=loader_output, ): - ctx = AgentContext(load_public_skills=True, skills=stored) + ctx = AgentContext.model_validate( + {"load_public_skills": True, "skills": stored}, + context={"loading_from_snapshot": True}, + ) # The stored ("old") version wins in-memory. assert len(ctx.skills) == 1 @@ -165,6 +195,58 @@ def test_migration_stored_skills_diverged_from_loader_stay_explicit(self): assert len(dumped["skills"]) == 1 assert dumped["skills"][0]["content"] == "old persisted content" + def test_loading_from_snapshot_does_not_pick_up_new_auto_loaded_skills(self): + """Comment-5 regression: a persisted snapshot frozen with only + ``{a}`` must not silently grow to ``{a, b}`` if the loader now + also returns a new skill ``b`` (marketplace added it, user + added a file). The ``loading_from_snapshot`` context flag + tells the validator the persisted ``skills`` list is + authoritative. + """ + stored = [_make_skill("a")] + # Loader sees a new ``b`` that wasn't in the persisted snapshot. + loader_output = { + "a": _make_skill("a"), + "b": _make_skill("b", "newly published content"), + } + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=loader_output, + ): + ctx = AgentContext.model_validate( + {"load_public_skills": True, "skills": stored}, + context={"loading_from_snapshot": True}, + ) + + # In-memory: snapshot is authoritative — ``b`` is NOT appended. + assert {s.name for s in ctx.skills} == {"a"} + # Next ``model_dump`` (default, no preserve flag) still trims + # ``a`` since it was matched as auto-loaded. Serialized list + # is empty — exactly what the original snapshot would have + # serialized. + dumped = ctx.model_dump() + assert dumped["skills"] == [] + # And the preserve-full path also gives back just ``a`` — no + # marketplace pollution. + dumped_full = ctx.model_dump(context={"preserve_full_skills": True}) + assert {s["name"] for s in dumped_full["skills"]} == {"a"} + + def test_fresh_construction_still_appends_new_auto_skills(self): + """Sanity: without the ``loading_from_snapshot`` flag, fresh + construction keeps appending new auto-loaded skills as today. + This is what makes the registry hot for first-launch users. + """ + loader_output = { + "auto-1": _make_skill("auto-1"), + "auto-2": _make_skill("auto-2"), + } + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=loader_output, + ): + ctx = AgentContext(load_public_skills=True) + assert {s.name for s in ctx.skills} == {"auto-1", "auto-2"} + def test_replacing_auto_skill_via_model_copy_survives_serialization(self): """Regression guard for OpenHands' ``_create_agent_with_skills`` flow. From d81d7c05e257908256176fc6884a313945ac73a7 Mon Sep 17 00:00:00 2001 From: Debug Agent Date: Wed, 20 May 2026 09:50:55 +0200 Subject: [PATCH 7/7] perf(agent_context): honour round_trip + preserve snapshot on full resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two edge cases from the fourth bot review: 1. ``model_dump(round_trip=True)`` is Pydantic's canonical signal for "this dump must rehydrate without semantic loss". The trim ran under it too, so a caller using it for an ``AgentContext`` snapshot would silently reload whatever the loader returns now instead of the in-memory catalog they dumped. Fix: ``info.round_trip`` opts out of the trim, same as the explicit ``preserve_full_skills`` context flag. Pinned by ``test_round_trip_dump_preserves_full_skill_list``. 2. The persistence-resume guard (``loading_from_snapshot``) correctly populated ``state.agent.agent_context.skills`` from the snapshot — but the next line, ``state.agent = agent``, unconditionally overwrote the loaded agent with the runtime one. The runtime agent's ``agent_context`` carried a freshly- resolved (and potentially drifted) skill list from its own ``_load_auto_skills`` run. Autosave then rewrote ``base_state.json`` with the drifted set. The reviewer reproduced this end-to-end: initial persisted ``['a']`` → resumed runtime ``['a', 'b']`` → autosave persisted ``['a', 'b']``. Fix: before ``state.agent = agent``, ``model_copy`` the runtime agent's ``agent_context`` with the loaded snapshot's skills. The snapshot stays authoritative for skills; runtime-only fields (LLM client, tool wiring, etc.) come from the runtime agent as today. Pinned by ``test_conversationstate_resume_preserves_snapshot_skills`` — full create → save → resume cycle via ``ConversationState.create`` and ``LocalFileStore``, not just ``_save_base_state`` in isolation. 793/793 tests pass on ``tests/sdk/context/`` + ``tests/sdk/conversation/`` + ``tests/cross/test_agent_secrets_integration.py``. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../openhands/sdk/context/agent_context.py | 11 +- .../openhands/sdk/conversation/state.py | 22 ++++ ...test_agent_context_skills_serialization.py | 118 ++++++++++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) diff --git a/openhands-sdk/openhands/sdk/context/agent_context.py b/openhands-sdk/openhands/sdk/context/agent_context.py index 6cce3d50ce..c7c1c8a2b0 100644 --- a/openhands-sdk/openhands/sdk/context/agent_context.py +++ b/openhands-sdk/openhands/sdk/context/agent_context.py @@ -192,7 +192,16 @@ def _serialize_skills(self, value: list[Skill], handler, info) -> Any: nested ``include`` / ``exclude`` for the surviving skills — manual ``s.model_dump(...)`` would have ignored them. """ - if info.context and info.context.get("preserve_full_skills"): + # ``round_trip=True`` is Pydantic's canonical signal for "this + # dump must rehydrate without semantic loss" (e.g. for caches, + # snapshots, or anything the caller will ``model_validate`` + # later). Honour it the same way we honour the explicit + # opt-out flag — trimming under round-trip would let the + # reload silently pick up a *different* skill catalog from + # the loader, which is exactly the loss the flag warns against. + if info.round_trip or ( + info.context and info.context.get("preserve_full_skills") + ): return handler(value) # Auto-load config drifted (or never ran) → can't trust the # snapshot to round-trip. Serialize everything as explicit. diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index 1435794204..c9faf54603 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -376,6 +376,28 @@ def create( # Verify compatibility (agent class + tools) agent.verify(state.agent, events=state._events) + # Preserve the loaded snapshot's skills onto the runtime + # agent's ``agent_context``. Without this, the runtime + # agent (constructed by the caller before resume) carries + # a freshly-resolved skill list from its own + # ``_load_auto_skills`` run — which may have drifted + # since the snapshot was taken (marketplace adds, user + # edits). Assigning ``state.agent = agent`` then loses + # the snapshot, and autosave rewrites ``base_state.json`` + # with the drifted set. The snapshot is authoritative on + # resume; runtime-only fields (LLM client, tool wiring, + # etc.) come from the runtime agent. See + # software-agent-sdk#3301. + loaded_agent_context = state.agent.agent_context if state.agent else None + if loaded_agent_context is not None and agent.agent_context is not None: + agent = agent.model_copy( + update={ + "agent_context": agent.agent_context.model_copy( + update={"skills": loaded_agent_context.skills} + ) + } + ) + # Commit runtime-provided values (may autosave) state._autosave_enabled = True state.agent = agent diff --git a/tests/sdk/context/test_agent_context_skills_serialization.py b/tests/sdk/context/test_agent_context_skills_serialization.py index 89b45c99ec..75a7816326 100644 --- a/tests/sdk/context/test_agent_context_skills_serialization.py +++ b/tests/sdk/context/test_agent_context_skills_serialization.py @@ -491,3 +491,121 @@ def test_payload_shrinks_to_explicit_only(self): f"serialized skills should be ~1 explicit skill (~150 B), " f"got {skills_bytes} B — auto skills leaked into output" ) + + def test_round_trip_dump_preserves_full_skill_list(self): + """Comment-7 regression: ``model_dump(round_trip=True)`` is + Pydantic's canonical "must reload without semantic loss" + signal. Honour it the same as ``preserve_full_skills`` — + otherwise a caller using it for an ``AgentContext`` snapshot + would reload whatever the current external skill source + returns instead of the in-memory catalog they dumped. + """ + auto = {f"auto-{i}": _make_skill(f"auto-{i}") for i in range(3)} + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value=auto, + ): + ctx = AgentContext(load_public_skills=True) + + # Default dump still trims (the API-wire win). + assert ctx.model_dump()["skills"] == [] + # ``round_trip=True`` keeps the full snapshot. + dumped = ctx.model_dump(round_trip=True) + assert {s["name"] for s in dumped["skills"]} == set(auto.keys()) + + def test_conversationstate_resume_preserves_snapshot_skills(self): + """Comment-8 regression: full create→save→resume cycle. + + The persistence load path used to call ``state.agent = agent`` + unconditionally, overwriting the loaded snapshot's + ``agent_context.skills`` with the runtime agent's freshly- + loaded list. If the loader had drifted since save (marketplace + added a skill), the next autosave rewrote ``base_state.json`` + with the drifted set — breaking the freeze-at-create guarantee. + ``ConversationState.create`` resume path now preserves the + snapshot's skills onto the runtime agent's ``agent_context`` + before assignment. + """ + import tempfile + import uuid + + from openhands.sdk.agent import Agent + from openhands.sdk.conversation.state import ConversationState + from openhands.sdk.io import LocalFileStore + from openhands.sdk.llm import LLM + from openhands.sdk.workspace.local import LocalWorkspace + + # ``a`` was around at save time. Snapshot freezes on it. + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value={"a": _make_skill("a", "original a content")}, + ): + agent_at_save = Agent( + llm=LLM(model="test", usage_id="test"), + agent_context=AgentContext(load_public_skills=True), + ) + + with tempfile.TemporaryDirectory() as tmp: + workspace = LocalWorkspace(working_dir=tmp) + conv_id = uuid.uuid4() + + # Save: create state, persist. + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value={"a": _make_skill("a", "original a content")}, + ): + state_saved = ConversationState.create( + id=conv_id, + agent=agent_at_save, + workspace=workspace, + persistence_dir=tmp, + ) + state_saved._save_base_state(LocalFileStore(tmp)) + + # Now the marketplace adds a new skill `b` and changes `a`. + with patch( + "openhands.sdk.context.agent_context.load_available_skills", + return_value={ + "a": _make_skill("a", "updated a content"), + "b": _make_skill("b", "newly published"), + }, + ): + # Runtime agent on resume — its agent_context auto-load + # picks up the drifted skill set. + runtime_agent = Agent( + llm=LLM(model="test", usage_id="test"), + agent_context=AgentContext(load_public_skills=True), + ) + assert {s.name for s in runtime_agent.agent_context.skills} == { + "a", + "b", + } + # Resume the conversation. The fix must preserve the + # snapshot's skills (just `a`, with original content) + # rather than the runtime agent's drifted set. + state_resumed = ConversationState.create( + id=conv_id, + agent=runtime_agent, + workspace=workspace, + persistence_dir=tmp, + ) + + assert state_resumed.agent.agent_context is not None + resumed_skills = {s.name for s in state_resumed.agent.agent_context.skills} + assert resumed_skills == {"a"}, ( + f"resume picked up drifted skills {resumed_skills} from the runtime " + f"agent's fresh auto-load; should have preserved the snapshot {{a}}" + ) + # And the persisted-to-disk content also matches the + # snapshot — autosave didn't drift it. + with open(f"{tmp}/base_state.json") as f: + import json as _json + + persisted = _json.load(f) + persisted_names = { + s["name"] for s in persisted["agent"]["agent_context"]["skills"] + } + assert persisted_names == {"a"}, ( + f"autosave rewrote base_state.json with drifted skills " + f"{persisted_names} instead of the original snapshot" + )