From 0fedc811a6990b11506c4e6c69b3e41e49523ace Mon Sep 17 00:00:00 2001 From: Jason Tsay Date: Mon, 16 Mar 2026 16:31:42 -0400 Subject: [PATCH 1/7] fix: update silent review examples (#110) * fix: update silent review examples remove old graphic code Signed-off-by: Jason Tsay * fix: update onnxruntime version for py3.10 Signed-off-by: Jason Tsay * move onnxruntime to main dep list Signed-off-by: Jason Tsay * explicit version for 3.10 Signed-off-by: Jason Tsay * fix formatting Signed-off-by: Jason Tsay * fix silent error unit test Signed-off-by: Jason Tsay * skip flaky test Signed-off-by: Jason Tsay --------- Signed-off-by: Jason Tsay Signed-off-by: Osher-Elhadad --- .../providers/auto_from_env/auto_from_env.py | 6 +- altk/post_tool/silent_review/silent_review.py | 6 ++ examples/langgraph_agent_example.py | 81 ++++++++++++--- examples/langgraph_agent_example_streamlit.py | 86 +++++++++++++--- examples/langgraph_agent_sparc_example.py | 7 +- ...langgraph_agent_sparc_example_streamlit.py | 6 +- pyproject.toml | 1 + .../post_tool/silent_review_json_data_test.py | 6 +- .../follow_up_detection/test_follow_up.py | 5 +- .../toolguard/test_toolguard_specs.py | 1 + uv.lock | 99 +++++++++++++++++-- 11 files changed, 255 insertions(+), 49 deletions(-) diff --git a/altk/core/llm/providers/auto_from_env/auto_from_env.py b/altk/core/llm/providers/auto_from_env/auto_from_env.py index 8d622c57..a334639d 100644 --- a/altk/core/llm/providers/auto_from_env/auto_from_env.py +++ b/altk/core/llm/providers/auto_from_env/auto_from_env.py @@ -12,7 +12,7 @@ class AutoFromEnvLLMClient(LLMClient): Default adapter for ALTK, will determine which provider to use based on environment variables. Expects the following environment variables to be set: - - ALTK_MODEL_NAME: optional, model name, assumes litellm if ALTK_PROVIDER_NAME not set + - ALTK_MODEL_NAME: optional, model name, assumes litellm if ALTK_LLM_PROVIDER not set - ALTK_LLM_PROVIDER: optional, the corresponding name in the LLMClient registry If both are not set, client is set to None """ @@ -32,15 +32,15 @@ def __init__(self) -> None: provider_type = get_llm(provider_name) init_sig = inspect.signature(provider_type) if "model_name" in init_sig.parameters: - # make sure provider needs provider in init + # check if model_name is required for provider if not self.model_name: raise EnvironmentError( "Missing model name which is required for this provider; please set the 'ALTK_MODEL_NAME' environment variable or instantiate an appropriate LLMClient." 
) self._chosen_provider = provider_type(model_name=self.model_name) + self.model_name_in_generate = True else: self._chosen_provider = provider_type() - self.model_name_in_generate = True @classmethod def provider_class(cls) -> Type[Any]: diff --git a/altk/post_tool/silent_review/silent_review.py b/altk/post_tool/silent_review/silent_review.py index f84cfa68..3f62b27f 100644 --- a/altk/post_tool/silent_review/silent_review.py +++ b/altk/post_tool/silent_review/silent_review.py @@ -17,6 +17,12 @@ class BaseSilentReviewComponent(PostToolReflectionComponent): def _get_review_args(self, data: SilentReviewRunInput) -> tuple: assert isinstance(data.messages, list) and len(data.messages) > 0 + if "data" in data.messages[0]: + return ( + data.messages[0]["data"]["content"], + data.tool_spec, + data.tool_response, + ) return (data.messages[0]["content"], data.tool_spec, data.tool_response) def _run(self, data: SilentReviewRunInput) -> SilentReviewRunOutput: # type: ignore diff --git a/examples/langgraph_agent_example.py b/examples/langgraph_agent_example.py index 504ddd15..50a71ac3 100644 --- a/examples/langgraph_agent_example.py +++ b/examples/langgraph_agent_example.py @@ -8,10 +8,18 @@ """ import random +import warnings +import json -from langgraph.prebuilt import create_react_agent +from langchain_anthropic import ChatAnthropic +from langgraph.graph import StateGraph, START, END from langchain_core.tools import tool from typing_extensions import Annotated +from langgraph.prebuilt import ToolNode +from langchain_core.messages import BaseMessage, HumanMessage +from langchain_core.messages.base import messages_to_dict +import operator +from typing import TypedDict, List from langgraph.prebuilt import InjectedState from altk.post_tool.silent_review.silent_review import ( @@ -23,6 +31,7 @@ from dotenv import load_dotenv load_dotenv() +warnings.filterwarnings("ignore", category=UserWarning) retries = 0 @@ -36,32 +45,78 @@ def get_weather(city: str, state: Annotated[dict, InjectedState]) -> dict[str, s else: result = {"weather": f"It's sunny and {random.randint(50, 90)}F in {city}!"} + return result + + +class AgentState(TypedDict): + messages: Annotated[List[BaseMessage], operator.add] + next: str + + +def post_tool_hook(state: AgentState) -> AgentState: + # Creates a post-tool node that reviews for silent errors + global retries + tool_response = json.loads(state["messages"][-1].content) # Use SilentReview component to check if it's a silent error review_input = SilentReviewRunInput( - messages=state["messages"], tool_response=result + messages=messages_to_dict(state["messages"]), tool_response=tool_response ) reviewer = SilentReviewForJSONDataComponent() review_result = reviewer.process(data=review_input, phase=AgentPhase.RUNTIME) - if review_result.outcome == Outcome.NOT_ACCOMPLISHED: # Agent should retry tool call if silent error was detected print("(ALTK: Silent error detected, retry the get_weather tool!)") retries += 1 - return {"weather": "!!! Silent error detected, RETRY the get_weather tool !!!"} + return { + "next": "agent", + "messages": [ + HumanMessage( + content="!!! Silent error detected, RETRY the get_weather tool !!!" 
+ ) + ], + } else: - return result + return {"next": "final_message"} -agent = create_react_agent( - model="anthropic:claude-sonnet-4-20250514", - tools=[get_weather], - prompt="You are a helpful assistant", -) +def final_message_node(state): + return state -# Runs the agent -result = agent.invoke( - {"messages": [{"role": "user", "content": "what is the weather in sf"}]} + +tools = [get_weather] +llm = ChatAnthropic(model="claude-sonnet-4-20250514") +llm_with_tools = llm.bind_tools(tools, tool_choice="get_weather") + + +def call_model(state: AgentState): + messages = state["messages"] + response = llm_with_tools.invoke(messages) + return {"messages": [response]} + + +# creates agent with pre-tool node that conditionally goes to tool node +builder = StateGraph(AgentState) +builder.add_node("agent", call_model) +builder.add_node("call_tool", ToolNode(tools)) +builder.add_node("post_tool_hook", post_tool_hook) +builder.add_node("final_message", final_message_node) +builder.add_edge(START, "agent") +builder.add_conditional_edges( + "agent", + lambda state: "call_tool" if state["messages"][-1].tool_calls else "final_message", + {"call_tool": "call_tool", "final_message": "final_message"}, +) +builder.add_edge("call_tool", "post_tool_hook") +builder.add_conditional_edges( + "post_tool_hook", + lambda state: state["next"], + {"agent": "agent", "final_message": "final_message"}, ) +builder.add_edge("final_message", END) +agent = builder.compile() + +# Runs the agent, try running this multiple times to see the ALTK detect the silent error +result = agent.invoke({"messages": [HumanMessage(content="what is the weather in sf")]}) print(result["messages"][-1].content) if retries > 0: print(f"(get_weather was retried: {retries} times)") diff --git a/examples/langgraph_agent_example_streamlit.py b/examples/langgraph_agent_example_streamlit.py index 88722d62..b8e3a87e 100644 --- a/examples/langgraph_agent_example_streamlit.py +++ b/examples/langgraph_agent_example_streamlit.py @@ -9,10 +9,18 @@ """ import random +import warnings +import json -from langgraph.prebuilt import create_react_agent +from langchain_anthropic import ChatAnthropic +from langgraph.graph import StateGraph, START, END from langchain_core.tools import tool from typing_extensions import Annotated +from langgraph.prebuilt import ToolNode +from langchain_core.messages import BaseMessage, HumanMessage +from langchain_core.messages.base import messages_to_dict +import operator +from typing import TypedDict, List from langgraph.prebuilt import InjectedState import streamlit as st @@ -24,6 +32,7 @@ from dotenv import load_dotenv +warnings.filterwarnings("ignore", category=UserWarning) load_dotenv() tool_silent_error_raised = False silent_error_raised = False @@ -42,36 +51,83 @@ def get_weather(city: str, state: Annotated[dict, InjectedState]) -> dict[str, s else: result = {"weather": f"It's sunny and {random.randint(50, 90)}F in {city}!"} + return result + + +class AgentState(TypedDict): + messages: Annotated[List[BaseMessage], operator.add] + next: str + + +def post_tool_hook(state: AgentState) -> AgentState: + # Creates a post-tool node that reviews for silent errors if use_silent_review: + global retries + tool_response = json.loads(state["messages"][-1].content) # Use SilentReview component to check if it's a silent error review_input = SilentReviewRunInput( - messages=state["messages"], tool_response=result + messages=messages_to_dict(state["messages"]), tool_response=tool_response ) reviewer = SilentReviewForJSONDataComponent() 
review_result = reviewer.process(data=review_input, phase=AgentPhase.RUNTIME) - if review_result.outcome == Outcome.NOT_ACCOMPLISHED: # Agent should retry tool call if silent error was detected - print("Silent error detected, retry the get_weather tool!") + print("(ALTK: Silent error detected, retry the get_weather tool!)") + retries += 1 global silent_error_raised silent_error_raised = True - retries += 1 return { - "weather": "!!! Silent error detected, RETRY the get_weather tool !!!" + "next": "agent", + "messages": [ + HumanMessage( + content="!!! Silent error detected, RETRY the get_weather tool !!!" + ) + ], } else: - return result + return {"next": "final_message"} else: - return result + return {"next": "final_message"} + +def final_message_node(state): + return state -agent = create_react_agent( - model="anthropic:claude-sonnet-4-20250514", - tools=[get_weather], - prompt="You are a helpful weather assistant.", + +tools = [get_weather] +llm = ChatAnthropic(model="claude-sonnet-4-20250514") +llm_with_tools = llm.bind_tools(tools, tool_choice="get_weather") + + +def call_model(state: AgentState): + messages = state["messages"] + response = llm_with_tools.invoke(messages) + return {"messages": [response]} + + +# creates agent with pre-tool node that conditionally goes to tool node +builder = StateGraph(AgentState) +builder.add_node("agent", call_model) +builder.add_node("call_tool", ToolNode(tools)) +builder.add_node("post_tool_hook", post_tool_hook) +builder.add_node("final_message", final_message_node) +builder.add_edge(START, "agent") +builder.add_conditional_edges( + "agent", + lambda state: "call_tool" if state["messages"][-1].tool_calls else "final_message", + {"call_tool": "call_tool", "final_message": "final_message"}, +) +builder.add_edge("call_tool", "post_tool_hook") +builder.add_conditional_edges( + "post_tool_hook", + lambda state: state["next"], + {"agent": "agent", "final_message": "final_message"}, ) +builder.add_edge("final_message", END) +agent = builder.compile() + -st.title("ALTK Chatbot example with Silent Review") +st.title("ALTK Chatbot example with Silent Error Review") st.markdown( "This demo demonstrates using the ALTK to check for silent errors on an agent. The weather service will randomly silently fail. \ \n- With Silent Error Review, the silent error is detected and then the agent is suggested to retry. 
\ @@ -89,10 +145,10 @@ def get_weather(city: str, state: Annotated[dict, InjectedState]) -> dict[str, s with st.chat_message("user"): st.markdown(prompt) - st.session_state.messages.append({"role": "user", "content": prompt}) + st.session_state.messages.append(HumanMessage(content=prompt)) with st.chat_message("assistant"): - inputs = {"messages": [("user", prompt)]} + inputs = {"messages": [HumanMessage(content=prompt)]} result = agent.invoke(inputs) if tool_silent_error_raised: diff --git a/examples/langgraph_agent_sparc_example.py b/examples/langgraph_agent_sparc_example.py index 7e3675d2..19f862ab 100644 --- a/examples/langgraph_agent_sparc_example.py +++ b/examples/langgraph_agent_sparc_example.py @@ -163,9 +163,10 @@ def call_model(state: AgentState): builder.add_edge(START, "agent") builder.add_conditional_edges( "agent", - lambda state: "tool_pre_hook" - if state["messages"][-1].tool_calls - else "final_message", + lambda state: ( + "tool_pre_hook" if state["messages"][-1].tool_calls else "final_message" + ), + {"tool_pre_hook": "tool_pre_hook", "final_message": "final_message"}, ) builder.add_conditional_edges( "tool_pre_hook", diff --git a/examples/langgraph_agent_sparc_example_streamlit.py b/examples/langgraph_agent_sparc_example_streamlit.py index 20181795..ba2a607f 100644 --- a/examples/langgraph_agent_sparc_example_streamlit.py +++ b/examples/langgraph_agent_sparc_example_streamlit.py @@ -182,9 +182,9 @@ def call_model(state: AgentState): builder.add_edge(START, "agent") builder.add_conditional_edges( "agent", - lambda state: "tool_pre_hook" - if state["messages"][-1].tool_calls - else "final_message", + lambda state: ( + "tool_pre_hook" if state["messages"][-1].tool_calls else "final_message" + ), ) builder.add_conditional_edges( "tool_pre_hook", diff --git a/pyproject.toml b/pyproject.toml index d663130d..b35af3ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "langchain-text-splitters>=1.0.0", "nltk>=3.9.1", "scipy>=1.15.3", + "onnxruntime==1.23.2 ; python_version == '3.10'", # last version that supports python3.10 ] description = "The Agent Lifecycle Toolkit (ALTK) is a library of components to help agent builders improve their agent with minimal integration effort and setup." diff --git a/tests/post_tool/silent_review_json_data_test.py b/tests/post_tool/silent_review_json_data_test.py index aca3a421..e7c08352 100644 --- a/tests/post_tool/silent_review_json_data_test.py +++ b/tests/post_tool/silent_review_json_data_test.py @@ -26,7 +26,7 @@ def build_test_input() -> SilentReviewRunInput: }, tool_response={ "name": "get_weather", - "result": {"city": "NYC", "temperature": "75F", "condition": "Sunny"}, + "result": {"city": "NYC"}, }, ) @@ -50,7 +50,7 @@ def test_silent_review_json(): result = middleware.process(data=data, phase=AgentPhase.RUNTIME) - # the user query is suppposed to mention a city and it doesn't. The fact that we get a response back + # the user query is suppposed to mention a temperature and it doesn't. The fact that we get a response back # could indicate the presence of a silent error which is why the outcome is 0 assert result.outcome.value == 0.0 @@ -62,6 +62,6 @@ async def test_silent_review_json_async(): middleware = SilentReviewForJSONDataComponent(config=config) result = await middleware.aprocess(data=data, phase=AgentPhase.RUNTIME) - # the user query is suppposed to mention a city and it doesn't. The fact that we get a response back + # the user query is suppposed to mention a temperature and it doesn't. 
The fact that we get a response back # could indicate the presence of a silent error which is why the outcome is 0 assert result.outcome.value == 0.0 diff --git a/tests/pre_llm/routing/follow_up_detection/test_follow_up.py b/tests/pre_llm/routing/follow_up_detection/test_follow_up.py index 9d051181..94280ece 100644 --- a/tests/pre_llm/routing/follow_up_detection/test_follow_up.py +++ b/tests/pre_llm/routing/follow_up_detection/test_follow_up.py @@ -66,8 +66,9 @@ def test_follow_up_detected_by_callback(caplog, llm_client): AIMessage(content="For which year?"), ], user_query="2021", - detect_follow_up=lambda messages, user_query: user_query.isdigit() - and user_query == "2021", + detect_follow_up=lambda messages, user_query: ( + user_query.isdigit() and user_query == "2021" + ), ), phase=AgentPhase.RUNTIME, ) diff --git a/tests/pre_tool/toolguard/test_toolguard_specs.py b/tests/pre_tool/toolguard/test_toolguard_specs.py index 5630ed3d..f844a6d3 100644 --- a/tests/pre_tool/toolguard/test_toolguard_specs.py +++ b/tests/pre_tool/toolguard/test_toolguard_specs.py @@ -49,6 +49,7 @@ def out_dir(): # Main Test # --------------------------------------------------------------------------- @pytest.mark.asyncio +@pytest.mark.skip(reason="flaky test") async def test_tool_guard_calculator_policy(out_dir: str): funcs = [ divide_tool, diff --git a/uv.lock b/uv.lock index 38836ecf..adad1950 100644 --- a/uv.lock +++ b/uv.lock @@ -64,6 +64,7 @@ dependencies = [ { name = "nltk" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "openai" }, { name = "pydantic" }, { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -173,6 +174,7 @@ requires-dist = [ { name = "nl2flow", marker = "sys_platform != 'win32' and extra == 'refraction'", specifier = ">=0.1.2" }, { name = "nltk", specifier = ">=3.9.1" }, { name = "numpy", specifier = ">=2.2.6" }, + { name = "onnxruntime", marker = "python_full_version == '3.10.*'", specifier = "==1.23.2" }, { name = "openai", specifier = ">=1.0.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pydash", marker = "extra == 'routing'", specifier = ">=8.0.5" }, @@ -974,7 +976,8 @@ dependencies = [ { name = "mmh3" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "onnxruntime" }, + { name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "onnxruntime", version = "1.24.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "opentelemetry-api" }, { name = "opentelemetry-exporter-otlp-proto-grpc" }, { name = "opentelemetry-sdk" }, @@ -1044,6 +1047,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = 
"sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "coloredlogs" +version = "15.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "humanfriendly", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" }, +] + [[package]] name = "colorlog" version = "6.10.1" @@ -2355,6 +2370,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, ] +[[package]] +name = "humanfriendly" +version = "10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyreadline3", marker = "python_full_version < '3.11' and sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" }, +] + [[package]] name = "hypothesis" version = "6.151.5" @@ -4995,15 +5022,63 @@ wheels = [ [[package]] name = "onnxruntime" -version = "1.24.1" +version = "1.23.2" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'darwin'", + "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'darwin')", +] dependencies = [ - { name = "flatbuffers" }, + { name = "coloredlogs", marker = "python_full_version < '3.11'" }, + { name = "flatbuffers", marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "packaging", marker = "python_full_version < '3.11'" }, + { name = "protobuf", marker = "python_full_version < '3.11'" }, + { name = "sympy", marker = "python_full_version < '3.11'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/d6/311b1afea060015b56c742f3531168c1644650767f27ef40062569960587/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3", size = 17195934, upload-time = "2025-10-27T23:06:14.143Z" }, + { url = 
"https://files.pythonhosted.org/packages/db/db/81bf3d7cecfbfed9092b6b4052e857a769d62ed90561b410014e0aae18db/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b28740f4ecef1738ea8f807461dd541b8287d5650b5be33bca7b474e3cbd1f36", size = 19153079, upload-time = "2025-10-27T23:05:57.686Z" }, + { url = "https://files.pythonhosted.org/packages/2e/4d/a382452b17cf70a2313153c520ea4c96ab670c996cb3a95cc5d5ac7bfdac/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f7d1fe034090a1e371b7f3ca9d3ccae2fabae8c1d8844fb7371d1ea38e8e8d2", size = 15219883, upload-time = "2025-10-22T03:46:21.66Z" }, + { url = "https://files.pythonhosted.org/packages/fb/56/179bf90679984c85b417664c26aae4f427cba7514bd2d65c43b181b7b08b/onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ca88747e708e5c67337b0f65eed4b7d0dd70d22ac332038c9fc4635760018f7", size = 17370357, upload-time = "2025-10-22T03:46:57.968Z" }, + { url = "https://files.pythonhosted.org/packages/cd/6d/738e50c47c2fd285b1e6c8083f15dac1a5f6199213378a5f14092497296d/onnxruntime-1.23.2-cp310-cp310-win_amd64.whl", hash = "sha256:0be6a37a45e6719db5120e9986fcd30ea205ac8103fd1fb74b6c33348327a0cc", size = 13467651, upload-time = "2025-10-27T23:06:11.904Z" }, + { url = "https://files.pythonhosted.org/packages/44/be/467b00f09061572f022ffd17e49e49e5a7a789056bad95b54dfd3bee73ff/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f91d2c9b0965e86827a5ba01531d5b669770b01775b23199565d6c1f136616c", size = 17196113, upload-time = "2025-10-22T03:47:33.526Z" }, + { url = "https://files.pythonhosted.org/packages/9f/a8/3c23a8f75f93122d2b3410bfb74d06d0f8da4ac663185f91866b03f7da1b/onnxruntime-1.23.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:87d8b6eaf0fbeb6835a60a4265fde7a3b60157cf1b2764773ac47237b4d48612", size = 19153857, upload-time = "2025-10-22T03:46:37.578Z" }, + { url = "https://files.pythonhosted.org/packages/3f/d8/506eed9af03d86f8db4880a4c47cd0dffee973ef7e4f4cff9f1d4bcf7d22/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbfd2fca76c855317568c1b36a885ddea2272c13cb0e395002c402f2360429a6", size = 15220095, upload-time = "2025-10-22T03:46:24.769Z" }, + { url = "https://files.pythonhosted.org/packages/e9/80/113381ba832d5e777accedc6cb41d10f9eca82321ae31ebb6bcede530cea/onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da44b99206e77734c5819aa2142c69e64f3b46edc3bd314f6a45a932defc0b3e", size = 17372080, upload-time = "2025-10-22T03:47:00.265Z" }, + { url = "https://files.pythonhosted.org/packages/3a/db/1b4a62e23183a0c3fe441782462c0ede9a2a65c6bbffb9582fab7c7a0d38/onnxruntime-1.23.2-cp311-cp311-win_amd64.whl", hash = "sha256:902c756d8b633ce0dedd889b7c08459433fbcf35e9c38d1c03ddc020f0648c6e", size = 13468349, upload-time = "2025-10-22T03:47:25.783Z" }, + { url = "https://files.pythonhosted.org/packages/1b/9e/f748cd64161213adeef83d0cb16cb8ace1e62fa501033acdd9f9341fff57/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:b8f029a6b98d3cf5be564d52802bb50a8489ab73409fa9db0bf583eabb7c2321", size = 17195929, upload-time = "2025-10-22T03:47:36.24Z" }, + { url = "https://files.pythonhosted.org/packages/91/9d/a81aafd899b900101988ead7fb14974c8a58695338ab6a0f3d6b0100f30b/onnxruntime-1.23.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:218295a8acae83905f6f1aed8cacb8e3eb3bd7513a13fe4ba3b2664a19fc4a6b", size = 19157705, upload-time = 
"2025-10-22T03:46:40.415Z" }, + { url = "https://files.pythonhosted.org/packages/3c/35/4e40f2fba272a6698d62be2cd21ddc3675edfc1a4b9ddefcc4648f115315/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76ff670550dc23e58ea9bc53b5149b99a44e63b34b524f7b8547469aaa0dcb8c", size = 15226915, upload-time = "2025-10-22T03:46:27.773Z" }, + { url = "https://files.pythonhosted.org/packages/ef/88/9cc25d2bafe6bc0d4d3c1db3ade98196d5b355c0b273e6a5dc09c5d5d0d5/onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f9b4ae77f8e3c9bee50c27bc1beede83f786fe1d52e99ac85aa8d65a01e9b77", size = 17382649, upload-time = "2025-10-22T03:47:02.782Z" }, + { url = "https://files.pythonhosted.org/packages/c0/b4/569d298f9fc4d286c11c45e85d9ffa9e877af12ace98af8cab52396e8f46/onnxruntime-1.23.2-cp312-cp312-win_amd64.whl", hash = "sha256:25de5214923ce941a3523739d34a520aac30f21e631de53bba9174dc9c004435", size = 13470528, upload-time = "2025-10-22T03:47:28.106Z" }, + { url = "https://files.pythonhosted.org/packages/3d/41/fba0cabccecefe4a1b5fc8020c44febb334637f133acefc7ec492029dd2c/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2ff531ad8496281b4297f32b83b01cdd719617e2351ffe0dba5684fb283afa1f", size = 17196337, upload-time = "2025-10-22T03:46:35.168Z" }, + { url = "https://files.pythonhosted.org/packages/fe/f9/2d49ca491c6a986acce9f1d1d5fc2099108958cc1710c28e89a032c9cfe9/onnxruntime-1.23.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:162f4ca894ec3de1a6fd53589e511e06ecdc3ff646849b62a9da7489dee9ce95", size = 19157691, upload-time = "2025-10-22T03:46:43.518Z" }, + { url = "https://files.pythonhosted.org/packages/1c/a1/428ee29c6eaf09a6f6be56f836213f104618fb35ac6cc586ff0f477263eb/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45d127d6e1e9b99d1ebeae9bcd8f98617a812f53f46699eafeb976275744826b", size = 15226898, upload-time = "2025-10-22T03:46:30.039Z" }, + { url = "https://files.pythonhosted.org/packages/f2/2b/b57c8a2466a3126dbe0a792f56ad7290949b02f47b86216cd47d857e4b77/onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bace4e0d46480fbeeb7bbe1ffe1f080e6663a42d1086ff95c1551f2d39e7872", size = 17382518, upload-time = "2025-10-22T03:47:05.407Z" }, + { url = "https://files.pythonhosted.org/packages/4a/93/aba75358133b3a941d736816dd392f687e7eab77215a6e429879080b76b6/onnxruntime-1.23.2-cp313-cp313-win_amd64.whl", hash = "sha256:1f9cc0a55349c584f083c1c076e611a7c35d5b867d5d6e6d6c823bf821978088", size = 13470276, upload-time = "2025-10-22T03:47:31.193Z" }, + { url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" }, + { url = "https://files.pythonhosted.org/packages/b6/ca/862b1e7a639460f0ca25fd5b6135fb42cf9deea86d398a92e44dfda2279d/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145", size = 17394184, upload-time = "2025-10-22T03:47:08.127Z" }, +] + +[[package]] +name = "onnxruntime" +version = "1.24.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13' and platform_machine == 'x86_64' and 
sys_platform == 'darwin'", + "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'darwin'", + "(python_full_version >= '3.13' and platform_machine != 'x86_64') or (python_full_version >= '3.13' and sys_platform != 'darwin')", + "(python_full_version == '3.12.*' and platform_machine != 'x86_64') or (python_full_version == '3.12.*' and sys_platform != 'darwin')", + "(python_full_version == '3.11.*' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and sys_platform != 'darwin')", +] +dependencies = [ + { name = "flatbuffers", marker = "python_full_version >= '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "packaging" }, - { name = "protobuf" }, - { name = "sympy" }, + { name = "packaging", marker = "python_full_version >= '3.11'" }, + { name = "protobuf", marker = "python_full_version >= '3.11'" }, + { name = "sympy", marker = "python_full_version >= '3.11'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/d2/88/d9757c62a0f96b5193f8d447a141eefd14498c404cc5caf1a6f3233cf102/onnxruntime-1.24.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:79b3119ab9f4f3817062e6dbe7f4a44937de93905e3a31ba34313d18cb49e7be", size = 17212018, upload-time = "2026-02-05T17:32:13.986Z" }, @@ -6469,7 +6544,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "onnxruntime" }, + { name = "onnxruntime", version = "1.23.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "onnxruntime", version = "1.24.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "protobuf" }, { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -6649,6 +6725,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913", size = 10216, upload-time = "2024-09-29T09:24:11.978Z" }, ] +[[package]] +name = "pyreadline3" +version = "3.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" }, +] + [[package]] name = "pyright" version = "1.1.408" From cafb51fa28c96ed28e3ec97fdd527daae8d2b896 Mon Sep 17 
00:00:00 2001 From: github-actions Date: Mon, 16 Mar 2026 20:43:50 +0000 Subject: [PATCH 2/7] chore: bump version to 0.10.1 [skip ci] Signed-off-by: Osher-Elhadad --- CHANGELOG.md | 43 +++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 15604213..5930cb41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,49 @@ +## v0.10.1 (2026-03-16) + +### Bug Fixes + +- Bug in sparc type assignment + ([#108](https://github.com/AgentToolkit/agent-lifecycle-toolkit/pull/108), + [`beceaa5`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/beceaa5594cf8a84162c623f454710faef2f4179)) + +- Toolguard APIs ([#104](https://github.com/AgentToolkit/agent-lifecycle-toolkit/pull/104), + [`a9b90ba`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/a9b90bafc08af98100bc06313ac3120d0bf48e94)) + +- Toolguard comply with Python 3.10 + ([#109](https://github.com/AgentToolkit/agent-lifecycle-toolkit/pull/109), + [`c34467f`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/c34467f42bb7788ef251b0e5d3f88a8336bfbfc5)) + +- Toolguard updates + ([`4672cd2`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/4672cd2ac3e624ecab0814a3b0ca9676c89214e8)) + +- Update milvus-lite and disable associated tests + ([#111](https://github.com/AgentToolkit/agent-lifecycle-toolkit/pull/111), + [`64f356d`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/64f356d22be1e9cfe922ac903948910e8dc4989f)) + +- Update onnxruntime version for py3.10 + ([#110](https://github.com/AgentToolkit/agent-lifecycle-toolkit/pull/110), + [`187a7ac`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/187a7ac9ab8ef45d0b3e109da23df187698a5f35)) + +- Update silent review examples + ([#110](https://github.com/AgentToolkit/agent-lifecycle-toolkit/pull/110), + [`187a7ac`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/187a7ac9ab8ef45d0b3e109da23df187698a5f35)) + +### Documentation + +- Add ToolGuard component to landing page + ([#105](https://github.com/AgentToolkit/agent-lifecycle-toolkit/pull/105), + [`63e94ea`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/63e94ea7fb731c1a1ddc3c93ff94f034bcacb751)) + +- Update images ([#97](https://github.com/AgentToolkit/agent-lifecycle-toolkit/pull/97), + [`bb2ee3f`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/bb2ee3fea891d811a71389e5d60912ff092ee73e)) + +- Updates ([#96](https://github.com/AgentToolkit/agent-lifecycle-toolkit/pull/96), + [`dd5cec4`](https://github.com/AgentToolkit/agent-lifecycle-toolkit/commit/dd5cec4d560df0003535b47f520973f4242b6806)) + + ## v0.10.0 (2025-12-23) ### Bug Fixes diff --git a/pyproject.toml b/pyproject.toml index b35af3ae..fcf54029 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "agent-lifecycle-toolkit" requires-python = ">=3.10" -version = "0.10.0" +version = "0.10.1" classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries :: Application Frameworks", diff --git a/uv.lock b/uv.lock index adad1950..78eeb012 100644 --- a/uv.lock +++ b/uv.lock @@ -43,7 +43,7 @@ wheels = [ [[package]] name = "agent-lifecycle-toolkit" -version = "0.10.0" +version = "0.10.1" source = { editable = "." 
} dependencies = [ { name = "aiofiles" }, From 723dbf4f36d13c23621ac87dd181c523f54f3ee6 Mon Sep 17 00:00:00 2001 From: Osher-Elhadad Date: Tue, 5 May 2026 21:56:57 +0300 Subject: [PATCH 3/7] feat: add compact-tool-schema mode, runtime pipeline flag, and SPARC actionable recommendations - Add compact_tool_schema ('auto'|'never'|'always') and compact_tool_threshold config fields to SPARCReflectionConfig for scalable function-selection prompts - Add runtime_pipeline flag to switch between fast runtime prompts and evaluation-time prompts that return actionable unified-diff recommendations - Add SPARCRecommendation / SPARCRecommendationTarget types and _rubric_score, _rubric_confidence, _extract_recommendations helpers in sparc.py - Add CompactMode adapter with compact tool-inventory rendering in adapters.py - Restructure SPARC metric JSON files (general, function_selection, parameter) with runtime variants; add common_principles.py for shared metric principles - Update semantic_checker.py and transformation_prompts.py for new pipeline modes - Update output_parser.py with significant improvements - Add comprehensive test suite: compact schema adapter, eval metrics schema, no benchmark leakage, reflection score, runtime prompts, transformation prompts, and validating LLM client tests Signed-off-by: Osher-Elhadad --- altk/core/llm/output_parser.py | 215 +- altk/pre_tool/core/__init__.py | 4 + altk/pre_tool/core/config.py | 36 + altk/pre_tool/core/types.py | 109 + .../sparc/function_calling/metrics/base.py | 10 + .../metrics/common_principles.py | 57 + .../metrics/function_call/general.py | 1 + .../function_call/general_metrics.json | 2171 +++++++++-------- .../general_metrics_runtime.json | 1577 ++++++------ .../function_selection/function_selection.py | 1 + .../function_selection_metrics.json | 1172 ++++----- .../function_selection_metrics_runtime.json | 910 +++---- .../metrics/parameter/parameter.py | 1 + .../metrics/parameter/parameter_metrics.json | 1218 ++++----- .../parameter/parameter_metrics_runtime.json | 926 +++---- .../function_calling/pipeline/adapters.py | 60 +- .../function_calling/pipeline/pipeline.py | 4 + .../pipeline/semantic_checker.py | 18 +- .../pipeline/transformation_prompts.py | 67 +- altk/pre_tool/sparc/sparc.py | 127 + tests/core/test_validating_llm_client.py | 308 +++ .../sparc/test_adapter_compact_schema.py | 204 ++ .../sparc/test_eval_metrics_schema.py | 214 ++ .../sparc/test_no_benchmark_leakage.py | 163 ++ tests/pre_tool/sparc/test_reflection_score.py | 289 +++ tests/pre_tool/sparc/test_runtime_prompts.py | 286 +++ .../sparc/test_transformation_prompts.py | 63 + 27 files changed, 6319 insertions(+), 3892 deletions(-) create mode 100644 altk/pre_tool/sparc/function_calling/metrics/common_principles.py create mode 100644 tests/core/test_validating_llm_client.py create mode 100644 tests/pre_tool/sparc/test_adapter_compact_schema.py create mode 100644 tests/pre_tool/sparc/test_eval_metrics_schema.py create mode 100644 tests/pre_tool/sparc/test_no_benchmark_leakage.py create mode 100644 tests/pre_tool/sparc/test_reflection_score.py create mode 100644 tests/pre_tool/sparc/test_runtime_prompts.py create mode 100644 tests/pre_tool/sparc/test_transformation_prompts.py diff --git a/altk/core/llm/output_parser.py b/altk/core/llm/output_parser.py index 7a4e15f2..8d47f88e 100644 --- a/altk/core/llm/output_parser.py +++ b/altk/core/llm/output_parser.py @@ -26,8 +26,25 @@ def json_schema_to_pydantic_model( - schema: Dict[str, Any], model_name: str = "AutoModel" + schema: Dict[str, 
Any], + model_name: str = "AutoModel", + free_form_object_as_str: bool = False, ) -> Type[BaseModel]: + """Build a Pydantic model from a JSON Schema dict. + + Args: + schema: JSON Schema dict. + model_name: name of the generated Pydantic model. + free_form_object_as_str: when ``True``, any free-form ``type: object`` + property (one without its own ``properties`` sub-schema) is + modeled as a JSON-formatted ``str`` instead of a ``dict``. This + is the workaround for OpenAI's structured-output API, which + requires ``additionalProperties: false`` on every object schema — + a constraint that free-form dicts cannot meet. The caller is + expected to use :func:`relax_freeform_object_schema` when + validating the raw output so the JSON-string form is accepted. + Default ``False`` preserves backward-compatible behavior. + """ fields = {} required_fields = set(schema.get("required", [])) @@ -41,9 +58,25 @@ def json_schema_to_pydantic_model( "null": type(None), } - def parse_type(type_def: Union[str, List[str]]) -> Type[T]: + def _map_object_for_prop(prop_schema: Dict[str, Any]) -> Type: + """Return dict/str for a property whose declared type is ``object``. + + A property is "free-form" if it has no ``properties`` sub-schema; the + OpenAI workaround only applies to those. + """ + if free_form_object_as_str and "properties" not in prop_schema: + return str + return dict + + def parse_type( + type_def: Union[str, List[str], None], + prop_schema: Dict[str, Any], + ) -> Type[T]: + def _lookup(t: str) -> Type: + return _map_object_for_prop(prop_schema) if t == "object" else type_mapping.get(t, Any) + if isinstance(type_def, list): - python_types = [type_mapping.get(t, Any) for t in type_def] + python_types = [_lookup(t) for t in type_def] if type(None) in python_types: python_types.remove(type(None)) if len(python_types) == 1: @@ -52,11 +85,12 @@ def parse_type(type_def: Union[str, List[str]]) -> Type[T]: return Optional[Union[tuple(python_types)]] # type: ignore else: return Union[tuple(python_types)] # type: ignore - else: - return type_mapping.get(type_def, Any) + if isinstance(type_def, str): + return _lookup(type_def) + return Any # type: ignore[return-value] for prop_name, prop_schema in schema.get("properties", {}).items(): - field_type: Any = parse_type(prop_schema.get("type")) + field_type: Any = parse_type(prop_schema.get("type"), prop_schema) default = ... if prop_name in required_fields else None description = prop_schema.get("description", None) field_args = {"description": description} if description else {} @@ -65,6 +99,28 @@ def parse_type(type_def: Union[str, List[str]]) -> Type[T]: return create_model(model_name, **fields) # type: ignore +def relax_freeform_object_schema(schema: Dict[str, Any]) -> Dict[str, Any]: + """Return a deep copy of *schema* with free-form ``"type": "object"`` + properties widened to accept ``"string"`` as well. + + This is the validation-time counterpart to + ``json_schema_to_pydantic_model(..., free_form_object_as_str=True)``: when + the Pydantic model emits a JSON string for a free-form object field, + ``jsonschema.validate`` against the original schema would reject it. This + helper widens those fields so the same schema accepts both object-literal + and stringified forms. Schemas where the object has sub-``properties`` are + left alone. 
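Taken together, the two helpers support the round trip described above. A small sketch under an illustrative schema, where "metadata" is the free-form object property:

import json

import jsonschema

from altk.core.llm.output_parser import (
    json_schema_to_pydantic_model,
    relax_freeform_object_schema,
)

schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "metadata": {"type": "object"},  # free-form: no "properties" sub-schema
    },
    "required": ["name"],
}

# Free-form "metadata" is modeled as a JSON-formatted str, which lets the
# OpenAI structured-output path require additionalProperties: false.
Model = json_schema_to_pydantic_model(schema, free_form_object_as_str=True)
out = Model(name="run-1", metadata=json.dumps({"k": "v"}))

# Validation-time counterpart: the relaxed schema accepts the stringified
# form as well as the object literal.
jsonschema.validate(
    instance=out.model_dump(),
    schema=relax_freeform_object_schema(schema),
)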
+ """ + import copy + + relaxed = copy.deepcopy(schema) + for _prop, prop_schema in relaxed.get("properties", {}).items(): + t = prop_schema.get("type") + if t == "object" and "properties" not in prop_schema: + prop_schema["type"] = ["object", "string"] + return relaxed + + class OutputValidationError(Exception): """Raised when LLM output cannot be validated against the provided schema.""" @@ -82,8 +138,113 @@ class ValidatingLLMClient(BaseLLMClient, ABC): - Validates and parses the response. - Retries only invalid items (single or batch) up to `retries` times. - Falls back to single-item loops if no batch method is configured. + + Production knobs (instance-level, with class-level defaults): + - ``free_form_object_as_str``: when ``True``, free-form ``type: object`` + schema fields are modeled in Pydantic as ``str`` (and the validation + schema is widened at runtime to accept both object and string). Use + this for providers that require ``additionalProperties: false`` on + every object schema (notably OpenAI's structured-output API). + - ``prompt_based_validation``: when ``True``, the schema is always + injected into the system prompt and no native ``response_format`` + kwarg is forwarded. Use for providers that don't support OpenAI-style + structured output (e.g. watsonx). + - ``default_generation_kwargs``: dict of kwargs merged into every + ``generate``/``generate_async`` call (e.g. ``{"max_tokens": 8096, + "temperature": 0}``). Caller-provided kwargs override the defaults. """ + # Class-level defaults — override on subclasses or per instance in + # ``configure_validation`` / constructor kwargs. + free_form_object_as_str: bool = False + prompt_based_validation: bool = False + + def __init__( + self, + *, + free_form_object_as_str: Optional[bool] = None, + prompt_based_validation: Optional[bool] = None, + default_generation_kwargs: Optional[Dict[str, Any]] = None, + **base_kwargs: Any, + ) -> None: + if free_form_object_as_str is not None: + self.free_form_object_as_str = free_form_object_as_str + if prompt_based_validation is not None: + self.prompt_based_validation = prompt_based_validation + self.default_generation_kwargs: Dict[str, Any] = dict( + default_generation_kwargs or {} + ) + super().__init__(**base_kwargs) + # Wrap the subclass's _parse_llm_response so empty / malformed LLM + # outputs retry gracefully (the retry loop treats "" as invalid) + # rather than raising an unrecoverable ValueError. + # This particularly covers reasoning models that exhaust max_tokens + # on "thinking" tokens and return finish_reason="length" with no + # content but non-empty reasoning_content. 
+ orig_parse = self._parse_llm_response + self._parse_llm_response = self._build_safe_parse(orig_parse) # type: ignore[assignment] + + def configure_validation( + self, + *, + free_form_object_as_str: Optional[bool] = None, + prompt_based_validation: Optional[bool] = None, + default_generation_kwargs: Optional[Dict[str, Any]] = None, + ) -> "ValidatingLLMClient": + """Update the validation knobs after construction (chainable).""" + if free_form_object_as_str is not None: + self.free_form_object_as_str = free_form_object_as_str + if prompt_based_validation is not None: + self.prompt_based_validation = prompt_based_validation + if default_generation_kwargs is not None: + self.default_generation_kwargs = dict(default_generation_kwargs) + return self + + @staticmethod + def _build_safe_parse(orig): # noqa: ANN001, ANN205 + """Wrap ``_parse_llm_response`` so parse failures become retry-worthy + empty strings instead of raising. Also surfaces a targeted warning + when a reasoning-only response exhausted the token budget.""" + import logging as _logging + + _logger = _logging.getLogger("altk.core.llm.output_parser") + + def _safe_parse(raw): # noqa: ANN001, ANN202 + try: + return orig(raw) + except (ValueError, KeyError): + # Detect: choice with reasoning_content but finish_reason='length' + _choices = getattr(raw, "choices", None) or ( + raw.get("choices", []) if isinstance(raw, dict) else [] + ) + if _choices: + c0 = _choices[0] + _msg = getattr(c0, "message", None) or ( + c0.get("message", {}) if isinstance(c0, dict) else {} + ) + _reasoning = ( + getattr(_msg, "reasoning_content", None) + or ( + _msg.get("reasoning_content") + if isinstance(_msg, dict) + else None + ) + ) + _finish = getattr(c0, "finish_reason", None) or ( + c0.get("finish_reason") if isinstance(c0, dict) else None + ) + if _reasoning and _finish == "length": + _logger.warning( + "LLM reasoning consumed the entire token budget " + "(finish_reason='length'). Consider increasing " + "max_tokens. Will retry." + ) + return "" + _logger.debug("LLM returned empty/unparseable response; will retry.") + return "" + + return _safe_parse + @classmethod @abstractmethod def provider_class(cls) -> Type[Any]: @@ -168,8 +329,16 @@ def _validate( raise ImportError( "jsonschema is required for JSON Schema validation. Install with: pip install jsonschema" ) + # Widen free-form object props to also accept strings when we're + # configured to round-trip them as JSON strings (see + # ``free_form_object_as_str`` in the class docstring). + effective_schema = ( + relax_freeform_object_schema(schema) + if self.free_form_object_as_str + else schema + ) try: - jsonschema.validate(instance=data, schema=schema) + jsonschema.validate(instance=data, schema=effective_schema) except jsonschema.ValidationError as e: raise OutputValidationError( f"JSON Schema validation error: {e.message}" @@ -225,6 +394,17 @@ def generate( """ Synchronous single-item generation with validation + retries. """ + # Instance defaults — caller kwargs win. + if self.default_generation_kwargs: + merged = {**self.default_generation_kwargs} + merged.update(kwargs) + kwargs = merged + # Providers that don't support native structured output switch to + # prompt-based schema injection and drop any OpenAI-style + # ``response_format`` field. 
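The reasoning-budget case the wrapper above guards against, as a concrete raw response — illustrative, and assuming the subclass's _parse_llm_response raises on the empty content:

raw = {
    "choices": [
        {
            "finish_reason": "length",
            "message": {"content": "", "reasoning_content": "…thinking…"},
        }
    ]
}
# _safe_parse(raw) catches the parser's ValueError, spots the combination
# of non-empty reasoning_content and finish_reason == "length", logs the
# "consider increasing max_tokens" warning, and returns "" so the retry
# loop treats the attempt as invalid instead of crashing.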
+ if self.prompt_based_validation: + include_schema_in_system_prompt = True + schema_field = None current = prompt instr = None if include_schema_in_system_prompt: @@ -233,7 +413,10 @@ def generate( if schema_field: kwargs[schema_field] = schema if isinstance(schema, dict): - new_schema = json_schema_to_pydantic_model(schema) + new_schema = json_schema_to_pydantic_model( + schema, + free_form_object_as_str=self.free_form_object_as_str, + ) kwargs[schema_field] = new_schema last_error: Optional[str] = None @@ -289,6 +472,17 @@ async def generate_async( """ Asynchronous single-item generation with validation + retries. """ + # Instance defaults — caller kwargs win. + if self.default_generation_kwargs: + merged = {**self.default_generation_kwargs} + merged.update(kwargs) + kwargs = merged + # Providers that don't support native structured output switch to + # prompt-based schema injection and drop any OpenAI-style + # ``response_format`` field. + if self.prompt_based_validation: + include_schema_in_system_prompt = True + schema_field = None current = prompt instr = None if include_schema_in_system_prompt: @@ -297,7 +491,10 @@ async def generate_async( if schema_field: kwargs[schema_field] = schema if isinstance(schema, dict): - new_schema = json_schema_to_pydantic_model(schema) + new_schema = json_schema_to_pydantic_model( + schema, + free_form_object_as_str=self.free_form_object_as_str, + ) kwargs[schema_field] = new_schema last_error: Optional[str] = None diff --git a/altk/pre_tool/core/__init__.py b/altk/pre_tool/core/__init__.py index 04ec3a9e..00580858 100644 --- a/altk/pre_tool/core/__init__.py +++ b/altk/pre_tool/core/__init__.py @@ -6,6 +6,8 @@ SPARCReflectionDecision, SPARCReflectionIssue, SPARCReflectionIssueType, + SPARCRecommendation, + SPARCRecommendationTarget, PreToolReflectionRunInput, PreToolReflectionRunOutput, PreToolReflectionBuildInput, @@ -28,6 +30,8 @@ "SPARCReflectionDecision", "SPARCReflectionIssue", "SPARCReflectionIssueType", + "SPARCRecommendation", + "SPARCRecommendationTarget", "PreToolReflectionRunInput", "PreToolReflectionRunOutput", "PreToolReflectionBuildInput", diff --git a/altk/pre_tool/core/config.py b/altk/pre_tool/core/config.py index 03e458f3..5bad4a55 100644 --- a/altk/pre_tool/core/config.py +++ b/altk/pre_tool/core/config.py @@ -90,10 +90,46 @@ class SPARCReflectionConfig(BaseModel): description="Whether to include raw reflection pipeline response in output", ) + # Tool-inventory rendering (function_selection / agentic_constraints prompts) + compact_tool_schema: str = Field( + default="auto", + description=( + "How to render the tool inventory in function-selection prompts: " + "'auto' (compact form once len(specs) >= compact_tool_threshold, " + "full summary otherwise), 'never' (always full summary: " + "description + {param_name: type}), 'always' (always compact: " + "description + list of parameter names)." + ), + pattern="^(auto|never|always)$", + ) + compact_tool_threshold: int = Field( + default=20, + description=( + "Tool-count threshold for compact_tool_schema='auto'. Inventories " + "with at least this many tools switch to the compact form." + ), + ge=1, + ) + verbose_logging: bool = Field( default=False, description="Enable verbose logging for debugging" ) + # Prompt-variant selection: runtime (default, fast prompts, no actionable + # recommendations) vs. evaluation-time (heavier prompts that return + # unified-diff recommendations with importance scores). 
Evaluation mode + # is meant for post-hoc analysis / prompt improvement feedback, not for + # low-latency pre-action reflection. + runtime_pipeline: bool = Field( + default=True, + description=( + "If True, load the *_runtime.json metric files (faster, no " + "actionable recommendations). If False, load the evaluation-time " + "metric files that emit unified-diff recommendations with " + "importance scores — useful for offline analysis." + ), + ) + class Config: """Pydantic configuration.""" diff --git a/altk/pre_tool/core/types.py b/altk/pre_tool/core/types.py index c41ac0c6..a3341fb0 100644 --- a/altk/pre_tool/core/types.py +++ b/altk/pre_tool/core/types.py @@ -24,6 +24,59 @@ class SPARCReflectionIssueType(str, Enum): ERROR = "error" +class SPARCRecommendationTarget(str, Enum): + """Artifact a recommendation is meant to be applied to.""" + + SYSTEM_PROMPT = "system_prompt" + TOOL_DESCRIPTION = "tool_description" + PARAMETER_DESCRIPTION = "parameter_description" + PARAMETER_EXAMPLES = "parameter_examples" + + +class SPARCRecommendation(BaseModel): + """Actionable recommendation for improving an agent's prompts or tool specs. + + Emitted ONLY in evaluation-time mode (``runtime_pipeline=False``). + Runtime mode omits recommendations to keep prompts short and latency low. + """ + + target: SPARCRecommendationTarget = Field( + description="Which artifact the diff applies to.", + ) + tool_name: Optional[str] = Field( + default=None, + description=( + "Required when target is TOOL_DESCRIPTION, PARAMETER_DESCRIPTION, " + "or PARAMETER_EXAMPLES; None for SYSTEM_PROMPT." + ), + ) + parameter_name: Optional[str] = Field( + default=None, + description=( + "Required when target is PARAMETER_DESCRIPTION or " + "PARAMETER_EXAMPLES; None otherwise." + ), + ) + diff: str = Field( + description=( + "Unified git-diff-format patch of the proposed change. " + "Example: '--- a/system_prompt\\n+++ b/system_prompt\\n@@\\n" + "-old line\\n+new line'." + ), + ) + rationale: str = Field( + description="Brief reason why this change addresses the observed issue.", + ) + importance: float = Field( + ge=0.0, + le=1.0, + description=( + "How impactful this recommendation is at preventing similar " + "issues (0=cosmetic, 1=critical)." + ), + ) + + class SPARCReflectionIssue(BaseModel): """Represents an issue identified during reflection.""" @@ -31,6 +84,29 @@ class SPARCReflectionIssue(BaseModel): metric_name: str explanation: str correction: Optional[Dict[str, Any]] = None + # Per-metric rubric rating (integer 1-5) for semantic metrics; None for + # static/transform/error issues that don't produce a rubric output. + output_value: Optional[float] = Field( + default=None, + description=( + "Raw rubric rating returned by the metric's LLM judge (int 1-5 for " + "semantic metrics). None for static / transform / error issues." + ), + ) + # Model's self-reported confidence in its judgment, in [0, 1]. + confidence: Optional[float] = Field( + default=None, + description="Model-reported confidence in the assessment (0.0-1.0).", + ) + # Recommendations returned by the metric's LLM judge (evaluation-time + # mode only). Empty list in runtime mode. + recommendations: List[SPARCRecommendation] = Field( + default_factory=list, + description=( + "Actionable recommendations emitted by this metric's evaluation-" + "time prompt. Empty in runtime_pipeline=True mode." 
+ ), + ) class SPARCReflectionResult(BaseModel): @@ -38,12 +114,45 @@ class SPARCReflectionResult(BaseModel): decision: SPARCReflectionDecision issues: List[SPARCReflectionIssue] = Field(default_factory=list) + # Aggregated rubric score across all semantic metrics that produced a + # rating (mean of output_value's). Scale: 1-5 (same as per-metric). ``None`` + # when no semantic metric ran (e.g. static-only or transform-only track). + score: Optional[float] = Field( + default=None, + description=( + "Aggregated 1-5 rubric score: mean of the per-metric output_value " + "ratings across every semantic metric that produced a rating. " + "None if no semantic metrics contributed a rating." + ), + ) + # Flat view of every recommendation emitted across all issues — handy + # for downstream CLEAR aggregation which rolls up by tool / system + # prompt. Empty in runtime_pipeline=True mode. + all_recommendations: List[SPARCRecommendation] = Field( + default_factory=list, + description=( + "Flat list of every SPARCRecommendation emitted by any metric " + "in this reflection. Empty in runtime_pipeline=True mode." + ), + ) @property def has_issues(self) -> bool: """Check if any issues were found.""" return len(self.issues) > 0 + @property + def approved(self) -> bool: + """Boolean convenience form of the APPROVE decision.""" + return self.decision == SPARCReflectionDecision.APPROVE + + @property + def normalized_score(self) -> Optional[float]: + """``score`` mapped from 1-5 into 0.0-1.0 (None when ``score`` is None).""" + if self.score is None: + return None + return max(0.0, min(1.0, (self.score - 1.0) / 4.0)) + class PreToolReflectionRunInput(ComponentInput): tool_specs: list[dict[str, Any]] = Field( diff --git a/altk/pre_tool/sparc/function_calling/metrics/base.py b/altk/pre_tool/sparc/function_calling/metrics/base.py index e15dbea8..f298350c 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/base.py +++ b/altk/pre_tool/sparc/function_calling/metrics/base.py @@ -1,5 +1,8 @@ from abc import ABC +from altk.pre_tool.sparc.function_calling.metrics.common_principles import ( + COMMON_PRINCIPLES, +) from altk.pre_tool.sparc.metrics import Metric, MetricPrompt @@ -9,6 +12,12 @@ class FunctionMetricsPrompt(MetricPrompt, ABC): Subclasses must define class attrs: - system_template: str - user_template: str + + The ``{{ common_principles }}`` placeholder in ``system_template`` is + filled from :mod:`.common_principles` so every function-call metric + inherits the same production-quality guardrails (evidence hierarchy, + trajectory awareness, redundancy-by-args, recovery-after-failure, + confirmation-scope, read-only-exploration). """ system_template: str @@ -20,6 +29,7 @@ def __init__(self, metric: Metric, task_description: str) -> None: system_template=self.system_template, user_template=self.user_template, system_kwargs_defaults={ + "common_principles": COMMON_PRINCIPLES, "task_description": task_description, "metric_jsonschema": metric.to_jsonschema(), }, diff --git a/altk/pre_tool/sparc/function_calling/metrics/common_principles.py b/altk/pre_tool/sparc/function_calling/metrics/common_principles.py new file mode 100644 index 00000000..501ba6a7 --- /dev/null +++ b/altk/pre_tool/sparc/function_calling/metrics/common_principles.py @@ -0,0 +1,57 @@ +"""Shared evaluation principles injected into every function-calling metric prompt. + +These were previously duplicated across 5 runtime JSON files. 
Centralizing +them keeps production-quality guardrails consistent and keeps the per-metric +``task_description`` focused on what's unique to that metric. + +The block is injected via ``FunctionMetricsPrompt.__init__`` into the system +template placeholder ``{{ common_principles }}``. +""" + +# Keep this block concise — every metric prompt pays for it in tokens. +COMMON_PRINCIPLES = """\ +### Common Evaluation Principles + +Apply these rules across every judgment. They override anything in the +task description that conflicts with them. + +1. Evidence hierarchy — when sources disagree, trust them in this order: + system prompt > tool outputs > user messages > assistant messages. + Treat values the system prompt fixes (policy, current date/year, + identity, environment anchors) as ground truth even if an assistant + turn contradicts them. +2. Trajectory awareness — the call under review may be one step in an + ongoing trajectory; additional tool calls may follow and will be + judged separately. Do NOT penalize a call for not, by itself, + completing the user's full goal. +3. Redundancy — the clearest case is when a prior call used the SAME + function name AND the SAME arguments AND its result is still valid. + Two other patterns also count: + - Information already obtained: a prior tool output contains the + value the agent is now re-querying, or the agent re-reads a + record it just wrote in the same turn. + - Parameter-permutation spinning: the agent calls the SAME function + again after a prior same-function call returned empty or errored, + with only reordered arguments, flipped boolean flags to their + defaults, pagination bumps on an empty result set, or similar + surface changes — and NO new conversational information has + arrived since the prior call. Legitimate recovery requires a + different strategy: a different tool, an argument change grounded + in new information the agent just received, or a user-intent + pivot. Shuffling surface params on a dead query is not recovery. + Beyond these patterns, different arguments or a meaningful change + in context (time, state, scope) are exploration, not redundancy. +4. Recovery after failure — if earlier tool calls returned empty + results, errors, or validation failures, subsequent calls that try + different parameters, alternative tools, or fallback strategies are + legitimate recovery. Do not fault "should have called X first" when + X already ran and failed. +5. Read-only exploration passes — a read-only or information-gathering + call that is plausibly relevant to the user's request should be + APPROVED even if strictly unnecessary, UNLESS it exposes sensitive + data the user did not authorize or contradicts an explicit scope. +6. Evidence-based judgment — base every label on explicit evidence. + When evidence is ambiguous, do NOT fabricate a problem. Plausible + parameters from visible context are not "hallucinated" merely + because the value is not repeated verbatim elsewhere. 
+""" diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_call/general.py b/altk/pre_tool/sparc/function_calling/metrics/function_call/general.py index fa3dd3b9..1df6d894 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_call/general.py +++ b/altk/pre_tool/sparc/function_calling/metrics/function_call/general.py @@ -4,6 +4,7 @@ ) _general_system = ( + "{{ common_principles }}\n\n" "### Task Description and Role:\n\n" "{{ task_description }}\n\n" "Your output must conform to the following JSON schema, in the same order as the fields appear in the schema:\n" diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics.json b/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics.json index d4ff2e14..53f04167 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics.json +++ b/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics.json @@ -1,1088 +1,1211 @@ [ - { - "name": "general_hallucination_check", - "task_description": "Evaluate whether each parameter value in the function call is correct and **directly supported** by the provided conversation history and adhere the tool specifications. Your assessment must be based **strictly on explicit evidence** from these sources and correctly formatted based on the API specifications. Do **not** assume or hallucinate any information that is not clearly documented and provided.\n\n---\n\n#### 1. Grounding Sources\n\nA parameter value is considered grounded if it originates from one of the following:\n\n- An explicit user message in the conversation \n- An assistant message that the user confirmed or acknowledged \n- The output of a previous tool call \n- A documented default value in the API specification \n\n---\n\n#### 2. Parameter Value Classification\n\nEach parameter value must be labeled using one of the following categories:\n\n- **CORRECT** \n The value is explicitly mentioned, clearly implied in the conversation, or matches a documented default.\n\n- **MISSING_INFORMATION** \n The value is underspecified or incomplete given the current context (e.g., '05/06' when a full date is required).\n\n- **FORMAT ERROR** \n The value is conceptually correct but incorrectly formatted (e.g., wrong units, casing, or structure) based on the API specifications and descriptions.\n\n- **CONTRADICTORY_VALUES** \n The value violates documented constraints or logical relationships between parameters (e.g., `start_time` should be < `end_time`).\n\n- **DEFAULT_ISSUE** \n The default value of the parameter is not the correct value based on the conversation history.\n\n---\n\n#### 3. Handling Default Values\n\n- Optional parameters may only use default values that are **explicitly documented** in the API specification. \n- Defaults that are assumed or undocumented count as hallucinations. \n- If no value is provided and no default exists, the parameter should be omitted.\n\n---\n\n#### 4. Acceptable Transformations\n\nSome transformations are permitted, but only when grounded:\n\n- **Synonyms** (e.g., \"NYC\" -> \"New York\") are allowed only when normalized in the conversation. \n- **Unit conversions** are valid only if the conversation explicitly mentions the conversion. \n- **Format changes** (e.g., date or time reformatting) are acceptable only if acknowledged by the assistant or supported by the specification.\n\n---\n\n#### 5. 
Multi-Call Context\n\nIf the tool call appears in a sequence:\n\n- You may use outputs from earlier tool calls to justify parameter values. \n- Consider dependencies and ordering between calls when assessing grounding. \n\n---\n\n#### Conservative Judgment Principle\n\nWhen in doubt, err on the side of caution. If grounding cannot be clearly established, treat the parameter as incorrect.\n", - "jsonschema": { - "title": "general_hallucination_check", - "description": "Assessment of tool call grounding accuracy, following the rubric defined above.", - "type": "object", - "additionalProperties": false, - "properties": { - "evidence": { - "type": "string", - "description": "Provide EXACT quotes from user messages, assistant responses, prior tool outputs, or API specifications that demonstrate whether each parameter value is grounded. Do not paraphrase or interpret-use only direct evidence." - }, - "explanation": { + { + "name": "general_hallucination_check", + "task_description": "Evaluate whether each parameter value in the function call is correct and **directly supported** by the provided conversation history AND adheres to the API specification. Label each parameter using one of the categories below.\n\n---\n\n#### 1. Grounding Sources\n\nA value is grounded when it comes from any of these paths:\n\n- **Explicit statement** \u2014 an explicit user message, an assistant message the user confirmed, a previous tool output, or a value fixed by the system prompt.\n- **Documented default** \u2014 a default that the API spec documents for the parameter.\n- **Pick-one from explicit options** \u2014 the user enumerated several acceptable values and the agent picked one of them. The pick is grounded even if the user did not name a single preferred value.\n- **Specialization from an abstract set/range/category** \u2014 the user named a set (\"any of these\"), a range (\"around N\"), or a category (\"a fast one\"), and the agent supplied a concrete instance that falls INSIDE the user's named scope. A concrete value that satisfies the user's abstract constraint is grounded.\n- **Plausible optional value** \u2014 for OPTIONAL parameters (not in the spec's `required` array), any value consistent with the documented type/enum/pattern is grounded unless it contradicts explicit user intent or a higher-priority source.\n\n---\n\n#### 2. Parameter Value Classification\n\n- **CORRECT** \u2014 grounded via any path in \u00a71 (explicit statement, documented default, pick-one from options, specialization inside the user's named set, or a plausible optional value).\n- **MISSING_INFORMATION** \u2014 required AND no grounding path in \u00a71 applies: no explicit evidence, no documented default, no user-provided option set to pick from, no abstract user intent that can be specialized, and no plausible optional-value interpretation. Do NOT use this label just because the value is not quoted verbatim from the user.\n- **FORMAT_ERROR** \u2014 conceptually right value but wrong format (units, casing, structure, pattern) per the spec. 
Format issues are separate from value issues \u2014 a value that is semantically wrong is NOT a FORMAT_ERROR.\n- **CONTRADICTORY_VALUES** \u2014 the value violates a documented constraint, a logical relationship between parameters, OR falls OUTSIDE the user's explicitly named set/range/category (specialization grounding failed because the concrete choice is not a member of the abstract intent).\n- **DEFAULT_ISSUE** \u2014 a default is used but the correct value is determinable from context.\n\n---\n\n#### 3. Default and Optional Values\n\nOptional parameters (NOT in `required`) may be omitted OR set to any value consistent with the documented type/enum/pattern. Never label omission of an optional parameter as MISSING_INFORMATION. A plausible optional value \u2014 including a neutral boolean (e.g. `false`), a neutral numeric default, or an empty filter \u2014 is CORRECT unless it contradicts explicit user intent or a documented constraint. For REQUIRED parameters, only explicitly documented defaults count as grounded defaults; an un-documented default is DEFAULT_ISSUE.\n\n---\n\n#### 4. Acceptable Transformations\n\nSynonyms, unit conversions, and format re-formattings are valid when supported by the conversation or the spec. An under-specified value (e.g. one missing a year, region, unit, or other component) should be grounded from a higher-priority source (system prompt, prior tool output) before being flagged \u2014 the grounded canonical form is CORRECT.\n\n---\n\n#### Required-Parameter Omission\n\nA parameter listed in the spec's `required` array that is completely absent from the arguments is ALWAYS MISSING_INFORMATION \u2014 no grounding path (explicit statement, documented default, pick-one, specialization, or plausible-optional) substitutes for actually providing a value. The grounding paths only apply to values the agent ACTUALLY PROVIDED; they never excuse omission of a required parameter.\n\n#### User-Context Discarded\n\nWhen a parameter is spec-optional but the user (or prior tool output / system prompt) provided an explicit narrowing value for it \u2014 a search term, a filter, a category, an identifier \u2014 and the agent OMITS the parameter or sets it to an \"empty\" sentinel (empty string, null, default wildcard), this is NOT a \"plausible optional value\". It is USER_CONTEXT_DISCARDED and should be labeled CONTRADICTORY_VALUES (the call contradicts explicit user intent). Score the call 2-3 accordingly. This rule applies regardless of whether the spec marks the parameter as required; it turns on whether the user supplied narrowing context for it.\n\n---\n\n#### Stringency\n\nApply this rubric strictly for REQUIRED parameters whose value is missing, fabricated, or inconsistent with the spec. Apply it leniently for OPTIONAL parameters (not in the spec's `required` array) and for values reasonably inferable from visible context. A value is NOT an issue unless it contradicts explicit user intent, a documented constraint, or higher-priority evidence.\n\n---\n\n#### Actionable Recommendations\n\nWhen the call reveals a fixable gap in the agent's prompts or tool specifications, emit one or more `actionable_recommendations` as unified-diff patches. 
Each recommendation specifies: `target` (which artifact to change: `system_prompt`, `tool_description`, `parameter_description`, `parameter_examples`), an optional `tool_name` / `parameter_name` for tool-scoped targets, a `diff` in unified-diff format, a short `rationale`, and an `importance` score in [0, 1] (0 = cosmetic, 1 = critical). When the call has no fixable artifact gap \u2014 or the call is already correct \u2014 return an empty array.",
+    "jsonschema": {
+      "title": "general_hallucination_check",
+      "description": "Assessment of tool call grounding accuracy, following the rubric defined above.",
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "evidence": {
+          "type": "string",
+          "description": "Provide EXACT quotes from user messages, assistant responses, prior tool outputs, or API specifications that demonstrate whether each parameter value is grounded. Do not paraphrase or interpret; use only direct evidence."
+        },
+        "explanation": {
+          "type": "string",
+          "description": "For EACH parameter value, explain its grounding status by citing specific evidence from the conversation history or tool specification. Address whether the value is correctly grounded, hallucinated, needs more information, has format errors, or contradicts other parameters. Include direct quotes that support or contradict the value."
+        },
+        "output": {
+          "type": "integer",
+          "minimum": 1,
+          "maximum": 5,
+          "threshold_low": 4,
+          "threshold_high": 5,
+          "description": "Grounding Quality Score (1-5). Judge the CALL AS A WHOLE, aggregating per-parameter grounding. Use only evidence from the system prompt, prior tool outputs, user messages, and assistant messages; do not appeal to world knowledge.\n\n- **5**: Every parameter is grounded via an explicit source, a documented default, a pick-one from user-provided options, or a specialization inside the user's named set/range/category. Any optional parameter is either omitted or set to a plausible value consistent with the spec.\n  Example: user gives a full identifier and a timeframe; the agent passes them verbatim.\n  Example: user enumerates two acceptable values and the agent picks one of them.\n\n- **4**: Every parameter is grounded, but one value is a mild inference \u2014 e.g. a common default like a count of 1, a minimum/maximum from the spec, or a concrete specialization of an abstract intent where the user's scope clearly allows it.\n\n- **3**: One required parameter is weakly grounded \u2014 a specialization from an abstract intent where the user's scope is ambiguous, or a reasonable but unconfirmed inference. Other parameters are CORRECT.\n\n- **2**: Multiple parameters are weakly grounded or one required value clearly contradicts a documented constraint, a higher-priority source, or falls OUTSIDE the user's explicitly named set (CONTRADICTORY_VALUES).\n\n- **1**: One or more required parameters have no grounding path at all (no explicit evidence, no pick-one, no in-scope specialization, no documented default) \u2014 they are fabricated. Only reach this level when you can state positively that NO grounding path applies."
+        },
+        "confidence": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "threshold_low": 0,
+          "threshold_high": 1,
+          "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
+ }, + "correction": { + "type": "object", + "description": "Corrections to the tool call. If there are no issues, provide an empty object {}. If there are issues, include parameter_issues and optionally a corrected tool_call if a full reconstruction is possible.", + "properties": { + "parameter_issues": { + "type": "array", + "description": "List of parameters with issues. Leave empty if no issues were found.", + "items": { + "type": "object", + "properties": { + "parameter_name": { "type": "string", - "description": "For EACH parameter value, explain its grounding status by citing specific evidence from the conversation history or tool specification. Address whether the value is correctly grounded, hallucinated, needs more information, has format errors, or contradicts other parameters. Include direct quotes that support or contradict the value." - }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "Reflection Quality Score (1-5):\n\n- 5: All values are correct - fully grounded and well-formatted.\n All parameter values are explicitly grounded in the conversation or tool output, and exactly match the expected types, formats, and units.\n Example:\n User: \"Book a hotel in Paris for two adults from August 10th to 12th.\"\n \u2192 Tool call includes: location: \"Paris\", guests: 2, check_in: \"2025-08-10\", check_out: \"2025-08-12\"\n\n- 4: All values are correct based on the conversation history and tool specifications - can be logically inferred (can be implicitly grounded).\n Some parameter values are not directly mentioned but can be logically inferred from the context. No evidence of hallucination.\n Example:\n User: \"I want to book a hotel in Tokyo for me and my wife.\"\n \u2192 Tool call includes: location: \"Tokyo\", guests: 2\n\n- 3: One or more values are probably incorrect or ungrounded - ambiguous or undetermined\n It's unclear whether some parameter values are grounded. They might be inferred, hallucinated, or drawn from partial context, or some parameter values partially match the expected format but are likely to be rejected or misinterpreted by automatic processing.\n Example:\n User: \"Book a hotel in Rome.\"\n \u2192 Tool call includes: location: \"Rome\", guests: 2\n\n- 2: One or more values are probably incorrect - likely hallucinated or incorrect format.\n Several parameter values do not align with the conversation context and are likely hallucinated, or have format errors.\n Example:\n User: \"Find a hotel.\"\n \u2192 Tool call includes: location: \"New York\", guests: 4\n\n- 1: One or more values are clearly incorrect - certainly hallucinated or fabricated.\n Some parameter values are clearly invented, irrelevant, contradict known context, or do not follow the API specifications.\n Example:\n User: \"Remind me to call John.\"\n \u2192 Tool call includes: location: \"Barcelona\", guests: 6, check_in: \"2025-07-01\"" - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." - }, - "correction": { - "type": "object", - "description": "Corrections to the tool call. If there are no issues, provide an empty object {}. 
If there are issues, include parameter_issues and optionally a corrected tool_call if a full reconstruction is possible.", - "properties": { - "parameter_issues": { - "type": "array", - "description": "List of parameters with issues. Leave empty if no issues were found.", - "items": { - "type": "object", - "properties": { - "parameter_name": { - "type": "string", - "description": "Name of the parameter with an issue." - }, - "reason_types": { - "type": "array", - "description": "List of issue categories affecting this parameter.", - "items": { - "type": "string", - "enum": [ - "MISSING_INFORMATION", - "FORMAT_ERROR", - "CONTRADICTORY_VALUES", - "DEFAULT_ISSUE", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Brief explanation of the issue(s) affecting this parameter." - }, - "corrected_value": { - "type": "object", - "description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.", - "additionalProperties": true - } - }, - "required": [] - } - }, - "tool_call": { - "type": "object", - "description": "Optional complete corrected tool call. Include only if all corrected values are available and no values require further user input.", - "properties": { - "name": { - "type": "string", - "description": "Name of the function to call." - }, - "arguments": { - "type": "object", - "description": "Full set of corrected arguments for the function call, including both modified and retained values.", - "additionalProperties": true - } - }, - "required": [] - } - }, - "required": [] - }, - "actionable_recommendations": { + "description": "Name of the parameter with an issue." + }, + "reason_types": { "type": "array", - "description": "Provide clear, actionable recommendations that can be automatically applied to prevent similar issues. Only include recommendations when there are actual issues (output <= 3). Focus on:\n\n1. PARAMETER_DOCUMENTATION: Add or update parameter documentation to clarify requirements, formats, or defaults.\n2. PARAMETER_EXAMPLES: Add concrete examples showing correct parameter usage.\n3. SYSTEM_PROMPT_INSTRUCTION: Add a general instruction to the system prompt (do not mention specific parameter names).", + "description": "List of issue categories affecting this parameter.", "items": { - "type": "object", - "properties": { - "recommendation": { - "type": "string", - "enum": [ - "PARAMETER_DOCUMENTATION", - "PARAMETER_EXAMPLES", - "SYSTEM_PROMPT_INSTRUCTION" - ], - "description": "The type of recommendation being made." - }, - "parameter_name": { - "type": "string", - "description": "The name of the parameter to which the recommendation applies. Omit for SYSTEM_PROMPT_INSTRUCTION." - }, - "details": { - "type": "string", - "description": "Brief explanation of what should be changed and why." - }, - "quote": { - "type": "string", - "description": "Exact text to add to documentation or system prompt. Must be directly usable without modification." - } - }, - "required": [ - "recommendation", - "details", - "quote" - ] + "type": "string", + "enum": [ + "MISSING_INFORMATION", + "FORMAT_ERROR", + "CONTRADICTORY_VALUES", + "DEFAULT_ISSUE", + "OTHER" + ] } + }, + "reasons": { + "type": "string", + "description": "Brief explanation of the issue(s) affecting this parameter." 
+ }, + "corrected_value": { + "type": "object", + "description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.", + "additionalProperties": true + } + }, + "required": [] + } + }, + "tool_call": { + "type": "object", + "description": "Optional complete corrected tool call. Include only if all corrected values are available and no values require further user input.", + "properties": { + "name": { + "type": "string", + "description": "Name of the function to call." + }, + "arguments": { + "type": "object", + "description": "Full set of corrected arguments for the function call, including both modified and retained values.", + "additionalProperties": true } + }, + "required": [] + } + }, + "required": [] + }, + "actionable_recommendations": { + "type": "array", + "description": "List of actionable recommendations as unified-diff patches. Emit only when the call reveals a fixable gap in the agent's prompts or tool specifications. Empty array when there is nothing to fix (including when the call is already correct).", + "items": { + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "enum": [ + "system_prompt", + "tool_description", + "parameter_description", + "parameter_examples" + ], + "description": "Which artifact the diff applies to. `system_prompt` changes the agent's top-level policy; the other three target a specific tool (and possibly one of its parameters)." + }, + "tool_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is tool_description, parameter_description, or parameter_examples. Use the exact function/tool name. Must be null when target is system_prompt." + }, + "parameter_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is parameter_description or parameter_examples. Null otherwise." + }, + "diff": { + "type": "string", + "description": "Unified git-diff-format patch of the proposed change. Format: `--- a/\\n+++ b/\\n@@ ... @@\\n-\\n+`. When there is no line-number context, use `@@` as the single hunk header. Keep diffs minimal \u2014 only the lines that meaningfully change." + }, + "rationale": { + "type": "string", + "description": "One-sentence reason why this change addresses the observed issue. <= 200 chars." + }, + "importance": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "How impactful this recommendation is at preventing similar issues in future calls (0 = cosmetic, 1 = critical). Calibrate consistently across metrics so downstream aggregation is comparable." + } }, "required": [ - "evidence", - "explanation", - "output", - "confidence", - "correction", - "actionable_recommendations" + "target", + "diff", + "rationale", + "importance" ] - }, - "examples": [ + } + } + }, + "required": [ + "evidence", + "explanation", + "output", + "confidence", + "correction", + "actionable_recommendations" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "system", - "content": "You are an helpful assistant." - }, - { - "role": "user", - "content": "Can you show me my profile details?" 
- }, - { - "role": "assistant", - "content": "Logged in as user 55; auth token acquired." - }, - { - "role": "assistant", - "content": "Ready to fetch your profile." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "get_user_profile", - "description": "Retrieves a user's profile details", - "parameters": { - "type": "object", - "properties": { - "user_id": { - "type": "integer", - "description": "The ID of the user whose profile is being retrieved" - } - }, - "required": [ - "user_id" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_user_profile", - "arguments": "{ \"user_id\": 55 }" - } + "role": "system", + "content": "You are an helpful assistant." + }, + { + "role": "user", + "content": "Can you show me my profile details?" + }, + { + "role": "assistant", + "content": "Logged in as user 55; auth token acquired." + }, + { + "role": "assistant", + "content": "Ready to fetch your profile." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "get_user_profile", + "description": "Retrieves a user's profile details", + "parameters": { + "type": "object", + "properties": { + "user_id": { + "type": "integer", + "description": "The ID of the user whose profile is being retrieved" } - }, - "output": { - "evidence": "Assistant message: \"Logged in as user 55; auth token acquired.\"", - "explanation": "The parameter value user_id=55 is directly grounded in the conversation history. The assistant's message explicitly states \"Logged in as user 55\", providing the exact user_id that is used in the tool call. This is a clear example of proper grounding from conversation context.", - "output": 5, - "confidence": 0.99, - "correction": {}, - "actionable_recommendations": [] + }, + "required": [ + "user_id" + ] } + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_user_profile", + "arguments": "{ \"user_id\": 55 }" + } + } + }, + "output": { + "evidence": "Assistant message: \"Logged in as user 55; auth token acquired.\"", + "explanation": "The parameter value user_id=55 is directly grounded in the conversation history. The assistant's message explicitly states \"Logged in as user 55\", providing the exact user_id that is used in the tool call. This is a clear example of proper grounding from conversation context.", + "output": 5, + "confidence": 0.99, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "system", + "content": "You are an helpful assistant." }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "system", - "content": "You are an helpful assistant." - }, - { - "role": "user", - "content": "Can you show me my profile details?" - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "get_user_profile", - "description": "Retrieves a user's profile details", - "parameters": { - "type": "object", - "properties": { - "user_id": { - "type": "integer", - "description": "The ID of the user whose profile is being retrieved" - } - }, - "required": ["user_id"] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_user_profile", - "arguments": "{ \"user_id\": 12345 }" - } - } - }, - "output": { - "evidence": "User never mentioned an ID. Tool spec requires user_id but does not define a default. 
The assistant fabricated '12345'.", - "explanation": "The assistant hallucinated a user_id value with no grounding in conversation or tool specification.", - "output": 1, - "confidence": 1, - "correction": { - "parameter_issues": [ - { - "parameter_name": "user_id", - "reason_types": ["MISSING_INFORMATION"], - "reasons": "No user ID in context or conversation. The assistant invented '12345'.", - "corrected_value": { - "need_more_information": "Please provide your user ID so I can fetch the profile." - } - } - ] - }, - "actionable_recommendations": [ - { - "recommendation": "PARAMETER_DOCUMENTATION", - "parameter_name": "user_id", - "details": "Clarify that user_id must come from conversation or authentication, never assumed.", - "quote": "Required. Must be explicitly provided by the user or obtained from a prior authentication tool call. Never assume or fabricate this value." - }, - { - "recommendation": "SYSTEM_PROMPT_INSTRUCTION", - "details": "Add instruction to never fabricate parameter values.", - "quote": "Never invent or fabricate parameter values. If required information is missing from the conversation, ask the user to provide it before making tool calls." + "role": "user", + "content": "Can you show me my profile details?" + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "get_user_profile", + "description": "Retrieves a user's profile details", + "parameters": { + "type": "object", + "properties": { + "user_id": { + "type": "integer", + "description": "The ID of the user whose profile is being retrieved" } - ] + }, + "required": [ + "user_id" + ] + } + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_user_profile", + "arguments": "{ \"user_id\": 12345 }" + } + } + }, + "output": { + "evidence": "User never mentioned an ID. Tool spec requires user_id but does not define a default. The assistant fabricated '12345'.", + "explanation": "The assistant hallucinated a user_id value with no grounding in conversation or tool specification.", + "output": 1, + "confidence": 1, + "correction": { + "parameter_issues": [ + { + "parameter_name": "user_id", + "reason_types": [ + "MISSING_INFORMATION" + ], + "reasons": "No user ID in context or conversation. The assistant invented '12345'.", + "corrected_value": { + "need_more_information": "Please provide your user ID so I can fetch the profile." } + } + ] + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "system", - "content": "You are a travel booking assistant." - }, - { - "role": "user", - "content": "Book me a flight to Paris on September 15th." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "book_flight", - "description": "Books a flight to a specified destination on a given date.", - "parameters": { - "type": "object", - "properties": { - "destination": { - "type": "string", - "description": "The destination city or airport for the flight." 
- }, - "departure_date": { - "type": "string", - "description": "The date of departure, in YYYY-MM-DD format." - }, - "seat_class": { - "type": "string", - "description": "Seat class: economy, business, or first.", - "enum": ["economy", "business", "first"] - } - }, - "required": ["destination", "departure_date", "seat_class"] - } - } - } - ], - "tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "book_flight", - "arguments": "{ \"destination\": \"Paris\", \"departure_date\": \"2025-09-15\", \"seat_class\": \"first\" }" - } - } - }, - "output": { - "evidence": "User asked for flight to Paris on 2025-09-15 but never specified seat_class. Assistant hallucinated 'first'.", - "explanation": "The destination and date are grounded. The seat_class parameter is hallucinated because the user never mentioned it.", - "output": 1, - "confidence": 0.92, - "correction": { - "parameter_issues": [ - { - "parameter_name": "seat_class", - "reason_types": ["MISSING_INFORMATION"], - "reasons": "Seat class was never provided by the user or tool defaults.", - "corrected_value": { - "need_more_information": "Which seat class would you like: economy, business, or first?" - } - } - ] + "target": "parameter_description", + "tool_name": "get_user_profile", + "parameter_name": "user_id", + "diff": "--- a/tool/get_user_profile#user_id\n+++ b/tool/get_user_profile#user_id\n@@\n+Required. Must be a value explicitly stated by the user or returned by a prior tool call; do not synthesize.", + "rationale": "Prevents the agent from inventing an unsupported value for this required parameter.", + "importance": 0.75 + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "system", + "content": "You are a travel booking assistant." + }, + { + "role": "user", + "content": "Book me a flight to Paris on September 15th." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "book_flight", + "description": "Books a flight to a specified destination on a given date.", + "parameters": { + "type": "object", + "properties": { + "destination": { + "type": "string", + "description": "The destination city or airport for the flight." }, - "actionable_recommendations": [ - { - "recommendation": "PARAMETER_DOCUMENTATION", - "parameter_name": "seat_class", - "details": "Clarify that seat_class is required and has no default value.", - "quote": "Required. Must be explicitly provided by the user. Valid values: economy, business, first. No default value." + "departure_date": { + "type": "string", + "description": "The date of departure, in YYYY-MM-DD format." }, - { - "recommendation": "SYSTEM_PROMPT_INSTRUCTION", - "details": "Add instruction to confirm required parameters before tool calls.", - "quote": "Before making tool calls, ensure all required parameters have been provided by the user. If any required information is missing, ask the user for it." + "seat_class": { + "type": "string", + "description": "Seat class: economy, business, or first.", + "enum": [ + "economy", + "business", + "first" + ] } - ] + }, + "required": [ + "destination", + "departure_date", + "seat_class" + ] + } + } + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "book_flight", + "arguments": "{ \"destination\": \"Paris\", \"departure_date\": \"2025-09-15\", \"seat_class\": \"first\" }" + } + } + }, + "output": { + "evidence": "User asked for flight to Paris on 2025-09-15 but never specified seat_class. 
Assistant hallucinated 'first'.", + "explanation": "The destination and date are grounded. The seat_class parameter is hallucinated because the user never mentioned it.", + "output": 1, + "confidence": 0.92, + "correction": { + "parameter_issues": [ + { + "parameter_name": "seat_class", + "reason_types": [ + "MISSING_INFORMATION" + ], + "reasons": "Seat class was never provided by the user or tool defaults.", + "corrected_value": { + "need_more_information": "Which seat class would you like: economy, business, or first?" } + } + ] + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "system", - "content": "You are a flight booking assistant." - }, - { - "role": "user", - "content": "I want to fly to Rome on 15/09/2025 in economy class." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "book_flight", - "description": "Books a flight to a specified destination on a given date.", - "parameters": { - "type": "object", - "properties": { - "destination": { "type": "string" }, - "departure_date": { "type": "string", "description": "Date in YYYY-MM-DD format" }, - "seat_class": { - "type": "string", - "enum": ["economy", "business", "first"] - } - }, - "required": ["destination", "departure_date", "seat_class"] - } - } - } - ], - "tool_call": { - "id": "call_003", - "type": "function", - "function": { - "name": "book_flight", - "arguments": "{ \"destination\": \"Rome\", \"departure_date\": \"15/09/2025\", \"seat_class\": \"economy\" }" - } - } - }, - "output": { - "evidence": "User provided date as '15/09/2025'. Tool spec requires YYYY-MM-DD. Assistant forwarded incorrect format directly.", - "explanation": "All parameters are grounded, but departure_date is in the wrong format.", - "output": 2, - "confidence": 0.95, - "correction": { - "parameter_issues": [ - { - "parameter_name": "departure_date", - "reason_types": ["FORMAT_ERROR"], - "reasons": "Expected 'YYYY-MM-DD'. Received '15/09/2025'.", - "corrected_value": { "departure_date": "2025-09-15" } - } - ] - }, - "actionable_recommendations": [ - { - "recommendation": "PARAMETER_DOCUMENTATION", - "parameter_name": "departure_date", - "details": "Specify exact date format requirement.", - "quote": "Required format: YYYY-MM-DD (e.g., 2025-09-15). Always convert user-provided dates to this format before making the tool call." + "target": "parameter_description", + "tool_name": "book_flight", + "parameter_name": "seat_class", + "diff": "--- a/tool/book_flight#seat_class\n+++ b/tool/book_flight#seat_class\n@@\n+Required. Must be a value explicitly stated by the user or returned by a prior tool call; do not synthesize.", + "rationale": "Prevents the agent from inventing an unsupported value for this required parameter.", + "importance": 0.75 + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "system", + "content": "You are a flight booking assistant." + }, + { + "role": "user", + "content": "I want to fly to Rome on 15/09/2025 in economy class." 
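
The correction this example arrives at below is plain date canonicalization. A minimal sketch, assuming the user's '15/09/2025' is DD/MM/YYYY:

from datetime import datetime

# Assumed DD/MM/YYYY input; the corrected_value below expects the ISO form.
iso = datetime.strptime("15/09/2025", "%d/%m/%Y").date().isoformat()
assert iso == "2025-09-15"
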
+ } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "book_flight", + "description": "Books a flight to a specified destination on a given date.", + "parameters": { + "type": "object", + "properties": { + "destination": { + "type": "string" }, - { - "recommendation": "PARAMETER_EXAMPLES", - "parameter_name": "departure_date", - "details": "Show example of date format conversion.", - "quote": "Example: User says '15/09/2025' → convert to '2025-09-15' before calling the tool." + "departure_date": { + "type": "string", + "description": "Date in YYYY-MM-DD format" }, - { - "recommendation": "SYSTEM_PROMPT_INSTRUCTION", - "details": "Add instruction to convert dates to required formats.", - "quote": "Always convert dates and times to the exact format specified in the API documentation before making tool calls, regardless of how the user provides them." + "seat_class": { + "type": "string", + "enum": [ + "economy", + "business", + "first" + ] } - ] + }, + "required": [ + "destination", + "departure_date", + "seat_class" + ] } + } } - ] - }, - { - "name": "general_value_format_alignment", - "task_description": "Evaluate whether ALL parameter values in the function call conform to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter values. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ALL parameter values in the tool call to ensure they meet the exact type, format, and unit requirements. Consider the collective format compliance of the entire function call.\n---\n\n### 1. Data Type Compliance\n- Check that each value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2. Format Specification Compliance\n- Validate adherence to documented format constraints for each parameter, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. 
Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility between parameter values in the same call\n- Check that parameter relationships don't violate constraints\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: All parameter values exactly match required types, formats, and units\n- **4 - Minor Deviation**: All values are fundamentally correct but some have minor formatting issues that are still likely to be accepted by the API or parser unless strict validation is enforced\n- **3 - Moderate Deviation**: Some parameter types or formats are partially incorrect; may cause issues\n- **2 - Major Deviation**: Multiple parameters have significant type and format violations; likely to fail\n- **1 - Complete Mismatch**: One or more parameter values are entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and values seem plausible, prefer a higher score (4-5)\n- If the spec is clear and values deviate, assign a lower score (1-3)\n- Consider the cumulative impact of all format issues across parameters\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.", - "jsonschema": { - "title": "general_value_format_alignment", - "description": "Assessment of all parameter values' compliance with required type, format, and unit specifications, based on the rubric above.", - "type": "object", - "additionalProperties": false, - "properties": { - "evidence": { - "type": "string", - "description": "Quote the specification's type/format definitions for each parameter and include the actual parameter values provided. Cite EXACT text from the tool specification that supports your judgment." - }, - "explanation": { + ], + "tool_call": { + "id": "call_003", + "type": "function", + "function": { + "name": "book_flight", + "arguments": "{ \"destination\": \"Rome\", \"departure_date\": \"15/09/2025\", \"seat_class\": \"economy\" }" + } + } + }, + "output": { + "evidence": "User provided date as '15/09/2025'. Tool spec requires YYYY-MM-DD. Assistant forwarded incorrect format directly.", + "explanation": "All parameters are grounded, but departure_date is in the wrong format.", + "output": 2, + "confidence": 0.95, + "correction": { + "parameter_issues": [ + { + "parameter_name": "departure_date", + "reason_types": [ + "FORMAT_ERROR" + ], + "reasons": "Expected 'YYYY-MM-DD'. Received '15/09/2025'.", + "corrected_value": { + "departure_date": "2025-09-15" + } + } + ] + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 + }, + { + "target": "parameter_description", + "tool_name": "book_flight", + "parameter_name": "departure_date", + "diff": "--- a/tool/book_flight#departure_date\n+++ b/tool/book_flight#departure_date\n@@\n+Required. 
Must be a value explicitly stated by the user or returned by a prior tool call; do not synthesize.", + "rationale": "Prevents the agent from inventing an unsupported value for this required parameter.", + "importance": 0.75 + } + ] + } + } + ] + }, + { + "name": "general_value_format_alignment", + "task_description": "Evaluate whether every parameter value in the call conforms to the type, format, and unit requirements defined in the API specification. Format compliance applies ONLY to values that are actually present; spec-optional params (not in `required`) that are omitted are NOT format errors. Format is independent of value correctness \u2014 a semantically wrong value with the right type/format is NOT a format issue here (it belongs to the grounding metrics). This rubric cares only about whether the serialized value matches the spec's type/format/pattern/unit.\n\n---\n\n### 1. Data Type Compliance\n- Each value matches the required JSON type (string, integer, number, boolean, object, array).\n- Numbers are numeric, booleans are true/false literals (not quoted strings) when the type is boolean.\n\n### 2. Format Specification Compliance\n- Dates/times, currency, patterns (phone, email, postal code) respect the spec. Regex/`format`/`pattern` constraints are honored.\n\n### 3. Unit Compliance\n- Required unit suffixes/prefixes are present; forbidden ones are absent; unit families (e.g. Celsius vs Fahrenheit) match the spec.\n\n### 4. Cross-Parameter Consistency\n- end_date >= start_date; paired values use consistent scales/units.\n\n---\n\n### Rating Scale\n- **5**: all values match types/formats/units exactly.\n- **4**: fundamentally correct, only minor formatting quirks a lax parser accepts.\n- **3**: partial type/format issues that may cause downstream errors.\n- **2**: multiple significant type/format violations.\n- **1**: at least one value is fundamentally incompatible with the spec.\n\n---\n\n### Final Rule\nIf the spec is ambiguous and values are plausible, prefer 4-5. If the spec is clear and values deviate, assign 1-3 per the severity of the deviation.\n\n---\n\n#### Stringency\n\nApply this rubric strictly for REQUIRED parameters whose value is missing, fabricated, or inconsistent with the spec. Apply it leniently for OPTIONAL parameters (not in the spec's `required` array) and for values reasonably inferable from visible context. A value is NOT an issue unless it contradicts explicit user intent, a documented constraint, or higher-priority evidence.\n\n---\n\n#### Actionable Recommendations\n\nWhen the call reveals a fixable gap in the agent's prompts or tool specifications, emit one or more `actionable_recommendations` as unified-diff patches. Each recommendation specifies: `target` (which artifact to change: `system_prompt`, `tool_description`, `parameter_description`, `parameter_examples`), an optional `tool_name` / `parameter_name` for tool-scoped targets, a `diff` in unified-diff format, a short `rationale`, and an `importance` score in [0, 1] (0 = cosmetic, 1 = critical). 
When the call has no fixable artifact gap \u2014 or the call is already correct \u2014 return an empty array.", + "jsonschema": { + "title": "general_value_format_alignment", + "description": "Assessment of all parameter values' compliance with required type, format, and unit specifications, based on the rubric above.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Quote the specification's type/format definitions for each parameter and include the actual parameter values provided. Cite EXACT text from the tool specification that supports your judgment." + }, + "explanation": { + "type": "string", + "description": "For each parameter value, explain why it conforms or does not conform to the specification's requirements. Address data type, format, units, and any pattern or constraint violations. For incorrect values, describe exactly what is wrong and how it affects the overall tool call." + }, + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 4, + "threshold_high": 5, + "description": "Type/Format/Unit Alignment Score (1-5). Judge ONLY the serialized representation of present values against the spec's type/format/pattern/unit. A semantically wrong value with a correct format is NOT flagged here.\n\n- **5**: Every present value matches its type, format, pattern, and unit exactly; missing optional parameters are not format errors.\n- **4**: All values are fundamentally correct with at most a minor formatting quirk a lax parser accepts (e.g. missing leading zero).\n- **3**: One value has a partial format issue that may cause a strict parser to reject or misinterpret it.\n- **2**: Multiple significant format violations (wrong type, wrong unit family, wrong date/pattern shape) likely to fail at the API boundary.\n- **1**: At least one value is fundamentally incompatible with the spec (e.g. a free-form string where the spec demands a boolean; an incoherent pattern).\n\nPrefer 4-5 when the spec is ambiguous and values are plausible. Prefer 1-3 when the spec is clear and values deviate; rate by severity." + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." + }, + "correction": { + "type": "object", + "description": "If output >= 4, return {}. If output <= 3, provide parameter_issues and a corrected tool_call.", + "properties": { + "parameter_issues": { + "type": "array", + "description": "List of parameters with format issues. Leave empty if no issues were found.", + "items": { + "type": "object", + "properties": { + "parameter_name": { "type": "string", - "description": "For each parameter value, explain why it conforms or does not conform to the specification's requirements. Address data type, format, units, and any pattern or constraint violations. For incorrect values, describe exactly what is wrong and how it affects the overall tool call." 
- }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "An integer from 1 to 5 indicating how well all parameter values conform to the required types, formats, and units as defined in the API specification.\n\nScore meanings:\n\n5 - Perfect Alignment: All parameter values exactly match the expected types, formats, and units. Example: all dates in 'YYYY-MM-DD', all numbers as integers, all strings properly formatted.\n\n4 - Minor Deviation: All values are fundamentally correct but some have small formatting issues (e.g., missing leading zeros) that are still likely to be accepted by the API or parser unless strict validation is enforced. Example: expected '2025-08-05', value is '2025-8-5'.\n\n3 - Moderate Deviation: Some parameter values partially match the expected format but are likely to be rejected or misinterpreted by automatic processing. Example: mix of correct and incorrectly formatted dates.\n\n2 - Major Deviation: Multiple parameter values significantly violate the expected type, format, or unit and are very likely to fail. Example: wrong data types, completely wrong formats.\n\n1 - Complete Mismatch: One or more parameter values are entirely incompatible with the required format or type. Example: expected boolean, value is 'maybe'; expected date, value is random text." - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." - }, - "correction": { - "type": "object", - "description": "If output >= 4, return {}. If output <= 3, provide parameter_issues and a corrected tool_call.", - "properties": { - "parameter_issues": { - "type": "array", - "description": "List of parameters with format issues. Leave empty if no issues were found.", - "items": { - "type": "object", - "properties": { - "parameter_name": { - "type": "string", - "description": "Name of the parameter with a format issue." - }, - "reason_types": { - "type": "array", - "description": "List of format issues identified. Use one or more of: TYPE_ERROR, FORMAT_ERROR, UNIT_ERROR, PATTERN_ERROR, CONSISTENCY_ERROR, OTHER.", - "items": { - "type": "string", - "enum": [ - "TYPE_ERROR", - "FORMAT_ERROR", - "UNIT_ERROR", - "PATTERN_ERROR", - "CONSISTENCY_ERROR", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Short explanation of the specific issue(s) with the parameter's format, type, or unit." - }, - "corrected_value": { - "type": "object", - "description": "An object containing the corrected parameter value in the form: { \"\": }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.", - "additionalProperties": true - } - }, - "required": [ - "parameter_name", - "reason_types", - "reasons", - "corrected_value" - ] - } - }, - "tool_call": { - "type": "object", - "description": "Complete corrected tool call with all format issues fixed, including both modified and retained parameter values. 
Must be included if a full reconstruction is possible.", - "properties": { - "name": { - "type": "string", - "description": "Name of the function to call." - }, - "arguments": { - "type": "object", - "description": "Set of corrected arguments for the function call, with all format issues resolved.", - "additionalProperties": true - } - }, - "required": [ - "name", - "arguments" - ] - } - }, - "required": [] - }, - "actionable_recommendations": { + "description": "Name of the parameter with a format issue." + }, + "reason_types": { "type": "array", - "description": "Provide actionable suggestions for avoiding format errors in future tool calls, only if there are issues. Address root causes like unclear format specs, missing examples, or lack of validation. Include suggestions such as:\n\n1. PARAMETER_FORMAT_DOCUMENTATION: Clarify or expand parameter format expectations in the API documentation.\n2. PARAMETER_EXAMPLES: Add example values with correct formatting in the parameter documentation.\n3. PARAMETER_VALIDATION: Introduce regex or rule-based format validation in the API to catch errors before tool calls.\n4. PARAMETER_CONVERSION: Add conversion helpers for user input to ensure correct formatting.\n5. UNIT_STANDARDS: Standardize unit expectations and flag missing or incorrect units in the tools implementation.\n6. PARAMETER_NAMING_CONVENTIONS: Suggest clearer parameter names that indicate expected formats or units.\n7. SYSTEM_PROMPT_ADDITIONS: Recommend specific additions to the system prompt that would help the agent avoid similar format issues in the future.\n8. PRE_CALL_REFLECTION_STRATEGIES: Propose strategies for the agent to reflect on parameter values before making tool calls, such as checking format compliance.\n9. PARAMETER_SOURCE_TRACKING: Suggest implementing a mechanism to track where each parameter value originated from to improve transparency and debugging.\n10. CLARIFICATION_PROMPTS: Recommend adding clarification prompts when parameter values are ambiguous or missing expected formats.\n11. OTHER: Any other specific recommendations that would help prevent similar format issues in the future.", + "description": "List of format issues identified. Use one or more of: TYPE_ERROR, FORMAT_ERROR, UNIT_ERROR, PATTERN_ERROR, CONSISTENCY_ERROR, OTHER.", "items": { - "type": "object", - "properties": { - "recommendation": { - "type": "string", - "description": "A specific, actionable recommendation to improve the agent's parameter formatting process.", - "enum": [ - "PARAMETER_FORMAT_DOCUMENTATION", - "PARAMETER_EXAMPLES", - "PARAMETER_VALIDATION", - "PARAMETER_CONVERSION", - "UNIT_STANDARDS", - "PARAMETER_NAMING_CONVENTIONS", - "SYSTEM_PROMPT_ADDITIONS", - "PRE_CALL_REFLECTION_STRATEGIES", - "PARAMETER_SOURCE_TRACKING", - "CLARIFICATION_PROMPTS", - "OTHER" - ] - }, - "parameter_name": { - "type": "string", - "description": "The name of the parameter to which the recommendation applies." - }, - "details": { - "type": "string", - "description": "Detailed explanation of the recommendation, including what specific changes should be made, how they will improve parameter formatting, and any relevant examples or best practices." - }, - "quote": { - "type": "string", - "description": "The specific quote of the additions made to the documentation, examples, or instructions." 
- } - }, - "required": [ - "recommendation", - "parameter_name", - "details", - "quote" - ] + "type": "string", + "enum": [ + "TYPE_ERROR", + "FORMAT_ERROR", + "UNIT_ERROR", + "PATTERN_ERROR", + "CONSISTENCY_ERROR", + "OTHER" + ] } + }, + "reasons": { + "type": "string", + "description": "Short explanation of the specific issue(s) with the parameter's format, type, or unit." + }, + "corrected_value": { + "type": "object", + "description": "An object containing the corrected parameter value in the form: { \"\": }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.", + "additionalProperties": true + } + }, + "required": [ + "parameter_name", + "reason_types", + "reasons", + "corrected_value" + ] + } + }, + "tool_call": { + "type": "object", + "description": "Complete corrected tool call with all format issues fixed, including both modified and retained parameter values. Must be included if a full reconstruction is possible.", + "properties": { + "name": { + "type": "string", + "description": "Name of the function to call." + }, + "arguments": { + "type": "object", + "description": "Set of corrected arguments for the function call, with all format issues resolved.", + "additionalProperties": true } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [] + }, + "actionable_recommendations": { + "type": "array", + "description": "List of actionable recommendations as unified-diff patches. Emit only when the call reveals a fixable gap in the agent's prompts or tool specifications. Empty array when there is nothing to fix (including when the call is already correct).", + "items": { + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "enum": [ + "system_prompt", + "tool_description", + "parameter_description", + "parameter_examples" + ], + "description": "Which artifact the diff applies to. `system_prompt` changes the agent's top-level policy; the other three target a specific tool (and possibly one of its parameters)." + }, + "tool_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is tool_description, parameter_description, or parameter_examples. Use the exact function/tool name. Must be null when target is system_prompt." + }, + "parameter_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is parameter_description or parameter_examples. Null otherwise." + }, + "diff": { + "type": "string", + "description": "Unified git-diff-format patch of the proposed change. Format: `--- a/\\n+++ b/\\n@@ ... @@\\n-\\n+`. When there is no line-number context, use `@@` as the single hunk header. Keep diffs minimal \u2014 only the lines that meaningfully change." + }, + "rationale": { + "type": "string", + "description": "One-sentence reason why this change addresses the observed issue. <= 200 chars." + }, + "importance": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "How impactful this recommendation is at preventing similar issues in future calls (0 = cosmetic, 1 = critical). Calibrate consistently across metrics so downstream aggregation is comparable." 
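[Editor's note on the new `actionable_recommendations` schema above: the `tool_name` / `parameter_name` requirements are stated only in the description strings, so JSON Schema validation alone will not reject, say, a `system_prompt` recommendation that carries a `tool_name`. A minimal sketch of the kind of post-validation check a consumer could run; the function name and error messages are illustrative, not part of ALTK:

    # Sketch: enforce the cross-field rules the schema states only in prose.
    TOOL_SCOPED = {"tool_description", "parameter_description", "parameter_examples"}
    PARAM_SCOPED = {"parameter_description", "parameter_examples"}

    def check_recommendation(rec: dict) -> list[str]:
        errors = []
        target = rec.get("target")
        if target == "system_prompt" and rec.get("tool_name") is not None:
            errors.append("tool_name must be null when target is system_prompt")
        if target in TOOL_SCOPED and not rec.get("tool_name"):
            errors.append(f"tool_name is required when target is {target}")
        if target in PARAM_SCOPED and not rec.get("parameter_name"):
            errors.append(f"parameter_name is required when target is {target}")
        importance = rec.get("importance")
        if not isinstance(importance, (int, float)) or not 0 <= importance <= 1:
            errors.append("importance must be a number in [0, 1]")
        return errors

An empty list means the recommendation is structurally usable downstream.]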
+ } }, "required": [ - "evidence", - "explanation", - "output", - "confidence", - "correction", - "actionable_recommendations" + "target", + "diff", + "rationale", + "importance" ] + } + } + }, + "required": [ + "evidence", + "explanation", + "output", + "confidence", + "correction", + "actionable_recommendations" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Book a flight to Paris on 2025-08-15 for 2 adults in economy class." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "book_flight", + "description": "Books a flight with specified parameters", + "parameters": { + "type": "object", + "properties": { + "destination": { + "type": "string", + "description": "Destination city" + }, + "departure_date": { + "type": "string", + "format": "date", + "description": "Date in YYYY-MM-DD format" + }, + "passengers": { + "type": "integer", + "description": "Number of passengers" + }, + "class": { + "type": "string", + "enum": [ + "economy", + "business", + "first" + ], + "description": "Flight class" + } + }, + "required": [ + "destination", + "departure_date", + "passengers", + "class" + ] + } + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "book_flight", + "arguments": "{\"destination\": \"Paris\", \"departure_date\": \"2025-08-15\", \"passengers\": 2, \"class\": \"economy\"}" + } + } }, - "examples": [ + "output": { + "evidence": "Tool spec requires: destination (string), departure_date (string, YYYY-MM-DD format), passengers (integer), class (enum: economy/business/first). Values provided: 'Paris' (string), '2025-08-15' (string, YYYY-MM-DD), 2 (integer), 'economy' (valid enum).", + "explanation": "All parameter values perfectly conform to their specifications: destination is a proper string, departure_date follows the exact YYYY-MM-DD format, passengers is provided as an integer (not string), and class matches one of the required enum values. No format issues detected.", + "output": 5, + "confidence": 0.98, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Set a timer for 5 minutes and schedule a meeting for tomorrow at 3 PM." + } + ], + "tool_inventory": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Book a flight to Paris on 2025-08-15 for 2 adults in economy class." 
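[Editor's note: the book_flight example above is the happy path for this metric. For the failure modes the rubric enumerates (wrong date format, stringified integers, out-of-enum values), a pre-call check along these lines is what the scoring text has in mind. The helper below is a sketch against the example's spec, not ALTK code; note that `strptime` is lenient about leading zeros, so a score-4 value like '2025-8-5' still parses unless you add a stricter pattern check:

    import json
    from datetime import datetime

    def format_issues(arguments: str) -> dict[str, str]:
        # Tool-call arguments arrive as a JSON-encoded string in these examples.
        args = json.loads(arguments)
        issues = {}
        try:
            datetime.strptime(args.get("departure_date", ""), "%Y-%m-%d")
        except ValueError:
            issues["departure_date"] = "expected YYYY-MM-DD"
        if not isinstance(args.get("passengers"), int):
            issues["passengers"] = "expected an integer, not a string"
        if args.get("class") not in {"economy", "business", "first"}:
            issues["class"] = "expected one of economy, business, first"
        return issues  # empty dict corresponds to the score-5 case above
]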
- } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "book_flight", - "description": "Books a flight with specified parameters", - "parameters": { - "type": "object", - "properties": { - "destination": { - "type": "string", - "description": "Destination city" - }, - "departure_date": { - "type": "string", - "format": "date", - "description": "Date in YYYY-MM-DD format" - }, - "passengers": { - "type": "integer", - "description": "Number of passengers" - }, - "class": { - "type": "string", - "enum": [ - "economy", - "business", - "first" - ], - "description": "Flight class" - } - }, - "required": [ - "destination", - "departure_date", - "passengers", - "class" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "book_flight", - "arguments": "{\"destination\": \"Paris\", \"departure_date\": \"2025-08-15\", \"passengers\": 2, \"class\": \"economy\"}" - } + "type": "function", + "function": { + "name": "set_timer_and_meeting", + "description": "Sets a timer and schedules a meeting", + "parameters": { + "type": "object", + "properties": { + "timer_duration": { + "type": "string", + "description": "Duration in format 'number unit' (e.g., '5 minutes')" + }, + "meeting_date": { + "type": "string", + "description": "Date in YYYY-MM-DD format" + }, + "meeting_time": { + "type": "string", + "description": "Time in HH:MM 24-hour format" } - }, - "output": { - "evidence": "Tool spec requires: destination (string), departure_date (string, YYYY-MM-DD format), passengers (integer), class (enum: economy/business/first). Values provided: 'Paris' (string), '2025-08-15' (string, YYYY-MM-DD), 2 (integer), 'economy' (valid enum).", - "explanation": "All parameter values perfectly conform to their specifications: destination is a proper string, departure_date follows the exact YYYY-MM-DD format, passengers is provided as an integer (not string), and class matches one of the required enum values. No format issues detected.", - "output": 5, - "confidence": 0.98, - "correction": {}, - "actionable_recommendations": [] + }, + "required": [ + "timer_duration", + "meeting_date", + "meeting_time" + ] } + } + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "set_timer_and_meeting", + "arguments": "{\"timer_duration\": \"300\", \"meeting_date\": \"tomorrow\", \"meeting_time\": \"3 PM\"}" + } + } + }, + "output": { + "evidence": "Tool spec requires: timer_duration (string matching pattern '^\\d+\\s+(minutes?|seconds?|hours?)$'), meeting_date (string, YYYY-MM-DD format), meeting_time (string, HH:MM 24-hour format). Values provided: '300' (no unit, violates pattern), 'tomorrow' (not YYYY-MM-DD), '3 PM' (not 24-hour format).", + "explanation": "All three parameters have significant format violations: timer_duration '300' lacks the required unit and space (should be '5 minutes'), meeting_date 'tomorrow' doesn't follow YYYY-MM-DD format (should be '2025-08-07'), and meeting_time '3 PM' uses 12-hour format instead of required 24-hour HH:MM (should be '15:00'). 
These errors will likely cause API rejection.", + "output": 2, + "confidence": 0.92, + "correction": { + "parameter_issues": [ + { + "parameter_name": "timer_duration", + "reason_types": [ + "FORMAT_ERROR", + "UNIT_ERROR" + ], + "reasons": "Value '300' lacks required unit and space format, should be '5 minutes'", + "corrected_value": { + "timer_duration": "5 minutes" + } + }, + { + "parameter_name": "meeting_date", + "reason_types": [ + "FORMAT_ERROR" + ], + "reasons": "Value 'tomorrow' doesn't follow required YYYY-MM-DD format", + "corrected_value": { + "meeting_date": "2025-08-07" + } + }, + { + "parameter_name": "meeting_time", + "reason_types": [ + "FORMAT_ERROR" + ], + "reasons": "Value '3 PM' uses 12-hour format instead of required 24-hour HH:MM", + "corrected_value": { + "meeting_time": "15:00" + } + } + ], + "tool_call": { + "name": "set_timer_and_meeting", + "arguments": { + "timer_duration": "5 minutes", + "meeting_date": "2025-08-07", + "meeting_time": "15:00" + } + } + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Set a timer for 5 minutes and schedule a meeting for tomorrow at 3 PM." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "set_timer_and_meeting", - "description": "Sets a timer and schedules a meeting", - "parameters": { - "type": "object", - "properties": { - "timer_duration": { - "type": "string", - "description": "Duration in format 'number unit' (e.g., '5 minutes')" - }, - "meeting_date": { - "type": "string", - "description": "Date in YYYY-MM-DD format" - }, - "meeting_time": { - "type": "string", - "description": "Time in HH:MM 24-hour format" - } - }, - "required": [ - "timer_duration", - "meeting_date", - "meeting_time" - ] - } - } - } - ], - "tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "set_timer_and_meeting", - "arguments": "{\"timer_duration\": \"300\", \"meeting_date\": \"tomorrow\", \"meeting_time\": \"3 PM\"}" - } - } + "target": "parameter_description", + "tool_name": "set_timer_and_meeting", + "parameter_name": "timer_duration", + "diff": "--- a/tool/set_timer_and_meeting#timer_duration\n+++ b/tool/set_timer_and_meeting#timer_duration\n@@\n+Required. Must be a value explicitly stated by the user or returned by a prior tool call; do not synthesize.", + "rationale": "Prevents the agent from inventing an unsupported value for this required parameter.", + "importance": 0.75 + } + ] + } + } + ] + }, + { + "name": "general_conversation_grounded_correctness", + "task_description": "Evaluate the overall correctness and appropriateness of the tool call based SOLELY on the conversation context, WITHOUT requiring explicit tool specifications. If tool specifications appear in the conversation (e.g. in system prompts or assistant messages), you MAY use them; otherwise reason from the conversation alone.\n\n---\n\n### 1. 
Tool Call Appropriateness\n\n- **Intent Alignment**: does it address the user's request or a logical next step (including preliminary probes and exploratory lookups that precede a later mutating action)?\n- **Relevance**: is the chosen function plausibly useful for the current context?\n\n### 2. Conversation Grounding\n\nParameter values should be grounded in this priority order: system prompt > tool outputs > explicit user statements > confirmed assistant statements > reasonable inference. Tool outputs and system-prompt facts (policy, current date/year, identity) override conflicting assistant statements. Recognize these grounding paths:\n\n- **Explicit** \u2014 value is stated verbatim in a higher-priority source.\n- **Pick-one** \u2014 user enumerated several acceptable values; the agent picked one of them.\n- **Specialization** \u2014 user named a set/range/category; the agent picked a concrete value INSIDE that scope. Flag only when the concrete value lands outside the user's named scope.\n- **Reasonable inference** \u2014 a value that a careful reader of the conversation would derive without surprise (e.g. typical defaults like a party size of 1).\n\n### 3. Prerequisites\n\n- Explicit prerequisites the policy or prior tool messages document (e.g. authentication acquired, order id fetched before a mutation) must be satisfied or be the subject of this call itself.\n- Do NOT fabricate prerequisites not documented in the conversation.\n\n---\n\n### Rating Scale\n- **5**: fully appropriate, well-grounded, prerequisites satisfied (or this call is itself gathering a prerequisite).\n- **4**: well-grounded with only minor reasonable inferences.\n- **3**: generally appropriate but with weak grounding on some parameters or questionable timing.\n- **2**: multiple ungrounded parameters or a relevance issue.\n- **1**: clearly inappropriate, contradicts the conversation, or contains fabricated parameter values.\n\n---\n\n#### Required-Parameter Omission\n\nA parameter listed in the spec's `required` array that is completely absent from the arguments is ALWAYS MISSING_INFORMATION \u2014 no grounding path (explicit statement, documented default, pick-one, specialization, or plausible-optional) substitutes for actually providing a value. The grounding paths only apply to values the agent ACTUALLY PROVIDED; they never excuse omission of a required parameter.\n\n#### User-Context Discarded\n\nWhen a parameter is spec-optional but the user (or prior tool output / system prompt) provided an explicit narrowing value for it \u2014 a search term, a filter, a category, an identifier \u2014 and the agent OMITS the parameter or sets it to an \"empty\" sentinel (empty string, null, default wildcard), this is NOT a \"plausible optional value\". It is USER_CONTEXT_DISCARDED and should be labeled CONTRADICTORY_VALUES (the call contradicts explicit user intent). Score the call 2-3 accordingly. This rule applies regardless of whether the spec marks the parameter as required; it turns on whether the user supplied narrowing context for it.\n\n---\n\n#### Stringency\n\nApply a lenient bar when the call is information-gathering or exploratory (reads, lookups, probes) that is plausibly on-path. 
Apply a strict bar to the grounding of REQUIRED parameter values that drive a mutating action \u2014 those must be explicitly supported by the conversation, a prior tool output, a system-prompt anchor, or a documented default.\n\n#### Confirmation Scope\n\nExplicit user confirmation is required only before actions that change persistent state on behalf of the user (creating, modifying, or deleting records; sending messages; committing transactions). Inspect-only or information-gathering calls \u2014 reads, lookups, searches, descriptions, status checks \u2014 do NOT require prior user confirmation, even when a later mutating action in the same flow will.\n\n---\n\n#### Actionable Recommendations\n\nWhen the call reveals a fixable gap in the agent's prompts or tool specifications, emit one or more `actionable_recommendations` as unified-diff patches. Each recommendation specifies: `target` (which artifact to change: `system_prompt`, `tool_description`, `parameter_description`, `parameter_examples`), an optional `tool_name` / `parameter_name` for tool-scoped targets, a `diff` in unified-diff format, a short `rationale`, and an `importance` score in [0, 1] (0 = cosmetic, 1 = critical). When the call has no fixable artifact gap \u2014 or the call is already correct \u2014 return an empty array.", + "jsonschema": { + "title": "general_conversation_grounded_correctness", + "description": "Holistic assessment of tool call correctness based on conversation grounding, appropriateness, and logical consistency without requiring explicit tool specifications.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Provide EXACT quotes from the conversation that support or contradict the tool call. Include: (1) relevant user messages, (2) assistant messages, (3) prior tool outputs, and (4) any tool specifications mentioned in the conversation. Organize evidence by parameter and by aspect (appropriateness, grounding, prerequisites)." + }, + "explanation": { + "type": "string", + "description": "Provide an analysis covering: (1) Tool Call Appropriateness - why this tool call makes sense (or doesn't) in the conversation flow, (2) Parameter Grounding - for EACH parameter, explain its grounding source and quality, noting conflicts between sources and resolution, (3) Prerequisites - whether all necessary preconditions are satisfied, (4) Format and Consistency - whether parameters make logical sense together, (5) Overall Correctness - summary judgment on whether the tool call is fully justified by the conversation." + }, + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 4, + "threshold_high": 5, + "description": "Overall Call Correctness Score (1-5), without requiring explicit tool specs. Combine appropriateness (intent fit, on-trajectory sequencing) and grounding of present values (see the grounding paths above). 
Mid-trajectory probes plausibly on-path are NOT penalized for not, by themselves, completing the user's full goal.\n\n- **5**: Right action at the right time; every required value is grounded via an explicit source, a documented default, a pick-one, or an in-scope specialization; no redundancy; any documented prerequisite is satisfied or this call is itself gathering one.\n- **4**: Appropriate and timely, with most values explicitly grounded and a small number of in-scope inferences or abstract specializations the user clearly allowed.\n- **3**: Generally appropriate but with one weakly grounded required value or minor timing concern. The call might succeed but could reasonably be questioned by a careful reader.\n- **2**: Multiple weakly grounded required values OR a relevance issue (call appears off-path for the stated goal even after accounting for exploration).\n- **1**: Clearly inappropriate, redundant (SAME function + SAME arguments), fundamentally misaligned with the conversation, or contains fabricated required values (no grounding path applies)." + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Higher confidence when conversation context is clear and complete. Lower confidence when context is ambiguous, conversation is incomplete, or tool specifications are not available." + }, + "correction": { + "type": "object", + "description": "Corrections for the tool call. For correct calls (output >= 4), return {}. For problematic calls (output <= 3), provide either need_more_information OR corrected_tool_call.\n\nCRITICAL FORMAT REQUIREMENT: When providing corrected_tool_call, the 'arguments' field MUST be a JSON object/dict (e.g., {\"key\": \"value\"}), NOT a JSON string (e.g., NOT \"{\\\"key\\\": \\\"value\\\"}\").", + "properties": { + "need_more_information": { + "type": "array", + "description": "List of questions to ask the user if information is missing. Each question should be specific and actionable.", + "items": { + "type": "string" + } + }, + "corrected_tool_call": { + "type": "object", + "description": "Complete corrected tool call with all issues addressed. Only provide if correction is possible without additional user input. IMPORTANT: The 'arguments' field must be a JSON object (dict), not a JSON string.", + "properties": { + "name": { + "type": "string", + "description": "Name of the function to call." }, - "output": { - "evidence": "Tool spec requires: timer_duration (string matching pattern '^\\d+\\s+(minutes?|seconds?|hours?)$'), meeting_date (string, YYYY-MM-DD format), meeting_time (string, HH:MM 24-hour format). Values provided: '300' (no unit, violates pattern), 'tomorrow' (not YYYY-MM-DD), '3 PM' (not 24-hour format).", - "explanation": "All three parameters have significant format violations: timer_duration '300' lacks the required unit and space (should be '5 minutes'), meeting_date 'tomorrow' doesn't follow YYYY-MM-DD format (should be '2025-08-07'), and meeting_time '3 PM' uses 12-hour format instead of required 24-hour HH:MM (should be '15:00'). 
These errors will likely cause API rejection.", - "output": 2, - "confidence": 0.92, - "correction": { - "parameter_issues": [ - { - "parameter_name": "timer_duration", - "reason_types": [ - "FORMAT_ERROR", - "UNIT_ERROR" - ], - "reasons": "Value '300' lacks required unit and space format, should be '5 minutes'", - "corrected_value": { - "timer_duration": "5 minutes" - } - }, - { - "parameter_name": "meeting_date", - "reason_types": [ - "FORMAT_ERROR" - ], - "reasons": "Value 'tomorrow' doesn't follow required YYYY-MM-DD format", - "corrected_value": { - "meeting_date": "2025-08-07" - } - }, - { - "parameter_name": "meeting_time", - "reason_types": [ - "FORMAT_ERROR" - ], - "reasons": "Value '3 PM' uses 12-hour format instead of required 24-hour HH:MM", - "corrected_value": { - "meeting_time": "15:00" - } - } - ], - "tool_call": { - "name": "set_timer_and_meeting", - "arguments": { - "timer_duration": "5 minutes", - "meeting_date": "2025-08-07", - "meeting_time": "15:00" - } - } - }, - "actionable_recommendations": [ - { - "recommendation": "PARAMETER_EXAMPLES", - "parameter_name": "timer_duration", - "details": "Add clear examples in the API documentation showing timer_duration as '5 minutes', meeting_date as '2025-08-07', and meeting_time as '15:00' to prevent format confusion.", - "quote": "Examples: `{ \"timer_duration\": \"5 minutes\" }`, `{ \"timer_duration\": \"10 seconds\" }`" - }, - { - "recommendation": "SYSTEM_PROMPT_ADDITIONS", - "parameter_name": "timer_duration", - "details": "Add to system prompt to always convert relative dates, 12-hour times, and ensure timer durations include both number and unit with a space before calling tools.", - "quote": "Always convert relative dates like 'tomorrow' to YYYY-MM-DD format, convert 12-hour time to 24-hour HH:MM format, and ensure timer durations include both number and unit with a space." - }, - { - "recommendation": "PARAMETER_VALIDATION", - "parameter_name": "timer_duration", - "details": "Implement pre-call validation to check timer_duration pattern, date format compliance, and time format before making tool calls.", - "quote": "Validate 'timer_duration' matches pattern '^\\d+\\s+(minutes?|seconds?|hours?)$', 'meeting_date' is in 'YYYY-MM-DD', and 'meeting_time' is in 'HH:MM' 24-hour format before calling the tool." - } - ] + "arguments": { + "type": "object", + "description": "Corrected arguments for the function call as a JSON object (not a string). Example: {\"param1\": \"value1\", \"param2\": 123}", + "additionalProperties": true } + }, + "required": [ + "name", + "arguments" + ] } - ] - }, - { - "name": "general_conversation_grounded_correctness", - "task_description": "Evaluate the overall correctness and appropriateness of the tool call based SOLELY on the conversation context, WITHOUT requiring explicit tool specifications. This metric assesses whether the tool call is a logical, non-redundant action that is fully grounded in the conversation history.\n\n**IMPORTANT**: If tool specifications appear in the conversation (e.g., in system prompts or assistant messages), you MAY use them to enhance your assessment. However, your primary focus should be on conversation grounding.\n\n---\n\n### 1. 
Tool Call Appropriateness\n\nAssess whether the tool call makes sense given the conversation flow:\n\n- **Intent Alignment**: Does the tool call directly address the user's request or a logical next step?\n- **Non-Redundancy**: Is this tool call necessary, or has equivalent information already been obtained?\n- **Timing**: Are there prerequisite actions that should have been completed first?\n- **Relevance**: Is the chosen function appropriate for the current conversation context?\n\n---\n\n### 2. Conversation Grounding Hierarchy\n\nParameter values should be grounded in the following priority order:\n\n1. **Tool Outputs** (highest priority): Values from previous tool call results\n2. **Explicit User Statements**: Direct mentions in user messages\n3. **Confirmed Assistant Statements**: Information the assistant provided that the user acknowledged\n4. **Logical Inference**: Values that can be reasonably inferred from context (use cautiously)\n\n**Key Principle**: When there's a conflict between sources, tool outputs take precedence over user/assistant statements, as they represent ground truth from the system.\n\n---\n\n### 3. Parameter Value Assessment\n\nFor each parameter in the tool call:\n\n- **Explicit Grounding**: Is the value directly stated in the conversation?\n- **Implicit Grounding**: Can the value be reasonably inferred from context?\n- **Assumption Detection**: Is the value assumed without any conversational basis?\n- **Conflict Resolution**: If multiple sources provide different values, is the most authoritative source used?\n\n---\n\n### 4. Format and Consistency (Conversation-Based)\n\nEven without formal specifications:\n\n- **Internal Consistency**: Do parameter values make logical sense together (e.g., end_date after start_date)?\n- **Observable Patterns**: If tool specifications are mentioned in the conversation, do values conform to described patterns?\n- **Common Sense Validation**: Do values follow basic logical constraints (e.g., no negative quantities for counts)?\n\n---\n\n### 5. 
Prerequisite Satisfaction\n\nCheck if the tool call has all necessary preconditions:\n\n- **Dependent Information**: Is all required information available from the conversation or prior tool calls?\n- **State Requirements**: Based on conversation context, is the system in the right state for this call?\n- **Missing Steps**: Are there obvious gaps in the conversation that should be addressed first?\n\n---\n\n### Correctness Rating Scale\n\n- **5 - Fully Correct**: Tool call is perfectly appropriate, all parameters are explicitly or clearly inferably grounded in conversation, no redundancy, all prerequisites satisfied\n- **4 - Mostly Correct**: Tool call is appropriate and well-grounded, with only minor implicit inferences that are very reasonable\n- **3 - Partially Correct**: Tool call is generally appropriate but has some parameters with weak grounding or minor timing/relevance issues\n- **2 - Problematic**: Tool call has significant issues—multiple ungrounded parameters, questionable relevance, or missing prerequisites\n- **1 - Incorrect**: Tool call is inappropriate, redundant, or has parameters that are clearly fabricated or contradict conversation context\n\n---\n\n### Evaluation Guidelines\n\n- **Conservative Assessment**: When grounding is unclear, prefer lower scores\n- **Prioritize Tool Outputs**: Always favor values from tool results over human statements\n- **Consider Conversation Flow**: Assess whether the tool call makes sense in the sequence of events\n- **Watch for Hallucinations**: Flag any parameters that appear without conversational basis\n- **Use Available Specs**: If tool specifications appear in the conversation, incorporate them into your assessment\n\nYour evaluation should reflect a holistic view of whether a human reviewing the conversation would consider this tool call to be correct, appropriate, and fully justified by the available information.", - "jsonschema": { - "title": "general_conversation_grounded_correctness", - "description": "Holistic assessment of tool call correctness based on conversation grounding, appropriateness, and logical consistency without requiring explicit tool specifications.", + }, + "required": [] + }, + "actionable_recommendations": { + "type": "array", + "description": "List of actionable recommendations as unified-diff patches. Emit only when the call reveals a fixable gap in the agent's prompts or tool specifications. Empty array when there is nothing to fix (including when the call is already correct).", + "items": { "type": "object", "additionalProperties": false, "properties": { - "evidence": { - "type": "string", - "description": "Provide EXACT quotes from the conversation that support or contradict the tool call. Include: (1) relevant user messages, (2) assistant messages, (3) prior tool outputs, and (4) any tool specifications mentioned in the conversation. Organize evidence by parameter and by aspect (appropriateness, grounding, prerequisites)." - }, - "explanation": { - "type": "string", - "description": "Provide an analysis covering: (1) Tool Call Appropriateness - why this tool call makes sense (or doesn't) in the conversation flow, (2) Parameter Grounding - for EACH parameter, explain its grounding source and quality, noting conflicts between sources and resolution, (3) Prerequisites - whether all necessary preconditions are satisfied, (4) Format and Consistency - whether parameters make logical sense together, (5) Overall Correctness - summary judgment on whether the tool call is fully justified by the conversation." 
- }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "Overall Correctness Score (1-5):\n\n5 - Fully Correct: Tool call is the right action at the right time. All parameters are explicitly grounded in conversation (user messages, confirmed assistant statements, or tool outputs). No redundancy. All prerequisites satisfied. If tool specs are in conversation, the call aligns with them.\nExample: User asks \"What's the weather in Boston?\", assistant calls get_weather(location=\"Boston\") with no prior weather call for Boston.\n\n4 - Mostly Correct: Tool call is appropriate and timely. Most parameters are explicitly grounded, a few may be reasonably inferred from clear context. No significant issues. Prerequisites satisfied.\nExample: User says \"Book a table for dinner tonight\", assistant calls book_restaurant(date=\"2025-12-17\", time=\"19:00\", party_size=2) where date is inferred from 'tonight' and party_size from typical default.\n\n3 - Partially Correct: Tool call is generally appropriate but has noticeable issues—some parameters are weakly grounded, minor relevance concerns, or unclear if prerequisites are met. The call might work but has questionable elements.\nExample: User says \"I need a hotel\", assistant calls book_hotel(location=\"New York\", check_in=\"2025-12-20\") where location and dates aren't mentioned in conversation.\n\n2 - Problematic: Tool call has significant correctness issues—multiple parameters lack grounding, the timing is questionable, important prerequisites are missing, or the call seems partially redundant. Likely to fail or produce incorrect results.\nExample: User asks \"What flights are available?\", but assistant calls book_flight(...) with specific dates and destinations not discussed, skipping the search step.\n\n1 - Incorrect: Tool call is inappropriate, clearly redundant, or fundamentally misaligned with conversation. Parameters are fabricated or contradict known information. Missing critical prerequisites. The call should not be made.\nExample: User asks about restaurant recommendations, assistant calls cancel_reservation(reservation_id=\"12345\") with no prior discussion of cancellations or that ID." - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Higher confidence when conversation context is clear and complete. Lower confidence when context is ambiguous, conversation is incomplete, or tool specifications are not available." - }, - "correction": { - "type": "object", - "description": "Corrections for the tool call. For correct calls (output >= 4), return {}. For problematic calls (output <= 3), provide either need_more_information OR corrected_tool_call.\n\nCRITICAL FORMAT REQUIREMENT: When providing corrected_tool_call, the 'arguments' field MUST be a JSON object/dict (e.g., {\"key\": \"value\"}), NOT a JSON string (e.g., NOT \"{\\\"key\\\": \\\"value\\\"}\").", - "properties": { - "need_more_information": { - "type": "array", - "description": "List of questions to ask the user if information is missing. Each question should be specific and actionable.", - "items": { - "type": "string" - } - }, - "corrected_tool_call": { - "type": "object", - "description": "Complete corrected tool call with all issues addressed. Only provide if correction is possible without additional user input. 
IMPORTANT: The 'arguments' field must be a JSON object (dict), not a JSON string.", - "properties": { - "name": { - "type": "string", - "description": "Name of the function to call." - }, - "arguments": { - "type": "object", - "description": "Corrected arguments for the function call as a JSON object (not a string). Example: {\"param1\": \"value1\", \"param2\": 123}", - "additionalProperties": true - } - }, - "required": ["name", "arguments"] - } - }, - "required": [] - }, - "actionable_recommendations": { - "type": "array", - "description": "Provide clear, actionable recommendations that can be automatically applied to improve conversation grounding. Only include recommendations when there are actual issues (output <= 3). Focus on:\n\n1. SYSTEM_PROMPT_INSTRUCTION: Add a general instruction to the system prompt about grounding, prerequisites, or clarification (do not mention specific parameter names).", - "items": { - "type": "object", - "properties": { - "recommendation": { - "type": "string", - "enum": [ - "SYSTEM_PROMPT_INSTRUCTION" - ], - "description": "The type of recommendation being made." - }, - "details": { - "type": "string", - "description": "Brief explanation of what should be changed and why." - }, - "quote": { - "type": "string", - "description": "Exact text to add to documentation or system prompt. Must be directly usable without modification." - } - }, - "required": [ - "recommendation", - "details", - "quote" - ] - } - } + "target": { + "type": "string", + "enum": [ + "system_prompt", + "tool_description", + "parameter_description", + "parameter_examples" + ], + "description": "Which artifact the diff applies to. `system_prompt` changes the agent's top-level policy; the other three target a specific tool (and possibly one of its parameters)." + }, + "tool_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is tool_description, parameter_description, or parameter_examples. Use the exact function/tool name. Must be null when target is system_prompt." + }, + "parameter_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is parameter_description or parameter_examples. Null otherwise." + }, + "diff": { + "type": "string", + "description": "Unified git-diff-format patch of the proposed change. Format: `--- a/\\n+++ b/\\n@@ ... @@\\n-\\n+`. When there is no line-number context, use `@@` as the single hunk header. Keep diffs minimal \u2014 only the lines that meaningfully change." + }, + "rationale": { + "type": "string", + "description": "One-sentence reason why this change addresses the observed issue. <= 200 chars." + }, + "importance": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "How impactful this recommendation is at preventing similar issues in future calls (0 = cosmetic, 1 = critical). Calibrate consistently across metrics so downstream aggregation is comparable." + } }, "required": [ - "evidence", - "explanation", - "output", - "confidence", - "correction", - "actionable_recommendations" + "target", + "diff", + "rationale", + "importance" ] + } + } + }, + "required": [ + "evidence", + "explanation", + "output", + "confidence", + "correction", + "actionable_recommendations" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "What's the weather like in Boston?" 
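[Editor's note: both metrics repeat the requirement that `arguments` be a JSON object rather than a re-encoded JSON string, so a consumer reading `corrected_tool_call` will likely want a defensive normalization step. A small sketch; the function name and the OpenAI-style `{"name": ..., "arguments": ...}` shape are assumptions taken from the examples in this file:

    import json

    def normalize_arguments(tool_call: dict) -> dict:
        # Return a copy whose "arguments" is guaranteed to be a dict,
        # decoding it when the model emitted a JSON-encoded string instead.
        call = dict(tool_call)
        args = call.get("arguments", {})
        if isinstance(args, str):
            args = json.loads(args)
        if not isinstance(args, dict):
            raise TypeError("arguments must decode to a JSON object")
        call["arguments"] = args
        return call
]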
+ } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"location\": \"Boston\"}" + } + } }, - "examples": [ + "output": { + "evidence": "User message: \"What's the weather like in Boston?\"\nTool call parameter: location=\"Boston\"\nNo prior tool calls in conversation.", + "explanation": "Tool Call Appropriateness: The tool call directly addresses the user's explicit request for weather information. It's timely, relevant, and non-redundant.\n\nParameter Grounding: The 'location' parameter value \"Boston\" is explicitly mentioned in the user's question, providing perfect grounding.\n\nPrerequisites: No prerequisites are needed for a weather query beyond having the location, which is provided.\n\nFormat and Consistency: The location parameter is a simple string matching the user's input, internally consistent.\n\nOverall Correctness: This tool call is perfectly appropriate and fully grounded in the conversation.", + "output": 5, + "confidence": 0.98, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "What's the weather like in Boston?" - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{\"location\": \"Boston\"}" - } - } - }, - "output": { - "evidence": "User message: \"What's the weather like in Boston?\"\nTool call parameter: location=\"Boston\"\nNo prior tool calls in conversation.", - "explanation": "Tool Call Appropriateness: The tool call directly addresses the user's explicit request for weather information. It's timely, relevant, and non-redundant.\n\nParameter Grounding: The 'location' parameter value \"Boston\" is explicitly mentioned in the user's question, providing perfect grounding.\n\nPrerequisites: No prerequisites are needed for a weather query beyond having the location, which is provided.\n\nFormat and Consistency: The location parameter is a simple string matching the user's input, internally consistent.\n\nOverall Correctness: This tool call is perfectly appropriate and fully grounded in the conversation.", - "output": 5, - "confidence": 0.98, - "correction": {}, - "actionable_recommendations": [] - } + "role": "user", + "content": "I want to book a flight." }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "I want to book a flight." - }, - { - "role": "assistant", - "content": "I'd be happy to help you book a flight. Could you please provide the departure city, destination, and travel dates?" - } - ], - "tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "book_flight", - "arguments": "{\"departure\": \"New York\", \"destination\": \"Los Angeles\", \"date\": \"2025-12-25\"}" - } - } - }, - "output": { - "evidence": "User message: \"I want to book a flight.\"\nAssistant message: \"Could you please provide the departure city, destination, and travel dates?\"\nTool call parameters: departure=\"New York\", destination=\"Los Angeles\", date=\"2025-12-25\"\nNo user response providing the requested information.", - "explanation": "Tool Call Appropriateness: The tool call is premature and inappropriate. The assistant explicitly asked the user for departure city, destination, and dates, but the user has not yet provided this information. 
Making the booking call now is incorrect.\n\nParameter Grounding: All three parameters (departure, destination, date) are completely ungrounded. They do not appear anywhere in the conversation and represent pure hallucination. The user only expressed a general intent to book a flight.\n\nPrerequisites: The critical prerequisite—obtaining the necessary booking information from the user—has not been satisfied. The assistant asked for this information but didn't wait for the response.\n\nFormat and Consistency: While the parameters themselves are internally consistent (valid city names and date format), they have no basis in reality for this conversation.\n\nOverall Correctness: This tool call is fundamentally incorrect and should not be made. The system should wait for user input before proceeding.", - "output": 1, - "confidence": 0.95, - "correction": { - "need_more_information": [ - "What city will you be departing from?", - "What is your destination?", - "What date would you like to travel?" - ] - }, - "actionable_recommendations": [ - { - "recommendation": "SYSTEM_PROMPT_INSTRUCTION", - "details": "Add instruction to wait for user responses before making tool calls.", - "quote": "If you ask the user for information, you must wait for their response before making any tool calls that require that information. Never fabricate or assume parameter values." - }, - { - "recommendation": "SYSTEM_PROMPT_INSTRUCTION", - "details": "Add instruction about prerequisite checking.", - "quote": "Before making tool calls, verify that all required information is available in the conversation. If any required parameter is missing, ask the user to provide it first." - } - ] - } + "role": "assistant", + "content": "I'd be happy to help you book a flight. Could you please provide the departure city, destination, and travel dates?" + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "book_flight", + "arguments": "{\"departure\": \"New York\", \"destination\": \"Los Angeles\", \"date\": \"2025-12-25\"}" + } + } + }, + "output": { + "evidence": "User message: \"I want to book a flight.\"\nAssistant message: \"Could you please provide the departure city, destination, and travel dates?\"\nTool call parameters: departure=\"New York\", destination=\"Los Angeles\", date=\"2025-12-25\"\nNo user response providing the requested information.", + "explanation": "Tool Call Appropriateness: The tool call is premature and inappropriate. The assistant explicitly asked the user for departure city, destination, and dates, but the user has not yet provided this information. Making the booking call now is incorrect.\n\nParameter Grounding: All three parameters (departure, destination, date) are completely ungrounded. They do not appear anywhere in the conversation and represent pure hallucination. The user only expressed a general intent to book a flight.\n\nPrerequisites: The critical prerequisite\u2014obtaining the necessary booking information from the user\u2014has not been satisfied. The assistant asked for this information but didn't wait for the response.\n\nFormat and Consistency: While the parameters themselves are internally consistent (valid city names and date format), they have no basis in reality for this conversation.\n\nOverall Correctness: This tool call is fundamentally incorrect and should not be made. 
The system should wait for user input before proceeding.", + "output": 1, + "confidence": 0.95, + "correction": { + "need_more_information": [ + "What city will you be departing from?", + "What is your destination?", + "What date would you like to travel?" + ] + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Get me the user profile for the logged-in user." - }, - { - "role": "assistant", - "content": null, - "tool_calls": [ - { - "id": "call_001", - "type": "function", - "function": { - "name": "get_current_user_id", - "arguments": "{}" - } - } - ] - }, - { - "role": "tool", - "tool_call_id": "call_001", - "content": "{\"user_id\": 42, \"username\": \"john_doe\"}" - }, - { - "role": "assistant", - "content": "I found that you're logged in as john_doe (user ID: 42). Let me get your full profile now." - } - ], - "tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "get_user_profile", - "arguments": "{\"user_id\": 42}" - } - } - }, - "output": { - "evidence": "User message: \"Get me the user profile for the logged-in user.\"\nPrior tool call: get_current_user_id()\nTool output: {\"user_id\": 42, \"username\": \"john_doe\"}\nAssistant confirmation: \"you're logged in as john_doe (user ID: 42)\"\nCurrent tool call: get_user_profile(user_id=42)", - "explanation": "Tool Call Appropriateness: The tool call is perfectly appropriate and represents the logical next step. The user requested the profile for the logged-in user. The assistant first identified the user (ID: 42) via a tool call, confirmed this with the user, and is now proceeding to fetch the full profile.\n\nParameter Grounding: The 'user_id' parameter value of 42 is grounded in the tool output from get_current_user_id. This is the highest-quality grounding source (tool output), taking precedence over any human statements.\n\nPrerequisites: All prerequisites are satisfied—the user ID was obtained through a prior tool call, which is the proper way to get this information before fetching the profile.\n\nFormat and Consistency: The user_id is an integer (42) which is appropriate for an ID field. It's consistent with the value returned from the authentication check.\n\nOverall Correctness: This tool call demonstrates excellent conversation flow—proper sequencing, authoritative grounding from tool output, and clear alignment with user intent.", - "output": 5, - "confidence": 0.97, - "correction": {}, - "actionable_recommendations": [] - } + "target": "tool_description", + "tool_name": "book_flight", + "parameter_name": null, + "diff": "--- a/tool/book_flight\n+++ b/tool/book_flight\n@@\n+Call this only after you have gathered the inputs listed in `required`. 
If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Get me the user profile for the logged-in user." }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Book a table for dinner tonight at 7 PM." - } - ], - "tool_call": { - "id": "call_003", - "type": "function", - "function": { - "name": "book_restaurant", - "arguments": "{\"date\": \"2025-12-17\", \"time\": \"19:00\", \"party_size\": 2}" - } - } - }, - "output": { - "evidence": "User message: \"Book a table for dinner tonight at 7 PM.\"\nTool call parameters: date=\"2025-12-17\", time=\"19:00\", party_size=2\nCurrent date context: December 17, 2025\nNo mention of restaurant name or party size in conversation.", - "explanation": "Tool Call Appropriateness: The tool call is generally appropriate as it responds to the user's booking request. However, it's missing a critical piece of information—which restaurant to book.\n\nParameter Grounding:\n- 'date': The value \"2025-12-17\" is a reasonable inference from \"tonight\" given the current date context. This is a clear implicit inference (score: 4 level).\n- 'time': The value \"19:00\" (7 PM in 24-hour format) is explicitly grounded in the user's statement \"7 PM\". This is well-grounded.\n- 'party_size': The value 2 is an assumption with no basis in the conversation. The user didn't mention how many people. This is a hallucination.\n\nPrerequisites: The restaurant name/location is missing, which is typically a required prerequisite for booking. The user should specify which restaurant.\n\nFormat and Consistency: The parameters are internally consistent (date is today, time is evening, party size is positive), but the party_size lacks grounding.\n\nOverall Correctness: The tool call has mixed quality—some parameters are well-grounded while others are assumed. The missing restaurant information is also problematic.", - "output": 3, - "confidence": 0.85, - "correction": { - "need_more_information": [ - "Which restaurant would you like to book?", - "How many people will be dining?" - ] - }, - "actionable_recommendations": [ - { - "recommendation": "SYSTEM_PROMPT_INSTRUCTION", - "details": "Add instruction about when inference is acceptable vs. when to ask.", - "quote": "Temporal references like 'tonight' or 'tomorrow' can be inferred from the current date. However, user-specific values like party size, number of travelers, or personal preferences must never be assumed—always ask the user if not explicitly provided." - } - ] + "role": "assistant", + "content": null, + "tool_calls": [ + { + "id": "call_001", + "type": "function", + "function": { + "name": "get_current_user_id", + "arguments": "{}" + } } + ] }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "I need a reservation at The French Bistro for 2 people tomorrow at 7 PM." - }, - { - "role": "assistant", - "content": "I'll book a table for you at The French Bistro." 
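[Editor's note: the recommendation diffs in these examples use the bare `@@` hunk header the schema allows and contain only added lines. Folding such a patch into a system prompt can therefore be done naively, as sketched below; anything with removals or real line-number context should go through a proper patch library instead. This helper is illustrative, not part of ALTK:

    def apply_addition_only_diff(text: str, diff: str) -> str:
        # Keep only the "+" payload lines; skip the ---/+++/@@ header lines.
        added = [
            line[1:]
            for line in diff.splitlines()
            if line.startswith("+") and not line.startswith("+++")
        ]
        return text.rstrip("\n") + "\n" + "\n".join(added) + "\n"
]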
- } - ], - "tool_call": { - "id": "call_004", - "type": "function", - "function": { - "name": "book_restaurant", - "arguments": "{\"restaurant_name\": \"The French Bistro\", \"date\": \"2025-12-18\", \"time\": \"19:00\", \"party_size\": 4, \"dietary_restrictions\": [\"vegetarian\"]}" - } - } - }, - "output": { - "evidence": "User message: \"I need a reservation at The French Bistro for 2 people tomorrow at 7 PM.\"\nTool call parameters: restaurant_name=\"The French Bistro\", date=\"2025-12-18\", time=\"19:00\", party_size=4, dietary_restrictions=[\"vegetarian\"]\nCurrent date context: December 17, 2025", - "explanation": "Tool Call Appropriateness: The tool call is appropriate in intent but has parameter errors.\n\nParameter Grounding:\n- 'restaurant_name': \"The French Bistro\" is explicitly grounded.\n- 'date': \"2025-12-18\" is correctly inferred from \"tomorrow\".\n- 'time': \"19:00\" is explicitly grounded in \"7 PM\".\n- 'party_size': Value is 4, but user explicitly said \"2 people\" - this is a contradiction.\n- 'dietary_restrictions': [\"vegetarian\"] is completely ungrounded - user never mentioned dietary restrictions.\n\nOverall Correctness: Significant issues with party_size contradiction and hallucinated dietary_restrictions.", - "output": 2, - "confidence": 0.9, - "correction": { - "corrected_tool_call": { - "name": "book_restaurant", - "arguments": { - "restaurant_name": "The French Bistro", - "date": "2025-12-18", - "time": "19:00", - "party_size": 2 - } - } - }, - "actionable_recommendations": [ - { - "recommendation": "SYSTEM_PROMPT_INSTRUCTION", - "details": "Add instruction to verify numbers carefully.", - "quote": "When users provide specific quantities (party size, number of guests, passenger count, etc.), use the exact number stated. Never modify or assume different quantities unless explicitly confirmed by the user." - } - ] - } + "role": "tool", + "tool_call_id": "call_001", + "content": "{\"user_id\": 42, \"username\": \"john_doe\"}" + }, + { + "role": "assistant", + "content": "I found that you're logged in as john_doe (user ID: 42). Let me get your full profile now." + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "get_user_profile", + "arguments": "{\"user_id\": 42}" + } + } + }, + "output": { + "evidence": "User message: \"Get me the user profile for the logged-in user.\"\nPrior tool call: get_current_user_id()\nTool output: {\"user_id\": 42, \"username\": \"john_doe\"}\nAssistant confirmation: \"you're logged in as john_doe (user ID: 42)\"\nCurrent tool call: get_user_profile(user_id=42)", + "explanation": "Tool Call Appropriateness: The tool call is perfectly appropriate and represents the logical next step. The user requested the profile for the logged-in user. The assistant first identified the user (ID: 42) via a tool call, confirmed this with the user, and is now proceeding to fetch the full profile.\n\nParameter Grounding: The 'user_id' parameter value of 42 is grounded in the tool output from get_current_user_id. This is the highest-quality grounding source (tool output), taking precedence over any human statements.\n\nPrerequisites: All prerequisites are satisfied\u2014the user ID was obtained through a prior tool call, which is the proper way to get this information before fetching the profile.\n\nFormat and Consistency: The user_id is an integer (42) which is appropriate for an ID field. 
It's consistent with the value returned from the authentication check.\n\nOverall Correctness: This tool call demonstrates excellent conversation flow\u2014proper sequencing, authoritative grounding from tool output, and clear alignment with user intent.", + "output": 5, + "confidence": 0.97, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Book a table for dinner tonight at 7 PM." + } + ], + "tool_call": { + "id": "call_003", + "type": "function", + "function": { + "name": "book_restaurant", + "arguments": "{\"date\": \"2025-12-17\", \"time\": \"19:00\", \"party_size\": 2}" + } + } + }, + "output": { + "evidence": "User message: \"Book a table for dinner tonight at 7 PM.\"\nTool call parameters: date=\"2025-12-17\", time=\"19:00\", party_size=2\nCurrent date context: December 17, 2025\nNo mention of restaurant name or party size in conversation.", + "explanation": "Tool Call Appropriateness: The tool call is generally appropriate as it responds to the user's booking request. However, it's missing a critical piece of information\u2014which restaurant to book.\n\nParameter Grounding:\n- 'date': The value \"2025-12-17\" is a reasonable inference from \"tonight\" given the current date context. This is a clear implicit inference (score: 4 level).\n- 'time': The value \"19:00\" (7 PM in 24-hour format) is explicitly grounded in the user's statement \"7 PM\". This is well-grounded.\n- 'party_size': The value 2 is an assumption with no basis in the conversation. The user didn't mention how many people. This is a hallucination.\n\nPrerequisites: The restaurant name/location is missing, which is typically a required prerequisite for booking. The user should specify which restaurant.\n\nFormat and Consistency: The parameters are internally consistent (date is today, time is evening, party size is positive), but the party_size lacks grounding.\n\nOverall Correctness: The tool call has mixed quality\u2014some parameters are well-grounded while others are assumed. The missing restaurant information is also problematic.", + "output": 3, + "confidence": 0.85, + "correction": { + "need_more_information": [ + "Which restaurant would you like to book?", + "How many people will be dining?" + ] + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 + }, + { + "target": "tool_description", + "tool_name": "book_restaurant", + "parameter_name": null, + "diff": "--- a/tool/book_restaurant\n+++ b/tool/book_restaurant\n@@\n+Call this only after you have gathered the inputs listed in `required`. If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "I need a reservation at The French Bistro for 2 people tomorrow at 7 PM." 
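[Editor's note: the grounding hierarchy in this metric is written for an LLM judge, but its strongest path (a value stated verbatim in the conversation or a prior tool output) also admits a cheap mechanical pre-check. A naive sketch; it does verbatim substring matching only, cannot recognize the pick-one, specialization, or inference paths, and the function name is illustrative:

    def verbatim_grounding(arguments: dict, conversation: list[dict]) -> dict[str, bool]:
        # True when the argument's string form appears somewhere in a
        # message body or prior tool output; False means "ask the judge".
        corpus = " ".join(str(m.get("content") or "") for m in conversation)
        return {name: str(value) in corpus for name, value in arguments.items()}

In the get_user_profile example above, verbatim_grounding({"user_id": 42}, messages)["user_id"] is True because the prior tool output contains "user_id": 42.]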
+ }, + { + "role": "assistant", + "content": "I'll book a table for you at The French Bistro." + } + ], + "tool_call": { + "id": "call_004", + "type": "function", + "function": { + "name": "book_restaurant", + "arguments": "{\"restaurant_name\": \"The French Bistro\", \"date\": \"2025-12-18\", \"time\": \"19:00\", \"party_size\": 4, \"dietary_restrictions\": [\"vegetarian\"]}" + } + } + }, + "output": { + "evidence": "User message: \"I need a reservation at The French Bistro for 2 people tomorrow at 7 PM.\"\nTool call parameters: restaurant_name=\"The French Bistro\", date=\"2025-12-18\", time=\"19:00\", party_size=4, dietary_restrictions=[\"vegetarian\"]\nCurrent date context: December 17, 2025", + "explanation": "Tool Call Appropriateness: The tool call is appropriate in intent but has parameter errors.\n\nParameter Grounding:\n- 'restaurant_name': \"The French Bistro\" is explicitly grounded.\n- 'date': \"2025-12-18\" is correctly inferred from \"tomorrow\".\n- 'time': \"19:00\" is explicitly grounded in \"7 PM\".\n- 'party_size': Value is 4, but user explicitly said \"2 people\" - this is a contradiction.\n- 'dietary_restrictions': [\"vegetarian\"] is completely ungrounded - user never mentioned dietary restrictions.\n\nOverall Correctness: Significant issues with party_size contradiction and hallucinated dietary_restrictions.", + "output": 2, + "confidence": 0.9, + "correction": { + "corrected_tool_call": { + "name": "book_restaurant", + "arguments": { + "restaurant_name": "The French Bistro", + "date": "2025-12-18", + "time": "19:00", + "party_size": 2 + } + } + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 + }, + { + "target": "tool_description", + "tool_name": "book_restaurant", + "parameter_name": null, + "diff": "--- a/tool/book_restaurant\n+++ b/tool/book_restaurant\n@@\n+Call this only after you have gathered the inputs listed in `required`. If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 } - ] - } -] \ No newline at end of file + ] + } + } + ] + } +] diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics_runtime.json b/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics_runtime.json index cd645489..931df36f 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics_runtime.json +++ b/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics_runtime.json @@ -1,832 +1,835 @@ [ - { - "name": "general_hallucination_check", - "task_description": "Evaluate whether each parameter value in the function call is correct and **directly supported** by the provided conversation history and adhere the API specification. Your assessment must be based **strictly on explicit evidence** from these sources and correctly formatted based on the API specifications. Do **not** assume or hallucinate any information that is not clearly documented and provided.\n\n---\n\n#### 1. 
Grounding Sources\n\nA parameter value is considered grounded if it originates from one of the following:\n\n- An explicit user message in the conversation \n- An assistant message that the user confirmed or acknowledged \n- The output of a previous tool call \n- A documented default value in the API specification \n\n---\n\n#### 2. Parameter Value Classification\n\nEach parameter value must be labeled using one of the following categories:\n\n- **CORRECT** \n The value is explicitly mentioned, clearly implied in the conversation, or matches a documented default.\n\n- **MISSING_INFORMATION** \n The value is underspecified or incomplete given the current context (e.g., '05/06' when a full date is required).\n\n- **FORMAT ERROR** \n The value is conceptually correct but incorrectly formatted (e.g., wrong units, casing, or structure) based on the API specifications and descriptions.\n\n- **CONTRADICTORY_VALUES** \n The value violates documented constraints or logical relationships between parameters (e.g., `start_time` should be < `end_time`).\n\n- **DEFAULT_ISSUE** \n The default value of the parameter is not the correct value based on the conversation history.\n\n---\n\n#### 3. Handling Default Values\n\n- Optional parameters may only use default values that are **explicitly documented** in the API specification. \n- Defaults that are assumed or undocumented count as hallucinations. \n- If no value is provided and no default exists, the parameter should be omitted.\n\n---\n\n#### 4. Acceptable Transformations\n\nSome transformations are permitted, but only when grounded:\n\n- **Synonyms** (e.g., \"NYC\" -> \"New York\") are allowed only when normalized in the conversation. \n- **Unit conversions** are valid only if the conversation explicitly mentions the conversion. \n- **Format changes** (e.g., date or time reformatting) are acceptable only if acknowledged by the assistant or supported by the specification.\n\n---\n\n#### 5. Multi-Call Context\n\nIf the tool call appears in a sequence:\n\n- You may use outputs from earlier tool calls to justify parameter values. \n- Consider dependencies and ordering between calls when assessing grounding. \n\n---\n\n#### Conservative Judgment Principle\n\nWhen in doubt, err on the side of caution. If grounding cannot be clearly established, treat the parameter as incorrect.\n", - "jsonschema": { - "title": "general_hallucination_check", - "description": "Assessment of tool call grounding accuracy, following the rubric defined in the task description.", - "type": "object", - "additionalProperties": false, - "properties": { - "evidence": { + { + "name": "general_hallucination_check", + "task_description": "Evaluate whether each parameter value in the function call is correct and **directly supported** by the provided conversation history AND adheres to the API specification. Label each parameter using one of the categories below.\n\n---\n\n#### 1. Grounding Sources\n\nA value is grounded when it comes from any of these paths:\n\n- **Explicit statement** \u2014 an explicit user message, an assistant message the user confirmed, a previous tool output, or a value fixed by the system prompt.\n- **Documented default** \u2014 a default that the API spec documents for the parameter.\n- **Pick-one from explicit options** \u2014 the user enumerated several acceptable values and the agent picked one of them. 
The pick is grounded even if the user did not name a single preferred value.\n- **Specialization from an abstract set/range/category** \u2014 the user named a set (\"any of these\"), a range (\"around N\"), or a category (\"a fast one\"), and the agent supplied a concrete instance that falls INSIDE the user's named scope. A concrete value that satisfies the user's abstract constraint is grounded.\n- **Plausible optional value** \u2014 for OPTIONAL parameters (not in the spec's `required` array), any value consistent with the documented type/enum/pattern is grounded unless it contradicts explicit user intent or a higher-priority source.\n\n---\n\n#### 2. Parameter Value Classification\n\n- **CORRECT** \u2014 grounded via any path in \u00a71 (explicit statement, documented default, pick-one from options, specialization inside the user's named set, or a plausible optional value).\n- **MISSING_INFORMATION** \u2014 required AND no grounding path in \u00a71 applies: no explicit evidence, no documented default, no user-provided option set to pick from, no abstract user intent that can be specialized, and no plausible optional-value interpretation. Do NOT use this label just because the value is not quoted verbatim from the user.\n- **FORMAT_ERROR** \u2014 conceptually right value but wrong format (units, casing, structure, pattern) per the spec. Format issues are separate from value issues \u2014 a value that is semantically wrong is NOT a FORMAT_ERROR.\n- **CONTRADICTORY_VALUES** \u2014 the value violates a documented constraint, a logical relationship between parameters, OR falls OUTSIDE the user's explicitly named set/range/category (specialization grounding failed because the concrete choice is not a member of the abstract intent).\n- **DEFAULT_ISSUE** \u2014 a default is used but the correct value is determinable from context.\n\n---\n\n#### 3. Default and Optional Values\n\nOptional parameters (NOT in `required`) may be omitted OR set to any value consistent with the documented type/enum/pattern. Never label omission of an optional parameter as MISSING_INFORMATION. A plausible optional value \u2014 including a neutral boolean (e.g. `false`), a neutral numeric default, or an empty filter \u2014 is CORRECT unless it contradicts explicit user intent or a documented constraint. For REQUIRED parameters, only explicitly documented defaults count as grounded defaults; an undocumented default is DEFAULT_ISSUE.\n\n---\n\n#### 4. Acceptable Transformations\n\nSynonyms, unit conversions, and format re-formattings are valid when supported by the conversation or the spec. An under-specified value (e.g. one missing a year, region, unit, or other component) should be grounded from a higher-priority source (system prompt, prior tool output) before being flagged \u2014 the grounded canonical form is CORRECT.\n\n---\n\n#### Required-Parameter Omission\n\nA parameter listed in the spec's `required` array that is completely absent from the arguments is ALWAYS MISSING_INFORMATION \u2014 no grounding path (explicit statement, documented default, pick-one, specialization, or plausible-optional) substitutes for actually providing a value. The grounding paths only apply to values the agent ACTUALLY PROVIDED; they never excuse omission of a required parameter.\n\n#### User-Context Discarded\n\nWhen a parameter is spec-optional but the user (or prior tool output / system prompt) provided an explicit narrowing value for it \u2014 a search term, a filter, a category, an identifier \u2014 and the agent OMITS the parameter or sets it to an \"empty\" sentinel (empty string, null, default wildcard), this is NOT a \"plausible optional value\". It is USER_CONTEXT_DISCARDED and should be labeled CONTRADICTORY_VALUES (the call contradicts explicit user intent). Score the call 2-3 accordingly. This rule applies regardless of whether the spec marks the parameter as required; it turns on whether the user supplied narrowing context for it.\n\n---\n\n#### Stringency\n\nApply this rubric strictly for REQUIRED parameters whose value is missing, fabricated, or inconsistent with the spec. Apply it leniently for OPTIONAL parameters (not in the spec's `required` array) and for values reasonably inferable from visible context. A value is NOT an issue unless it contradicts explicit user intent, a documented constraint, or higher-priority evidence.",
+    "jsonschema": {
+      "title": "general_hallucination_check",
+      "description": "Assessment of tool call grounding accuracy, following the rubric defined in the task description.",
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "evidence": {
+          "type": "string",
+          "description": "Provide EXACT quotes from user messages, assistant responses, prior tool outputs, or API specifications that demonstrate whether each parameter value is grounded. Do not paraphrase or interpret; use only direct evidence."
+        },
+        "explanation": {
+          "type": "string",
+          "description": "For EACH parameter value, explain its grounding status by referencing specific evidence from the conversation or tool specification. Indicate whether the value is correctly grounded, hallucinated, needs more information, has format errors, or contradicts other parameters."
+        },
+        "output": {
+          "type": "integer",
+          "minimum": 1,
+          "maximum": 5,
+          "threshold_low": 4,
+          "threshold_high": 5,
+          "description": "Grounding Quality Score (1-5). Judge the CALL AS A WHOLE, aggregating per-parameter grounding. Use only evidence from the system prompt, prior tool outputs, user messages, and assistant messages; do not appeal to world knowledge.\n\n- **5**: Every parameter is grounded via an explicit source, a documented default, a pick-one from user-provided options, or a specialization inside the user's named set/range/category. Any optional parameter is either omitted or set to a plausible value consistent with the spec.\n    Example: user gives a full identifier and a timeframe; the agent passes them verbatim.\n    Example: user enumerates two acceptable values and the agent picks one of them.\n\n- **4**: Every parameter is grounded, but one value is a mild inference \u2014 e.g. a common default like a count of 1, a minimum/maximum from the spec, or a concrete specialization of an abstract intent where the user's scope clearly allows it.\n\n- **3**: One required parameter is weakly grounded \u2014 a specialization from an abstract intent where the user's scope is ambiguous, or a reasonable but unconfirmed inference. 
Other parameters are CORRECT.\n\n- **2**: Multiple parameters are weakly grounded or one required value clearly contradicts a documented constraint, a higher-priority source, or falls OUTSIDE the user's explicitly named set (CONTRADICTORY_VALUES).\n\n- **1**: One or more required parameters have no grounding path at all (no explicit evidence, no pick-one, no in-scope specialization, no documented default) \u2014 they are fabricated. Only reach this level when you can state positively that NO grounding path applies." + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." + }, + "correction": { + "type": "object", + "description": "Corrections to the tool call. For correctly grounded calls, use an empty object {}. For flawed calls, include parameter_issues and optionally a corrected tool_call if the full correction can be reconstructed.", + "properties": { + "parameter_issues": { + "type": "array", + "description": "List of parameters with issues. Leave empty if no issues were found.", + "items": { + "type": "object", + "properties": { + "parameter_name": { "type": "string", - "description": "Provide EXACT quotes from user messages, assistant responses, prior tool outputs, or API specifications that demonstrate whether each parameter value is grounded. Do not paraphrase or interpret-use only direct evidence." - }, - "explanation": { + "description": "Name of the parameter with an issue." + }, + "reason_types": { + "type": "array", + "description": "One or more categories explaining the issue with this parameter value.", + "items": { + "type": "string", + "enum": [ + "MISSING_INFORMATION", + "FORMAT_ERROR", + "CONTRADICTORY_VALUES", + "DEFAULT_ISSUE", + "OTHER" + ] + } + }, + "reasons": { "type": "string", - "description": "For EACH parameter value, explain its grounding status by referencing specific evidence from the conversation or tool specification. Indicate whether the value is correctly grounded, hallucinated, needs more information, has format errors, or contradicts other parameters." - }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "Reflection Quality Score (1-5):\n\n- 5: All values are correct - fully grounded and well-formatted.\n All parameter values are explicitly grounded in the conversation or tool output, and exactly match the expected types, formats, and units.\n Example:\n User: \"Book a hotel in Paris for two adults from August 10th to 12th.\"\n \u2192 Tool call includes: location: \"Paris\", guests: 2, check_in: \"2025-08-10\", check_out: \"2025-08-12\"\n\n- 4: All values are correct based on the conversation history and tool specifications - can be logically inferred (can be implicitly grounded).\n Some parameter values are not directly mentioned but can be logically inferred from the context. No evidence of hallucination.\n Example:\n User: \"I want to book a hotel in Tokyo for me and my wife.\"\n \u2192 Tool call includes: location: \"Tokyo\", guests: 2\n\n- 3: One or more values are probably incorrect or ungrounded - ambiguous or undetermined\n It's unclear whether some parameter values are grounded. 
They might be inferred, hallucinated, or drawn from partial context, or some parameter values partially match the expected format but are likely to be rejected or misinterpreted by automatic processing.\n Example:\n User: \"Book a hotel in Rome.\"\n \u2192 Tool call includes: location: \"Rome\", guests: 2\n\n- 2: One or more values are probably incorrect - likely hallucinated or incorrect format.\n Several parameter values do not align with the conversation context and are likely hallucinated, or have format errors.\n Example:\n User: \"Find a hotel.\"\n \u2192 Tool call includes: location: \"New York\", guests: 4\n\n- 1: One or more values are clearly incorrect - certainly hallucinated or fabricated.\n Some parameter values are clearly invented, irrelevant, contradict known context, or do not follow the API specifications.\n Example:\n User: \"Remind me to call John.\"\n \u2192 Tool call includes: location: \"Barcelona\", guests: 6, check_in: \"2025-07-01\"" + "description": "Brief explanation of the issue(s) for this parameter." + }, + "corrected_value": { + "type": "object", + "description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.", + "additionalProperties": true + } }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." + "required": [] + } + }, + "tool_call": { + "type": "object", + "description": "Complete corrected tool call, including all fixed arguments. Must be included if a full reconstruction is possible.", + "properties": { + "name": { + "type": "string", + "description": "Name of the function to call." }, - "correction": { - "type": "object", - "description": "Corrections to the tool call. For correctly grounded calls, use an empty object {}. For flawed calls, include parameter_issues and optionally a corrected tool_call if the full correction can be reconstructed.", - "properties": { - "parameter_issues": { - "type": "array", - "description": "List of parameters with issues. Leave empty if no issues were found.", - "items": { - "type": "object", - "properties": { - "parameter_name": { - "type": "string", - "description": "Name of the parameter with an issue." - }, - "reason_types": { - "type": "array", - "description": "One or more categories explaining the issue with this parameter value.", - "items": { - "type": "string", - "enum": [ - "MISSING_INFORMATION", - "FORMAT_ERROR", - "CONTRADICTORY_VALUES", - "DEFAULT_ISSUE", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Brief explanation of the issue(s) for this parameter." - }, - "corrected_value": { - "type": "object", - "description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. 
If the value is well-grounded, return an empty object {}.", - "additionalProperties": true - } - }, - "required": [] - } - }, - "tool_call": { - "type": "object", - "description": "Complete corrected tool call, including all fixed arguments. Must be included if a full reconstruction is possible.", - "properties": { - "name": { - "type": "string", - "description": "Name of the function to call." - }, - "arguments": { - "type": "object", - "description": "Set of corrected arguments for the function call, including both modified and retained values.", - "additionalProperties": true - } - }, - "required": [] - } - }, - "required": [] + "arguments": { + "type": "object", + "description": "Set of corrected arguments for the function call, including both modified and retained values.", + "additionalProperties": true } + }, + "required": [] + } + }, + "required": [] + } + }, + "required": [ + "evidence", + "explanation", + "output", + "confidence", + "correction" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "assistant", + "content": "Logged in as user 55; auth token acquired." }, - "required": [ - "evidence", - "explanation", - "output", - "confidence", - "correction" - ] - }, - "examples": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "assistant", - "content": "Logged in as user 55; auth token acquired." - }, - { - "role": "assistant", - "content": "Ready to fetch your profile." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "get_user_profile", - "description": "Retrieves a user's profile details", - "parameters": { - "type": "object", - "properties": { - "user_id": { - "type": "integer", - "description": "The ID of the user whose profile is being retrieved" - } - }, - "required": [ - "user_id" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_user_profile", - "arguments": "{ \"user_id\": 55 }" - } + "role": "assistant", + "content": "Ready to fetch your profile." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "get_user_profile", + "description": "Retrieves a user's profile details", + "parameters": { + "type": "object", + "properties": { + "user_id": { + "type": "integer", + "description": "The ID of the user whose profile is being retrieved" } - }, - "output": { - "evidence": "Assistant message: \"Logged in as user 55; auth token acquired.\"", - "explanation": "The user_id=55 parameter is properly grounded in the conversation history, as evidenced by the assistant's explicit statement \"Logged in as user 55.\"", - "output": 5, - "confidence": 0.95, - "correction": {} + }, + "required": [ + "user_id" + ] } - }, + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_user_profile", + "arguments": "{ \"user_id\": 55 }" + } + } + }, + "output": { + "evidence": "Assistant message: \"Logged in as user 55; auth token acquired.\"", + "explanation": "The user_id=55 parameter is properly grounded in the conversation history, as evidenced by the assistant's explicit statement \"Logged in as user 55.\"", + "output": 5, + "confidence": 0.95, + "correction": {} + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "assistant", - "content": "Ready to fetch your profile." 
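
A side note on the grounded example above: the rubric's Required-Parameter Omission rule is mechanical enough to sketch in a few lines. Illustrative only, with the spec dict trimmed to the fields used; this is not ALTK's implementation:

```python
import json

def missing_required(spec: dict, arguments: str) -> list[str]:
    """Required parameters absent from the call map to MISSING_INFORMATION."""
    provided = json.loads(arguments)
    required = spec["function"]["parameters"].get("required", [])
    return [name for name in required if name not in provided]

# Trimmed version of the get_user_profile spec from the example above.
spec = {"function": {"parameters": {"required": ["user_id"]}}}
assert missing_required(spec, '{ "user_id": 55 }') == []   # grounded call passes
assert missing_required(spec, "{}") == ["user_id"]         # omission gets flagged
```
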
- } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "get_user_profile", - "description": "Retrieves a user's profile details", - "parameters": { - "type": "object", - "properties": { - "user_id": { - "type": "integer", - "description": "The ID of the user whose profile is being retrieved" - } - }, - "required": [ - "user_id" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_user_profile", - "arguments": "{ \"user_id\": 42 }" - } - } - }, - "output": { - "evidence": "Complete conversation history contains no mention of user ID 42. Tool specification for get_user_profile does not specify any default value for the user_id parameter.", - "explanation": "The user_id=42 parameter is a hallucinated value with no grounding in the conversation or tool specification.", - "output": 1, - "confidence": 0.91, - "correction": { - "parameter_issues": [ - { - "parameter_name": "user_id", - "reason_types": [ - "MISSING_INFORMATION" - ], - "reasons": "Value 42 is invented without any basis in conversation or specification", - "corrected_value": { - "need_more_information": "Please provide the user ID to retrieve the profile." - } - } - ] + "role": "assistant", + "content": "Ready to fetch your profile." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "get_user_profile", + "description": "Retrieves a user's profile details", + "parameters": { + "type": "object", + "properties": { + "user_id": { + "type": "integer", + "description": "The ID of the user whose profile is being retrieved" } + }, + "required": [ + "user_id" + ] } - }, + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_user_profile", + "arguments": "{ \"user_id\": 42 }" + } + } + }, + "output": { + "evidence": "Complete conversation history contains no mention of user ID 42. Tool specification for get_user_profile does not specify any default value for the user_id parameter.", + "explanation": "The user_id=42 parameter is a hallucinated value with no grounding in the conversation or tool specification.", + "output": 1, + "confidence": 0.91, + "correction": { + "parameter_issues": [ + { + "parameter_name": "user_id", + "reason_types": [ + "MISSING_INFORMATION" + ], + "reasons": "Value 42 is invented without any basis in conversation or specification", + "corrected_value": { + "need_more_information": "Please provide the user ID to retrieve the profile." + } + } + ] + } + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "I want to book a flight to Rome on 19-07-2024. I want to fly business class." 
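
When no grounding path applies, as in the fabricated user_id=42 example nearby, the correction object pairs the flagged parameter with a clarification request instead of a concrete fix. A hand-built instance, purely for illustration:

```python
# Mirrors the correction payload shown in the example; built by hand here
# to make the shape explicit. "need_more_information" replaces a corrected
# value whenever the right value cannot be recovered from the conversation.
correction = {
    "parameter_issues": [
        {
            "parameter_name": "user_id",
            "reason_types": ["MISSING_INFORMATION"],
            "reasons": "Value 42 has no basis in the conversation or the spec.",
            "corrected_value": {
                "need_more_information": "Please provide the user ID to retrieve the profile."
            },
        }
    ]
}
```
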
- } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "book_flight", - "description": "Books a flight to a specified destination on a given date.", - "parameters": { - "type": "object", - "properties": { - "destination": { - "type": "string", - "description": "The destination city or airport for the flight" - }, - "departure_date": { - "type": "string", - "description": "The date of departure, in YYYY-MM-DD format, e.g., '2024-07-19'" - }, - "seat_class": { - "type": "string", - "description": "The class of the seat (must be one of: economy, business, or first)", - "enum": [ - "economy", - "business", - "first" - ] - } - }, - "required": [ - "destination", - "departure_date", - "seat_class" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "book_flight", - "arguments": "{ \"destination\": \"Rome\", \"departure_date\": \"19-07-2024\", \"seat_class\": \"business\" }" - } - } - }, - "output": { - "evidence": "- User message: \"I want to book a flight to Rome on 19-07-2024. I want to fly business class.\"\n- Tool spec requires 'departure_date' in YYYY-MM-DD format.\n- 'seat_class' is correctly grounded as 'business'.", - "explanation": "The 'destination' parameter is correctly grounded as 'Rome'. The 'departure_date' is incorrectly formatted as '19-07-2024' instead of the required 'YYYY-MM-DD' format. The 'seat_class' is correctly grounded as 'business', but the date format error needs to be addressed.", - "output": 2, - "confidence": 0.95, - "correction": { - "parameter_issues": [ - { - "parameter_name": "departure_date", - "reason_types": [ - "FORMAT_ERROR" - ], - "reasons": "The date format '19-07-2024' does not conform to the required 'YYYY-MM-DD' format.", - "corrected_value": { - "departure_date": "2024-07-19" - } - } - ], - "tool_call": { - "name": "book_flight", - "arguments": { - "destination": "Rome", - "departure_date": "2024-07-19", - "seat_class": "business" - } - } + "role": "user", + "content": "I want to book a flight to Rome on 19-07-2024. I want to fly business class." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "book_flight", + "description": "Books a flight to a specified destination on a given date.", + "parameters": { + "type": "object", + "properties": { + "destination": { + "type": "string", + "description": "The destination city or airport for the flight" + }, + "departure_date": { + "type": "string", + "description": "The date of departure, in YYYY-MM-DD format, e.g., '2024-07-19'" + }, + "seat_class": { + "type": "string", + "description": "The class of the seat (must be one of: economy, business, or first)", + "enum": [ + "economy", + "business", + "first" + ] } + }, + "required": [ + "destination", + "departure_date", + "seat_class" + ] } + } } - ] - }, - { - "name": "general_value_format_alignment", - "task_description": "Evaluate whether ALL parameter values in the function call conform to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter values. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ALL parameter values in the tool call to ensure they meet the exact type, format, and unit requirements. Consider the collective format compliance of the entire function call.\n---\n\n### 1. 
Data Type Compliance\n- Check that each value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2. Format Specification Compliance\n- Validate adherence to documented format constraints for each parameter, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility between parameter values in the same call\n- Check that parameter relationships don't violate constraints\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: All parameter values exactly match required types, formats, and units\n- **4 - Minor Deviation**: All values are fundamentally correct but some have minor formatting issues that are still likely to be accepted by the API or parser unless strict validation is enforced\n- **3 - Moderate Deviation**: Some parameter types or formats are partially incorrect; may cause issues\n- **2 - Major Deviation**: Multiple parameters have significant type and format violations; likely to fail\n- **1 - Complete Mismatch**: One or more parameter values are entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and values seem plausible, prefer a higher score (4-5)\n- If the spec is clear and values deviate, assign a lower score (1-3)\n- Consider the cumulative impact of all format issues across parameters\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.", - "jsonschema": { - "title": "general_value_format_alignment", - "description": "Assessment of all parameter values' compliance with required type, format, and unit specifications, based on the rubric above.", - "type": "object", - "additionalProperties": false, - "properties": { - "evidence": { + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "book_flight", + "arguments": "{ \"destination\": \"Rome\", \"departure_date\": \"19-07-2024\", \"seat_class\": \"business\" }" + } + } + }, + "output": { + "evidence": "- User message: \"I want to book a flight to Rome on 19-07-2024. I want to fly business class.\"\n- Tool spec requires 'departure_date' in YYYY-MM-DD format.\n- 'seat_class' is correctly grounded as 'business'.", + "explanation": "The 'destination' parameter is correctly grounded as 'Rome'. The 'departure_date' is incorrectly formatted as '19-07-2024' instead of the required 'YYYY-MM-DD' format. 
The 'seat_class' is correctly grounded as 'business', but the date format error needs to be addressed.", + "output": 2, + "confidence": 0.95, + "correction": { + "parameter_issues": [ + { + "parameter_name": "departure_date", + "reason_types": [ + "FORMAT_ERROR" + ], + "reasons": "The date format '19-07-2024' does not conform to the required 'YYYY-MM-DD' format.", + "corrected_value": { + "departure_date": "2024-07-19" + } + } + ], + "tool_call": { + "name": "book_flight", + "arguments": { + "destination": "Rome", + "departure_date": "2024-07-19", + "seat_class": "business" + } + } + } + } + } + ] + }, + { + "name": "general_value_format_alignment", + "task_description": "Evaluate whether every parameter value in the call conforms to the type, format, and unit requirements defined in the API specification. Format compliance applies ONLY to values that are actually present; spec-optional params (not in `required`) that are omitted are NOT format errors. Format is independent of value correctness \u2014 a semantically wrong value with the right type/format is NOT a format issue here (it belongs to the grounding metrics). This rubric cares only about whether the serialized value matches the spec's type/format/pattern/unit.\n\n---\n\n### 1. Data Type Compliance\n- Each value matches the required JSON type (string, integer, number, boolean, object, array).\n- Numbers are numeric, booleans are true/false literals (not quoted strings) when the type is boolean.\n\n### 2. Format Specification Compliance\n- Dates/times, currency, patterns (phone, email, postal code) respect the spec. Regex/`format`/`pattern` constraints are honored.\n\n### 3. Unit Compliance\n- Required unit suffixes/prefixes are present; forbidden ones are absent; unit families (e.g. Celsius vs Fahrenheit) match the spec.\n\n### 4. Cross-Parameter Consistency\n- end_date >= start_date; paired values use consistent scales/units.\n\n---\n\n### Rating Scale\n- **5**: all values match types/formats/units exactly.\n- **4**: fundamentally correct, only minor formatting quirks a lax parser accepts.\n- **3**: partial type/format issues that may cause downstream errors.\n- **2**: multiple significant type/format violations.\n- **1**: at least one value is fundamentally incompatible with the spec.\n\n---\n\n### Final Rule\nIf the spec is ambiguous and values are plausible, prefer 4-5. If the spec is clear and values deviate, assign 1-3 per the severity of the deviation.\n\n---\n\n#### Stringency\n\nApply this rubric strictly for REQUIRED parameters whose value is missing, fabricated, or inconsistent with the spec. Apply it leniently for OPTIONAL parameters (not in the spec's `required` array) and for values reasonably inferable from visible context. A value is NOT an issue unless it contradicts explicit user intent, a documented constraint, or higher-priority evidence.", + "jsonschema": { + "title": "general_value_format_alignment", + "description": "Assessment of all parameter values' compliance with required type, format, and unit specifications, based on the rubric above.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Quote the specification's type/format definitions for each parameter and include the actual parameter values provided. Cite EXACT text from the tool specification that supports your judgment." 
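
The type, pattern, and unit checks this metric formalizes map naturally onto JSON Schema validation. A minimal sketch using the third-party jsonschema package; ALTK's own checker may work differently, and the schema below is a toy:

```python
import json
from jsonschema import Draft202012Validator

# Toy tool schema: an explicit pattern stands in for the spec's date format.
tool_schema = {
    "type": "object",
    "properties": {
        "departure_date": {"type": "string", "pattern": r"^\d{4}-\d{2}-\d{2}$"},
        "passengers": {"type": "integer"},
    },
    "required": ["departure_date", "passengers"],
}

args = json.loads('{"departure_date": "19-07-2024", "passengers": 2}')
for err in Draft202012Validator(tool_schema).iter_errors(args):
    print(f"{list(err.path)}: {err.message}")  # flags the DD-MM-YYYY date
```

Note that a bare `"format": "date"` is not enforced by the jsonschema package unless a format checker is attached, which is why the sketch uses an explicit pattern.
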
+ }, + "explanation": { + "type": "string", + "description": "For each parameter value, explain why it conforms or does not conform to the specification's requirements. Address data type, format, units, and any pattern or constraint violations. For incorrect values, describe exactly what is wrong and how it affects the overall tool call." + }, + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 4, + "threshold_high": 5, + "description": "Type/Format/Unit Alignment Score (1-5). Judge ONLY the serialized representation of present values against the spec's type/format/pattern/unit. A semantically wrong value with a correct format is NOT flagged here.\n\n- **5**: Every present value matches its type, format, pattern, and unit exactly; missing optional parameters are not format errors.\n- **4**: All values are fundamentally correct with at most a minor formatting quirk a lax parser accepts (e.g. missing leading zero).\n- **3**: One value has a partial format issue that may cause a strict parser to reject or misinterpret it.\n- **2**: Multiple significant format violations (wrong type, wrong unit family, wrong date/pattern shape) likely to fail at the API boundary.\n- **1**: At least one value is fundamentally incompatible with the spec (e.g. a free-form string where the spec demands a boolean; an incoherent pattern).\n\nPrefer 4-5 when the spec is ambiguous and values are plausible. Prefer 1-3 when the spec is clear and values deviate; rate by severity." + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." + }, + "correction": { + "type": "object", + "description": "If output >= 4, return {}. If output <= 3, provide parameter_issues and a corrected tool_call.", + "properties": { + "parameter_issues": { + "type": "array", + "description": "List of parameters with format issues. Leave empty if no issues were found.", + "items": { + "type": "object", + "properties": { + "parameter_name": { "type": "string", - "description": "Quote the specification's type/format definitions for each parameter and include the actual parameter values provided. Cite EXACT text from the tool specification that supports your judgment." - }, - "explanation": { + "description": "Name of the parameter with a format issue." + }, + "reason_types": { + "type": "array", + "description": "List of format issues identified. Use one or more of: TYPE_ERROR, FORMAT_ERROR, UNIT_ERROR, PATTERN_ERROR, CONSISTENCY_ERROR, OTHER.", + "items": { + "type": "string", + "enum": [ + "TYPE_ERROR", + "FORMAT_ERROR", + "UNIT_ERROR", + "PATTERN_ERROR", + "CONSISTENCY_ERROR", + "OTHER" + ] + } + }, + "reasons": { "type": "string", - "description": "For each parameter value, explain why it conforms or does not conform to the specification's requirements. Address data type, format, units, and any pattern or constraint violations. For incorrect values, describe exactly what is wrong and how it affects the overall tool call." 
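
For readers mapping findings to the 1-5 scale above, one plausible aggregation; the thresholds are our reading of the rubric, not code from this repository:

```python
def format_score(fatal: int, major: int, minor: int) -> int:
    """Collapse counts of format findings into the rubric's 1-5 scale."""
    if fatal:        # a value fundamentally incompatible with the spec
        return 1
    if major >= 2:   # multiple significant violations, likely API failure
        return 2
    if major == 1:   # one partial issue a strict parser may reject
        return 3
    return 4 if minor else 5
```
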
- }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "An integer from 1 to 5 indicating how well all parameter values conform to the required types, formats, and units as defined in the API specification.\n\nScore meanings:\n\n5 - Perfect Alignment: All parameter values exactly match the expected types, formats, and units. Example: all dates in 'YYYY-MM-DD', all numbers as integers, all strings properly formatted.\n\n4 - Minor Deviation: All values are fundamentally correct but some have small formatting issues (e.g., missing leading zeros) that are still likely to be accepted by the API or parser unless strict validation is enforced. Example: expected '2025-08-05', value is '2025-8-5'.\n\n3 - Moderate Deviation: Some parameter values partially match the expected format but are likely to be rejected or misinterpreted by automatic processing. Example: mix of correct and incorrectly formatted dates.\n\n2 - Major Deviation: Multiple parameter values significantly violate the expected type, format, or unit and are very likely to fail. Example: wrong data types, completely wrong formats.\n\n1 - Complete Mismatch: One or more parameter values are entirely incompatible with the required format or type. Example: expected boolean, value is 'maybe'; expected date, value is random text." + "description": "Short explanation of the specific issue(s) with the parameter's format, type, or unit." + }, + "corrected_value": { + "type": "object", + "description": "An object containing the corrected parameter value in the form: { \"\": }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.", + "additionalProperties": true + } }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." + "required": [ + "parameter_name", + "reason_types", + "reasons", + "corrected_value" + ] + } + }, + "tool_call": { + "type": "object", + "description": "Complete corrected tool call with all format issues fixed, including both modified and retained parameter values. Must be included if a full reconstruction is possible.", + "properties": { + "name": { + "type": "string", + "description": "Name of the function to call." }, - "correction": { - "type": "object", - "description": "If output >= 4, return {}. If output <= 3, provide parameter_issues and a corrected tool_call.", - "properties": { - "parameter_issues": { - "type": "array", - "description": "List of parameters with format issues. Leave empty if no issues were found.", - "items": { - "type": "object", - "properties": { - "parameter_name": { - "type": "string", - "description": "Name of the parameter with a format issue." - }, - "reason_types": { - "type": "array", - "description": "List of format issues identified. 
Use one or more of: TYPE_ERROR, FORMAT_ERROR, UNIT_ERROR, PATTERN_ERROR, CONSISTENCY_ERROR, OTHER.", - "items": { - "type": "string", - "enum": [ - "TYPE_ERROR", - "FORMAT_ERROR", - "UNIT_ERROR", - "PATTERN_ERROR", - "CONSISTENCY_ERROR", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Short explanation of the specific issue(s) with the parameter's format, type, or unit." - }, - "corrected_value": { - "type": "object", - "description": "An object containing the corrected parameter value in the form: { \"\": }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.", - "additionalProperties": true - } - }, - "required": [ - "parameter_name", - "reason_types", - "reasons", - "corrected_value" - ] - } - }, - "tool_call": { - "type": "object", - "description": "Complete corrected tool call with all format issues fixed, including both modified and retained parameter values. Must be included if a full reconstruction is possible.", - "properties": { - "name": { - "type": "string", - "description": "Name of the function to call." - }, - "arguments": { - "type": "object", - "description": "Set of corrected arguments for the function call, with all format issues resolved.", - "additionalProperties": true - } - }, - "required": [ - "name", - "arguments" - ] - } + "arguments": { + "type": "object", + "description": "Set of corrected arguments for the function call, with all format issues resolved.", + "additionalProperties": true + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [] + } + }, + "required": [ + "evidence", + "explanation", + "output", + "confidence", + "correction" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Book a flight to Paris on 2025-08-15 for 2 adults in economy class." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "book_flight", + "description": "Books a flight with specified parameters", + "parameters": { + "type": "object", + "properties": { + "destination": { + "type": "string", + "description": "Destination city" + }, + "departure_date": { + "type": "string", + "format": "date", + "description": "Date in YYYY-MM-DD format" }, - "required": [] + "passengers": { + "type": "integer", + "description": "Number of passengers" + }, + "class": { + "type": "string", + "enum": [ + "economy", + "business", + "first" + ], + "description": "Flight class" + } + }, + "required": [ + "destination", + "departure_date", + "passengers", + "class" + ] } - }, - "required": [ - "evidence", - "explanation", - "output", - "confidence", - "correction" - ] + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "book_flight", + "arguments": "{\"destination\": \"Paris\", \"departure_date\": \"2025-08-15\", \"passengers\": 2, \"class\": \"economy\"}" + } + } }, - "examples": [ + "output": { + "evidence": "Tool spec requires: destination (string), departure_date (string, YYYY-MM-DD format), passengers (integer), class (enum: economy/business/first). 
Values provided: 'Paris' (string), '2025-08-15' (string, YYYY-MM-DD), 2 (integer), 'economy' (valid enum).", + "explanation": "All parameter values perfectly conform to their specifications: destination is a proper string, departure_date follows the exact YYYY-MM-DD format, passengers is provided as an integer (not string), and class matches one of the required enum values. No format issues detected.", + "output": 5, + "confidence": 0.98, + "correction": {} + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Book a flight to Paris on 2025-08-15 for 2 adults in economy class." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "book_flight", - "description": "Books a flight with specified parameters", - "parameters": { - "type": "object", - "properties": { - "destination": { - "type": "string", - "description": "Destination city" - }, - "departure_date": { - "type": "string", - "format": "date", - "description": "Date in YYYY-MM-DD format" - }, - "passengers": { - "type": "integer", - "description": "Number of passengers" - }, - "class": { - "type": "string", - "enum": [ - "economy", - "business", - "first" - ], - "description": "Flight class" - } - }, - "required": [ - "destination", - "departure_date", - "passengers", - "class" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "book_flight", - "arguments": "{\"destination\": \"Paris\", \"departure_date\": \"2025-08-15\", \"passengers\": 2, \"class\": \"economy\"}" - } + "role": "user", + "content": "Set a timer for 5 minutes and schedule a meeting for tomorrow at 3 PM." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "set_timer_and_meeting", + "description": "Sets a timer and schedules a meeting", + "parameters": { + "type": "object", + "properties": { + "timer_duration": { + "type": "string", + "pattern": "^\\d+\\s+(minutes?|seconds?|hours?)$", + "description": "Duration in format 'number unit' (e.g., '5 minutes')" + }, + "meeting_date": { + "type": "string", + "format": "date", + "description": "Date in YYYY-MM-DD format" + }, + "meeting_time": { + "type": "string", + "pattern": "^([01]?[0-9]|2[0-3]):[0-5][0-9]$", + "description": "Time in HH:MM 24-hour format" } - }, - "output": { - "evidence": "Tool spec requires: destination (string), departure_date (string, YYYY-MM-DD format), passengers (integer), class (enum: economy/business/first). Values provided: 'Paris' (string), '2025-08-15' (string, YYYY-MM-DD), 2 (integer), 'economy' (valid enum).", - "explanation": "All parameter values perfectly conform to their specifications: destination is a proper string, departure_date follows the exact YYYY-MM-DD format, passengers is provided as an integer (not string), and class matches one of the required enum values. 
No format issues detected.", - "output": 5, - "confidence": 0.98, - "correction": {} + }, + "required": [ + "timer_duration", + "meeting_date", + "meeting_time" + ] + } + } + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "set_timer_and_meeting", + "arguments": "{\"timer_duration\": \"300\", \"meeting_date\": \"tomorrow\", \"meeting_time\": \"3 PM\"}" + } + } + }, + "output": { + "evidence": "Tool spec requires: timer_duration (string matching pattern '^\\d+\\s+(minutes?|seconds?|hours?)$'), meeting_date (string, YYYY-MM-DD format), meeting_time (string, HH:MM 24-hour format). Values provided: '300' (no unit, violates pattern), 'tomorrow' (not YYYY-MM-DD), '3 PM' (not 24-hour format).", + "explanation": "All three parameters have significant format violations: timer_duration '300' lacks the required unit and space (should be '5 minutes'), meeting_date 'tomorrow' doesn't follow YYYY-MM-DD format (should be '2025-08-07'), and meeting_time '3 PM' uses 12-hour format instead of required 24-hour HH:MM (should be '15:00'). These errors will likely cause API rejection.", + "output": 2, + "confidence": 0.92, + "correction": { + "parameter_issues": [ + { + "parameter_name": "timer_duration", + "reason_types": [ + "FORMAT_ERROR", + "UNIT_ERROR" + ], + "reasons": "Value '300' lacks required unit and space format, should be '5 minutes'", + "corrected_value": { + "timer_duration": "5 minutes" } + }, + { + "parameter_name": "meeting_date", + "reason_types": [ + "FORMAT_ERROR" + ], + "reasons": "Value 'tomorrow' doesn't follow required YYYY-MM-DD format", + "corrected_value": { + "meeting_date": "2025-08-07" + } + }, + { + "parameter_name": "meeting_time", + "reason_types": [ + "FORMAT_ERROR" + ], + "reasons": "Value '3 PM' uses 12-hour format instead of required 24-hour HH:MM", + "corrected_value": { + "meeting_time": "15:00" + } + } + ], + "tool_call": { + "name": "set_timer_and_meeting", + "arguments": { + "timer_duration": "5 minutes", + "meeting_date": "2025-08-07", + "meeting_time": "15:00" + } + } + } + } + } + ] + }, + { + "name": "general_conversation_grounded_correctness", + "task_description": "Evaluate the overall correctness and appropriateness of the tool call based SOLELY on the conversation context, WITHOUT requiring explicit tool specifications. If tool specifications appear in the conversation (e.g. in system prompts or assistant messages), you MAY use them; otherwise reason from the conversation alone.\n\n---\n\n### 1. Tool Call Appropriateness\n\n- **Intent Alignment**: does it address the user's request or a logical next step (including preliminary probes and exploratory lookups that precede a later mutating action)?\n- **Relevance**: is the chosen function plausibly useful for the current context?\n\n### 2. Conversation Grounding\n\nParameter values should be grounded in this priority order: system prompt > tool outputs > explicit user statements > confirmed assistant statements > reasonable inference. Tool outputs and system-prompt facts (policy, current date/year, identity) override conflicting assistant statements. Recognize these grounding paths:\n\n- **Explicit** \u2014 value is stated verbatim in a higher-priority source.\n- **Pick-one** \u2014 user enumerated several acceptable values; the agent picked one of them.\n- **Specialization** \u2014 user named a set/range/category; the agent picked a concrete value INSIDE that scope. 
Flag only when the concrete value lands outside the user's named scope.\n- **Reasonable inference** \u2014 a value that a careful reader of the conversation would derive without surprise (e.g. typical defaults like a party size of 1).\n\n### 3. Prerequisites\n\n- Explicit prerequisites the policy or prior tool messages document (e.g. authentication acquired, order id fetched before a mutation) must be satisfied or be the subject of this call itself.\n- Do NOT fabricate prerequisites not documented in the conversation.\n\n---\n\n### Rating Scale\n- **5**: fully appropriate, well-grounded, prerequisites satisfied (or this call is itself gathering a prerequisite).\n- **4**: well-grounded with only minor reasonable inferences.\n- **3**: generally appropriate but with weak grounding on some parameters or questionable timing.\n- **2**: multiple ungrounded parameters or a relevance issue.\n- **1**: clearly inappropriate, contradicts the conversation, or contains fabricated parameter values.\n\n---\n\n#### Required-Parameter Omission\n\nA parameter listed in the spec's `required` array that is completely absent from the arguments is ALWAYS MISSING_INFORMATION \u2014 no grounding path (explicit statement, documented default, pick-one, specialization, or plausible-optional) substitutes for actually providing a value. The grounding paths only apply to values the agent ACTUALLY PROVIDED; they never excuse omission of a required parameter.\n\n#### User-Context Discarded\n\nWhen a parameter is spec-optional but the user (or prior tool output / system prompt) provided an explicit narrowing value for it \u2014 a search term, a filter, a category, an identifier \u2014 and the agent OMITS the parameter or sets it to an \"empty\" sentinel (empty string, null, default wildcard), this is NOT a \"plausible optional value\". It is USER_CONTEXT_DISCARDED and should be labeled CONTRADICTORY_VALUES (the call contradicts explicit user intent). Score the call 2-3 accordingly. This rule applies regardless of whether the spec marks the parameter as required; it turns on whether the user supplied narrowing context for it.\n\n---\n\n#### Stringency\n\nApply a lenient bar when the call is information-gathering or exploratory (reads, lookups, probes) that is plausibly on-path. Apply a strict bar to the grounding of REQUIRED parameter values that drive a mutating action \u2014 those must be explicitly supported by the conversation, a prior tool output, a system-prompt anchor, or a documented default.\n\n#### Confirmation Scope\n\nExplicit user confirmation is required only before actions that change persistent state on behalf of the user (creating, modifying, or deleting records; sending messages; committing transactions). Inspect-only or information-gathering calls \u2014 reads, lookups, searches, descriptions, status checks \u2014 do NOT require prior user confirmation, even when a later mutating action in the same flow will.", + "jsonschema": { + "title": "general_conversation_grounded_correctness", + "description": "Holistic assessment of tool call correctness based on conversation grounding, appropriateness, and logical consistency without requiring explicit tool specifications.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Provide EXACT quotes from the conversation that support or contradict the tool call. 
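
The grounding priority this metric states (system prompt over tool outputs over user statements over assistant statements over inference) amounts to a first-match lookup. An illustrative sketch with invented source labels:

```python
from typing import Optional

# Highest priority first, per the rubric above.
PRIORITY = ["system_prompt", "tool_output", "user", "assistant", "inference"]

def resolve(candidates: dict) -> Optional[str]:
    """Pick the value offered by the highest-priority source."""
    for source in PRIORITY:
        if source in candidates:
            return candidates[source]
    return None

# A tool output overrides a conflicting assistant statement.
assert resolve({"assistant": "user 99", "tool_output": "user 42"}) == "user 42"
```
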
Include: (1) relevant user messages, (2) assistant messages, (3) prior tool outputs, and (4) any tool specifications mentioned in the conversation. Organize evidence by parameter and by aspect (appropriateness, grounding, prerequisites)." + }, + "explanation": { + "type": "string", + "description": "Provide an analysis covering: (1) Tool Call Appropriateness - why this tool call makes sense (or doesn't) in the conversation flow, (2) Parameter Grounding - for EACH parameter, explain its grounding source and quality, noting conflicts between sources and resolution, (3) Prerequisites - whether all necessary preconditions are satisfied, (4) Format and Consistency - whether parameters make logical sense together, (5) Overall Correctness - summary judgment on whether the tool call is fully justified by the conversation." + }, + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 4, + "threshold_high": 5, + "description": "Overall Call Correctness Score (1-5), without requiring explicit tool specs. Combine appropriateness (intent fit, on-trajectory sequencing) and grounding of present values (see the grounding paths above). Mid-trajectory probes plausibly on-path are NOT penalized for not, by themselves, completing the user's full goal.\n\n- **5**: Right action at the right time; every required value is grounded via an explicit source, a documented default, a pick-one, or an in-scope specialization; no redundancy; any documented prerequisite is satisfied or this call is itself gathering one.\n- **4**: Appropriate and timely, with most values explicitly grounded and a small number of in-scope inferences or abstract specializations the user clearly allowed.\n- **3**: Generally appropriate but with one weakly grounded required value or minor timing concern. The call might succeed but could reasonably be questioned by a careful reader.\n- **2**: Multiple weakly grounded required values OR a relevance issue (call appears off-path for the stated goal even after accounting for exploration).\n- **1**: Clearly inappropriate, redundant (SAME function + SAME arguments), fundamentally misaligned with the conversation, or contains fabricated required values (no grounding path applies)." + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Higher confidence when conversation context is clear and complete. Lower confidence when context is ambiguous, conversation is incomplete, or tool specifications are not available." + }, + "correction": { + "type": "object", + "description": "Corrections for the tool call. For correct calls (output >= 4), return {}. For problematic calls (output <= 3), provide either need_more_information OR corrected_tool_call.\n\nCRITICAL FORMAT REQUIREMENT: When providing corrected_tool_call, the 'arguments' field MUST be a JSON object/dict (e.g., {\"key\": \"value\"}), NOT a JSON string (e.g., NOT \"{\\\"key\\\": \\\"value\\\"}\").", + "properties": { + "need_more_information": { + "type": "array", + "description": "List of questions to ask the user if information is missing. Each question should be specific and actionable.", + "items": { + "type": "string" + } }, - { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Set a timer for 5 minutes and schedule a meeting for tomorrow at 3 PM." 
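
The correction schema above is emphatic that 'arguments' must be a JSON object rather than a JSON-encoded string. A small defensive normalizer; the helper name is invented:

```python
import json
from typing import Union

def ensure_arguments_dict(arguments: Union[str, dict]) -> dict:
    """Accept either form and always hand back a dict, per the schema's format requirement."""
    if isinstance(arguments, str):
        # e.g. '{"param1": "value1", "param2": 123}' -> {"param1": "value1", "param2": 123}
        return json.loads(arguments)
    return arguments

assert ensure_arguments_dict('{"param1": "value1", "param2": 123}') == {"param1": "value1", "param2": 123}
```
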
- } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "set_timer_and_meeting", - "description": "Sets a timer and schedules a meeting", - "parameters": { - "type": "object", - "properties": { - "timer_duration": { - "type": "string", - "pattern": "^\\d+\\s+(minutes?|seconds?|hours?)$", - "description": "Duration in format 'number unit' (e.g., '5 minutes')" - }, - "meeting_date": { - "type": "string", - "format": "date", - "description": "Date in YYYY-MM-DD format" - }, - "meeting_time": { - "type": "string", - "pattern": "^([01]?[0-9]|2[0-3]):[0-5][0-9]$", - "description": "Time in HH:MM 24-hour format" - } - }, - "required": [ - "timer_duration", - "meeting_date", - "meeting_time" - ] - } - } - } - ], - "tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "set_timer_and_meeting", - "arguments": "{\"timer_duration\": \"300\", \"meeting_date\": \"tomorrow\", \"meeting_time\": \"3 PM\"}" - } - } + "corrected_tool_call": { + "type": "object", + "description": "Complete corrected tool call with all issues addressed. Only provide if correction is possible without additional user input. IMPORTANT: The 'arguments' field must be a JSON object (dict), not a JSON string.", + "properties": { + "name": { + "type": "string", + "description": "Name of the function to call." }, - "output": { - "evidence": "Tool spec requires: timer_duration (string matching pattern '^\\d+\\s+(minutes?|seconds?|hours?)$'), meeting_date (string, YYYY-MM-DD format), meeting_time (string, HH:MM 24-hour format). Values provided: '300' (no unit, violates pattern), 'tomorrow' (not YYYY-MM-DD), '3 PM' (not 24-hour format).", - "explanation": "All three parameters have significant format violations: timer_duration '300' lacks the required unit and space (should be '5 minutes'), meeting_date 'tomorrow' doesn't follow YYYY-MM-DD format (should be '2025-08-07'), and meeting_time '3 PM' uses 12-hour format instead of required 24-hour HH:MM (should be '15:00'). These errors will likely cause API rejection.", - "output": 2, - "confidence": 0.92, - "correction": { - "parameter_issues": [ - { - "parameter_name": "timer_duration", - "reason_types": [ - "FORMAT_ERROR", - "UNIT_ERROR" - ], - "reasons": "Value '300' lacks required unit and space format, should be '5 minutes'", - "corrected_value": { - "timer_duration": "5 minutes" - } - }, - { - "parameter_name": "meeting_date", - "reason_types": [ - "FORMAT_ERROR" - ], - "reasons": "Value 'tomorrow' doesn't follow required YYYY-MM-DD format", - "corrected_value": { - "meeting_date": "2025-08-07" - } - }, - { - "parameter_name": "meeting_time", - "reason_types": [ - "FORMAT_ERROR" - ], - "reasons": "Value '3 PM' uses 12-hour format instead of required 24-hour HH:MM", - "corrected_value": { - "meeting_time": "15:00" - } - } - ], - "tool_call": { - "name": "set_timer_and_meeting", - "arguments": { - "timer_duration": "5 minutes", - "meeting_date": "2025-08-07", - "meeting_time": "15:00" - } - } - } + "arguments": { + "type": "object", + "description": "Corrected arguments for the function call as a JSON object (not a string). 
Example: {\"param1\": \"value1\", \"param2\": 123}", + "additionalProperties": true } + }, + "required": [ + "name", + "arguments" + ] } - ] + }, + "required": [] + } + }, + "required": [ + "evidence", + "explanation", + "output", + "confidence", + "correction" + ] }, - { - "name": "general_conversation_grounded_correctness", - "task_description": "Evaluate the overall correctness and appropriateness of the tool call based SOLELY on the conversation context, WITHOUT requiring explicit tool specifications. This metric assesses whether the tool call is a logical, non-redundant action that is fully grounded in the conversation history.\n\n**IMPORTANT**: If tool specifications appear in the conversation (e.g., in system prompts or assistant messages), you MAY use them to enhance your assessment. However, your primary focus should be on conversation grounding.\n\n---\n\n### 1. Tool Call Appropriateness\n\nAssess whether the tool call makes sense given the conversation flow:\n\n- **Intent Alignment**: Does the tool call directly address the user's request or a logical next step?\n- **Non-Redundancy**: Is this tool call necessary, or has equivalent information already been obtained?\n- **Timing**: Are there prerequisite actions that should have been completed first?\n- **Relevance**: Is the chosen function appropriate for the current conversation context?\n\n---\n\n### 2. Conversation Grounding Hierarchy\n\nParameter values should be grounded in the following priority order:\n\n1. **Tool Outputs** (highest priority): Values from previous tool call results\n2. **Explicit User Statements**: Direct mentions in user messages\n3. **Confirmed Assistant Statements**: Information the assistant provided that the user acknowledged\n4. **Logical Inference**: Values that can be reasonably inferred from context (use cautiously)\n\n**Key Principle**: When there's a conflict between sources, tool outputs take precedence over user/assistant statements, as they represent ground truth from the system.\n\n---\n\n### 3. Parameter Value Assessment\n\nFor each parameter in the tool call:\n\n- **Explicit Grounding**: Is the value directly stated in the conversation?\n- **Implicit Grounding**: Can the value be reasonably inferred from context?\n- **Assumption Detection**: Is the value assumed without any conversational basis?\n- **Conflict Resolution**: If multiple sources provide different values, is the most authoritative source used?\n\n---\n\n### 4. Format and Consistency (Conversation-Based)\n\nEven without formal specifications:\n\n- **Internal Consistency**: Do parameter values make logical sense together (e.g., end_date after start_date)?\n- **Observable Patterns**: If tool specifications are mentioned in the conversation, do values conform to described patterns?\n- **Common Sense Validation**: Do values follow basic logical constraints (e.g., no negative quantities for counts)?\n\n---\n\n### 5. 
Prerequisite Satisfaction\n\nCheck if the tool call has all necessary preconditions:\n\n- **Dependent Information**: Is all required information available from the conversation or prior tool calls?\n- **State Requirements**: Based on conversation context, is the system in the right state for this call?\n- **Missing Steps**: Are there obvious gaps in the conversation that should be addressed first?\n\n---\n\n### Correctness Rating Scale\n\n- **5 - Fully Correct**: Tool call is perfectly appropriate, all parameters are explicitly or clearly inferably grounded in conversation, no redundancy, all prerequisites satisfied\n- **4 - Mostly Correct**: Tool call is appropriate and well-grounded, with only minor implicit inferences that are very reasonable\n- **3 - Partially Correct**: Tool call is generally appropriate but has some parameters with weak grounding or minor timing/relevance issues\n- **2 - Problematic**: Tool call has significant issues—multiple ungrounded parameters, questionable relevance, or missing prerequisites\n- **1 - Incorrect**: Tool call is inappropriate, redundant, or has parameters that are clearly fabricated or contradict conversation context\n\n---\n\n### Evaluation Guidelines\n\n- **Conservative Assessment**: When grounding is unclear, prefer lower scores\n- **Prioritize Tool Outputs**: Always favor values from tool results over human statements\n- **Consider Conversation Flow**: Assess whether the tool call makes sense in the sequence of events\n- **Watch for Hallucinations**: Flag any parameters that appear without conversational basis\n- **Use Available Specs**: If tool specifications appear in the conversation, incorporate them into your assessment\n\nYour evaluation should reflect a holistic view of whether a human reviewing the conversation would consider this tool call to be correct, appropriate, and fully justified by the available information.", - "jsonschema": { - "title": "general_conversation_grounded_correctness", - "description": "Holistic assessment of tool call correctness based on conversation grounding, appropriateness, and logical consistency without requiring explicit tool specifications.", - "type": "object", - "additionalProperties": false, - "properties": { - "evidence": { - "type": "string", - "description": "Provide EXACT quotes from the conversation that support or contradict the tool call. Include: (1) relevant user messages, (2) assistant messages, (3) prior tool outputs, and (4) any tool specifications mentioned in the conversation. Organize evidence by parameter and by aspect (appropriateness, grounding, prerequisites)." - }, - "explanation": { - "type": "string", - "description": "Provide an analysis covering: (1) Tool Call Appropriateness - why this tool call makes sense (or doesn't) in the conversation flow, (2) Parameter Grounding - for EACH parameter, explain its grounding source and quality, noting conflicts between sources and resolution, (3) Prerequisites - whether all necessary preconditions are satisfied, (4) Format and Consistency - whether parameters make logical sense together, (5) Overall Correctness - summary judgment on whether the tool call is fully justified by the conversation." - }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "Overall Correctness Score (1-5):\n\n5 - Fully Correct: Tool call is the right action at the right time. 
All parameters are explicitly grounded in conversation (user messages, confirmed assistant statements, or tool outputs). No redundancy. All prerequisites satisfied. If tool specs are in conversation, the call aligns with them.\nExample: User asks \"What's the weather in Boston?\", assistant calls get_weather(location=\"Boston\") with no prior weather call for Boston.\n\n4 - Mostly Correct: Tool call is appropriate and timely. Most parameters are explicitly grounded, a few may be reasonably inferred from clear context. No significant issues. Prerequisites satisfied.\nExample: User says \"Book a table for dinner tonight\", assistant calls book_restaurant(date=\"2025-12-17\", time=\"19:00\", party_size=2) where date is inferred from 'tonight' and party_size from typical default.\n\n3 - Partially Correct: Tool call is generally appropriate but has noticeable issues—some parameters are weakly grounded, minor relevance concerns, or unclear if prerequisites are met. The call might work but has questionable elements.\nExample: User says \"I need a hotel\", assistant calls book_hotel(location=\"New York\", check_in=\"2025-12-20\") where location and dates aren't mentioned in conversation.\n\n2 - Problematic: Tool call has significant correctness issues—multiple parameters lack grounding, the timing is questionable, important prerequisites are missing, or the call seems partially redundant. Likely to fail or produce incorrect results.\nExample: User asks \"What flights are available?\", but assistant calls book_flight(...) with specific dates and destinations not discussed, skipping the search step.\n\n1 - Incorrect: Tool call is inappropriate, clearly redundant, or fundamentally misaligned with conversation. Parameters are fabricated or contradict known information. Missing critical prerequisites. The call should not be made.\nExample: User asks about restaurant recommendations, assistant calls cancel_reservation(reservation_id=\"12345\") with no prior discussion of cancellations or that ID." - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Higher confidence when conversation context is clear and complete. Lower confidence when context is ambiguous, conversation is incomplete, or tool specifications are not available." - }, - "correction": { - "type": "object", - "description": "Corrections for the tool call. For correct calls (output >= 4), return {}. For problematic calls (output <= 3), provide either need_more_information OR corrected_tool_call.\n\nCRITICAL FORMAT REQUIREMENT: When providing corrected_tool_call, the 'arguments' field MUST be a JSON object/dict (e.g., {\"key\": \"value\"}), NOT a JSON string (e.g., NOT \"{\\\"key\\\": \\\"value\\\"}\").", - "properties": { - "need_more_information": { - "type": "array", - "description": "List of questions to ask the user if information is missing. Each question should be specific and actionable.", - "items": { - "type": "string" - } - }, - "corrected_tool_call": { - "type": "object", - "description": "Complete corrected tool call with all issues addressed. Only provide if correction is possible without additional user input. IMPORTANT: The 'arguments' field must be a JSON object (dict), not a JSON string.", - "properties": { - "name": { - "type": "string", - "description": "Name of the function to call." 
- }, - "arguments": { - "type": "object", - "description": "Corrected arguments for the function call as a JSON object (not a string). Example: {\"param1\": \"value1\", \"param2\": 123}", - "additionalProperties": true - } - }, - "required": ["name", "arguments"] - } - }, - "required": [] - } + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "What's the weather like in Boston?" + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"location\": \"Boston\"}" + } + } + }, + "output": { + "evidence": "User message: \"What's the weather like in Boston?\"\nTool call parameter: location=\"Boston\"\nNo prior tool calls in conversation.", + "explanation": "Tool Call Appropriateness: The tool call directly addresses the user's explicit request for weather information. It's timely, relevant, and non-redundant.\n\nParameter Grounding: The 'location' parameter value \"Boston\" is explicitly mentioned in the user's question, providing perfect grounding.\n\nPrerequisites: No prerequisites are needed for a weather query beyond having the location, which is provided.\n\nFormat and Consistency: The location parameter is a simple string matching the user's input, internally consistent.\n\nOverall Correctness: This tool call is perfectly appropriate and fully grounded in the conversation.", + "output": 5, + "confidence": 0.98, + "correction": {} + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "I want to book a flight." }, - "required": [ - "evidence", - "explanation", - "output", - "confidence", - "correction" - ] + { + "role": "assistant", + "content": "I'd be happy to help you book a flight. Could you please provide the departure city, destination, and travel dates?" + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "book_flight", + "arguments": "{\"departure\": \"New York\", \"destination\": \"Los Angeles\", \"date\": \"2025-12-25\"}" + } + } }, - "examples": [ + "output": { + "evidence": "User message: \"I want to book a flight.\"\nAssistant message: \"Could you please provide the departure city, destination, and travel dates?\"\nTool call parameters: departure=\"New York\", destination=\"Los Angeles\", date=\"2025-12-25\"\nNo user response providing the requested information.", + "explanation": "Tool Call Appropriateness: The tool call is premature and inappropriate. The assistant explicitly asked the user for departure city, destination, and dates, but the user has not yet provided this information. Making the booking call now is incorrect.\n\nParameter Grounding: All three parameters (departure, destination, date) are completely ungrounded. They do not appear anywhere in the conversation and represent pure hallucination. The user only expressed a general intent to book a flight.\n\nPrerequisites: The critical prerequisite\u2014obtaining the necessary booking information from the user\u2014has not been satisfied. The assistant asked for this information but didn't wait for the response.\n\nFormat and Consistency: While the parameters themselves are internally consistent (valid city names and date format), they have no basis in reality for this conversation.\n\nOverall Correctness: This tool call is fundamentally incorrect and should not be made. 
The system should wait for user input before proceeding.", + "output": 1, + "confidence": 0.95, + "correction": { + "need_more_information": [ + "What city will you be departing from?", + "What is your destination?", + "What date would you like to travel?" + ] + } + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "What's the weather like in Boston?" - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{\"location\": \"Boston\"}" - } - } - }, - "output": { - "evidence": "User message: \"What's the weather like in Boston?\"\nTool call parameter: location=\"Boston\"\nNo prior tool calls in conversation.", - "explanation": "Tool Call Appropriateness: The tool call directly addresses the user's explicit request for weather information. It's timely, relevant, and non-redundant.\n\nParameter Grounding: The 'location' parameter value \"Boston\" is explicitly mentioned in the user's question, providing perfect grounding.\n\nPrerequisites: No prerequisites are needed for a weather query beyond having the location, which is provided.\n\nFormat and Consistency: The location parameter is a simple string matching the user's input, internally consistent.\n\nOverall Correctness: This tool call is perfectly appropriate and fully grounded in the conversation.", - "output": 5, - "confidence": 0.98, - "correction": {} - } + "role": "user", + "content": "Get me the user profile for the logged-in user." }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "I want to book a flight." - }, - { - "role": "assistant", - "content": "I'd be happy to help you book a flight. Could you please provide the departure city, destination, and travel dates?" - } - ], - "tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "book_flight", - "arguments": "{\"departure\": \"New York\", \"destination\": \"Los Angeles\", \"date\": \"2025-12-25\"}" - } - } - }, - "output": { - "evidence": "User message: \"I want to book a flight.\"\nAssistant message: \"Could you please provide the departure city, destination, and travel dates?\"\nTool call parameters: departure=\"New York\", destination=\"Los Angeles\", date=\"2025-12-25\"\nNo user response providing the requested information.", - "explanation": "Tool Call Appropriateness: The tool call is premature and inappropriate. The assistant explicitly asked the user for departure city, destination, and dates, but the user has not yet provided this information. Making the booking call now is incorrect.\n\nParameter Grounding: All three parameters (departure, destination, date) are completely ungrounded. They do not appear anywhere in the conversation and represent pure hallucination. The user only expressed a general intent to book a flight.\n\nPrerequisites: The critical prerequisite—obtaining the necessary booking information from the user—has not been satisfied. The assistant asked for this information but didn't wait for the response.\n\nFormat and Consistency: While the parameters themselves are internally consistent (valid city names and date format), they have no basis in reality for this conversation.\n\nOverall Correctness: This tool call is fundamentally incorrect and should not be made. 
The system should wait for user input before proceeding.", - "output": 1, - "confidence": 0.95, - "correction": { - "need_more_information": [ - "What city will you be departing from?", - "What is your destination?", - "What date would you like to travel?" - ] - } + "role": "assistant", + "content": null, + "tool_calls": [ + { + "id": "call_001", + "type": "function", + "function": { + "name": "get_current_user_id", + "arguments": "{}" + } } + ] }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Get me the user profile for the logged-in user." - }, - { - "role": "assistant", - "content": null, - "tool_calls": [ - { - "id": "call_001", - "type": "function", - "function": { - "name": "get_current_user_id", - "arguments": "{}" - } - } - ] - }, - { - "role": "tool", - "tool_call_id": "call_001", - "content": "{\"user_id\": 42, \"username\": \"john_doe\"}" - }, - { - "role": "assistant", - "content": "I found that you're logged in as john_doe (user ID: 42). Let me get your full profile now." - } - ], - "tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "get_user_profile", - "arguments": "{\"user_id\": 42}" - } - } - }, - "output": { - "evidence": "User message: \"Get me the user profile for the logged-in user.\"\nPrior tool call: get_current_user_id()\nTool output: {\"user_id\": 42, \"username\": \"john_doe\"}\nAssistant confirmation: \"you're logged in as john_doe (user ID: 42)\"\nCurrent tool call: get_user_profile(user_id=42)", - "explanation": "Tool Call Appropriateness: The tool call is perfectly appropriate and represents the logical next step. The user requested the profile for the logged-in user. The assistant first identified the user (ID: 42) via a tool call, confirmed this with the user, and is now proceeding to fetch the full profile.\n\nParameter Grounding: The 'user_id' parameter value of 42 is grounded in the tool output from get_current_user_id. This is the highest-quality grounding source (tool output), taking precedence over any human statements.\n\nPrerequisites: All prerequisites are satisfied—the user ID was obtained through a prior tool call, which is the proper way to get this information before fetching the profile.\n\nFormat and Consistency: The user_id is an integer (42) which is appropriate for an ID field. It's consistent with the value returned from the authentication check.\n\nOverall Correctness: This tool call demonstrates excellent conversation flow—proper sequencing, authoritative grounding from tool output, and clear alignment with user intent.", - "output": 5, - "confidence": 0.97, - "correction": {} - } + "role": "tool", + "tool_call_id": "call_001", + "content": "{\"user_id\": 42, \"username\": \"john_doe\"}" }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Book a table for dinner tonight at 7 PM." - } - ], - "tool_call": { - "id": "call_003", - "type": "function", - "function": { - "name": "book_restaurant", - "arguments": "{\"date\": \"2025-12-17\", \"time\": \"19:00\", \"party_size\": 2}" - } - } - }, - "output": { - "evidence": "User message: \"Book a table for dinner tonight at 7 PM.\"\nTool call parameters: date=\"2025-12-17\", time=\"19:00\", party_size=2\nCurrent date context: December 17, 2025\nNo mention of restaurant name or party size in conversation.", - "explanation": "Tool Call Appropriateness: The tool call is generally appropriate as it responds to the user's booking request. 
However, it's missing a critical piece of information—which restaurant to book.\n\nParameter Grounding:\n- 'date': The value \"2025-12-17\" is a reasonable inference from \"tonight\" given the current date context. This is a clear implicit inference (score: 4 level).\n- 'time': The value \"19:00\" (7 PM in 24-hour format) is explicitly grounded in the user's statement \"7 PM\". This is well-grounded.\n- 'party_size': The value 2 is an assumption with no basis in the conversation. The user didn't mention how many people. This is a hallucination.\n\nPrerequisites: The restaurant name/location is missing, which is typically a required prerequisite for booking. The user should specify which restaurant.\n\nFormat and Consistency: The parameters are internally consistent (date is today, time is evening, party size is positive), but the party_size lacks grounding.\n\nOverall Correctness: The tool call has mixed quality—some parameters are well-grounded while others are assumed. The missing restaurant information is also problematic.", - "output": 3, - "confidence": 0.85, - "correction": { - "need_more_information": [ - "Which restaurant would you like to book?", - "How many people will be dining?" - ] - } - } + "role": "assistant", + "content": "I found that you're logged in as john_doe (user ID: 42). Let me get your full profile now." + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "get_user_profile", + "arguments": "{\"user_id\": 42}" + } + } + }, + "output": { + "evidence": "User message: \"Get me the user profile for the logged-in user.\"\nPrior tool call: get_current_user_id()\nTool output: {\"user_id\": 42, \"username\": \"john_doe\"}\nAssistant confirmation: \"you're logged in as john_doe (user ID: 42)\"\nCurrent tool call: get_user_profile(user_id=42)", + "explanation": "Tool Call Appropriateness: The tool call is perfectly appropriate and represents the logical next step. The user requested the profile for the logged-in user. The assistant first identified the user (ID: 42) via a tool call, confirmed this with the user, and is now proceeding to fetch the full profile.\n\nParameter Grounding: The 'user_id' parameter value of 42 is grounded in the tool output from get_current_user_id. This is the highest-quality grounding source (tool output), taking precedence over any human statements.\n\nPrerequisites: All prerequisites are satisfied\u2014the user ID was obtained through a prior tool call, which is the proper way to get this information before fetching the profile.\n\nFormat and Consistency: The user_id is an integer (42) which is appropriate for an ID field. It's consistent with the value returned from the authentication check.\n\nOverall Correctness: This tool call demonstrates excellent conversation flow\u2014proper sequencing, authoritative grounding from tool output, and clear alignment with user intent.", + "output": 5, + "confidence": 0.97, + "correction": {} + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Book a table for dinner tonight at 7 PM." 
+ } + ], + "tool_call": { + "id": "call_003", + "type": "function", + "function": { + "name": "book_restaurant", + "arguments": "{\"date\": \"2025-12-17\", \"time\": \"19:00\", \"party_size\": 2}" + } + } + }, + "output": { + "evidence": "User message: \"Book a table for dinner tonight at 7 PM.\"\nTool call parameters: date=\"2025-12-17\", time=\"19:00\", party_size=2\nCurrent date context: December 17, 2025\nNo mention of restaurant name or party size in conversation.", + "explanation": "Tool Call Appropriateness: The tool call is generally appropriate as it responds to the user's booking request. However, it's missing a critical piece of information\u2014which restaurant to book.\n\nParameter Grounding:\n- 'date': The value \"2025-12-17\" is a reasonable inference from \"tonight\" given the current date context. This is a clear implicit inference (score: 4 level).\n- 'time': The value \"19:00\" (7 PM in 24-hour format) is explicitly grounded in the user's statement \"7 PM\". This is well-grounded.\n- 'party_size': The value 2 is an assumption with no basis in the conversation. The user didn't mention how many people. This is a hallucination.\n\nPrerequisites: The restaurant name/location is missing, which is typically a required prerequisite for booking. The user should specify which restaurant.\n\nFormat and Consistency: The parameters are internally consistent (date is today, time is evening, party size is positive), but the party_size lacks grounding.\n\nOverall Correctness: The tool call has mixed quality\u2014some parameters are well-grounded while others are assumed. The missing restaurant information is also problematic.", + "output": 3, + "confidence": 0.85, + "correction": { + "need_more_information": [ + "Which restaurant would you like to book?", + "How many people will be dining?" + ] + } + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "I need a reservation at The French Bistro for 2 people tomorrow at 7 PM." }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "I need a reservation at The French Bistro for 2 people tomorrow at 7 PM." - }, - { - "role": "assistant", - "content": "I'll book a table for you at The French Bistro." 
- } - ], - "tool_call": { - "id": "call_004", - "type": "function", - "function": { - "name": "book_restaurant", - "arguments": "{\"restaurant_name\": \"The French Bistro\", \"date\": \"2025-12-18\", \"time\": \"19:00\", \"party_size\": 4, \"dietary_restrictions\": [\"vegetarian\"]}" - } - } - }, - "output": { - "evidence": "User message: \"I need a reservation at The French Bistro for 2 people tomorrow at 7 PM.\"\nTool call parameters: restaurant_name=\"The French Bistro\", date=\"2025-12-18\", time=\"19:00\", party_size=4, dietary_restrictions=[\"vegetarian\"]\nCurrent date context: December 17, 2025", - "explanation": "Tool Call Appropriateness: The tool call is appropriate in intent but has parameter errors.\n\nParameter Grounding:\n- 'restaurant_name': \"The French Bistro\" is explicitly grounded.\n- 'date': \"2025-12-18\" is correctly inferred from \"tomorrow\".\n- 'time': \"19:00\" is explicitly grounded in \"7 PM\".\n- 'party_size': Value is 4, but user explicitly said \"2 people\" - this is a contradiction.\n- 'dietary_restrictions': [\"vegetarian\"] is completely ungrounded - user never mentioned dietary restrictions.\n\nOverall Correctness: Significant issues with party_size contradiction and hallucinated dietary_restrictions.", - "output": 2, - "confidence": 0.9, - "correction": { - "corrected_tool_call": { - "name": "book_restaurant", - "arguments": { - "restaurant_name": "The French Bistro", - "date": "2025-12-18", - "time": "19:00", - "party_size": 2 - } - } - } - } + "role": "assistant", + "content": "I'll book a table for you at The French Bistro." + } + ], + "tool_call": { + "id": "call_004", + "type": "function", + "function": { + "name": "book_restaurant", + "arguments": "{\"restaurant_name\": \"The French Bistro\", \"date\": \"2025-12-18\", \"time\": \"19:00\", \"party_size\": 4, \"dietary_restrictions\": [\"vegetarian\"]}" + } + } + }, + "output": { + "evidence": "User message: \"I need a reservation at The French Bistro for 2 people tomorrow at 7 PM.\"\nTool call parameters: restaurant_name=\"The French Bistro\", date=\"2025-12-18\", time=\"19:00\", party_size=4, dietary_restrictions=[\"vegetarian\"]\nCurrent date context: December 17, 2025", + "explanation": "Tool Call Appropriateness: The tool call is appropriate in intent but has parameter errors.\n\nParameter Grounding:\n- 'restaurant_name': \"The French Bistro\" is explicitly grounded.\n- 'date': \"2025-12-18\" is correctly inferred from \"tomorrow\".\n- 'time': \"19:00\" is explicitly grounded in \"7 PM\".\n- 'party_size': Value is 4, but user explicitly said \"2 people\" - this is a contradiction.\n- 'dietary_restrictions': [\"vegetarian\"] is completely ungrounded - user never mentioned dietary restrictions.\n\nOverall Correctness: Significant issues with party_size contradiction and hallucinated dietary_restrictions.", + "output": 2, + "confidence": 0.9, + "correction": { + "corrected_tool_call": { + "name": "book_restaurant", + "arguments": { + "restaurant_name": "The French Bistro", + "date": "2025-12-18", + "time": "19:00", + "party_size": 2 + } } - ] - } -] \ No newline at end of file + } + } + } + ] + } +] diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection.py b/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection.py index f28ce826..05eb96e5 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection.py +++ b/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection.py 
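Aside on the hunk below: it prepends a `{{ common_principles }}` placeholder to the `_function_system` prompt template, so shared judging principles render ahead of each metric's task description. Assuming Jinja2-style substitution (suggested by the `{{ ... }}` syntax; the actual rendering code is not shown in this patch), composition works roughly like this sketch:

```python
from jinja2 import Template

# Mirrors the template shape after the change below; the variable values
# here are illustrative, not taken from the patch.
_function_system = (
    "{{ common_principles }}\n\n"
    "### Task Description:\n\n"
    "{{ task_description }}\n\n"
)

rendered = Template(_function_system).render(
    common_principles="Judge only function selection; ignore parameter-value mistakes.",
    task_description="Evaluate whether the selected function is an appropriate next step.",
)
print(rendered)
```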
@@ -3,6 +3,7 @@ ) _function_system = ( + "{{ common_principles }}\n\n" "### Task Description:\n\n" "{{ task_description }}\n\n" "Your output must conform to the following JSON schema, in the same order as the fields appear in the schema:\n" diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection_metrics.json b/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection_metrics.json index a4f92814..34aa3e08 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection_metrics.json +++ b/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection_metrics.json @@ -1,599 +1,633 @@ [ - { - "name": "function_selection_appropriateness", - "task_description": "You are an evaluator assessing whether a specific function is an appropriate next step in a user-assistant conversation.\n\nTASK: Determine if the selected function aligns with the user's current request and is justified by the conversation history and tool inventory.\nIMPORTANT: Evaluate only the function selection appropriateness. Do not assess the function's execution, parameter values correctness, or any other aspect.\n\nEVALUATION CRITERIA:\n\n1. EVIDENCE SOURCES:\n- Use only the conversation history (system prompts, user and assistant turns, prior tool calls and outputs).\n- Use only the tool inventory, which includes function names and descriptions.\n- Do not infer capabilities, tool logic, or user goals beyond what is explicitly provided.\n\n2. DIALOGUE CONTEXT:\n- Conversations alternate between user and assistant.\n- Assistant turns may include multiple tool calls.\n- Tool calls may be:\n - INDEPENDENT: order does not matter\n - CHAINED: later calls depend on earlier ones\n- Consider the evolving context of the user's goal.\n\n3. APPROPRIATENESS DEFINITION:\nA function is appropriate if:\n- It addresses the user's stated or implied request.\n- It contributes meaningfully to the immediate goal.\n- It aligns with the documented function description.\n- It is not redundant with earlier tool calls (e.g. a duplication of the same tool call with the same parameters that is unnecessary).\n\nNote: More than one function may be appropriate. If this call satisfies a relevant part of the user's request, it is valid.\n\n4. WHEN INAPPROPRIATE:\nFlag the function as inappropriate only if:\n- A better function exists in the inventory\n- No valid function matches the request\n- The call is redundant or unnecessary\n\nProvide a corrected tool call if applicable, or use a placeholder such as:\n- {\"name\": \"no_function\"}\n- {\"name\": \"no_function_redundant\"}\n\n5. CONSERVATIVE JUDGMENT:\n- Do not assume tool behavior not stated in descriptions.\n- Do not reference tools or logic outside the inventory.\n- Do not invent user intent.\n- Do not penalize a valid call just because alternatives exist.\n\nOnly mark the function as inappropriate if clear, explicit evidence from the conversation or tool descriptions supports that judgment.\nOnly evaluate the function selection - e.g. 
if this is the right function name, but the parameter values are incorrect, mark it as a correct function selection with a maximum score - you MUST ignore parameter values mistakes.", - "jsonschema": { - "title": "function_selection_appropriateness", - "description": "Assessment of whether the selected function is an appropriate next step based on the user's request and the provided tool descriptions.", + { + "name": "function_selection_appropriateness", + "task_description": "You are evaluating whether the selected function is an appropriate next step. Focus ONLY on function selection \u2014 do NOT grade parameter values or execution correctness.\n\n---\n\n### Appropriateness\n\nA function is appropriate if it:\n1. Addresses the user's stated or implied request.\n2. Contributes meaningfully to the immediate goal OR sets up a subsequent call. Preliminary probes, exploratory searches, and information-gathering calls that precede a later mutation are valid.\n3. Aligns with its documented function description.\n\nMultiple functions may be appropriate. If this call satisfies a relevant part of the user's request \u2014 or is a reasonable probe toward it \u2014 it is valid.\n\n---\n\n### When Inappropriate\n\nFlag inappropriate only when:\n- No function in the inventory can serve the user's request.\n- The call repeats a prior call with the SAME function AND SAME arguments AND the prior result is still valid.\n- The call is clearly harmful (mutates state against user intent) or semantically unrelated to the conversation.\n\nDo NOT flag as inappropriate merely because a different function MIGHT also work, because the call is preliminary, or because the agent is exploring alternatives after earlier empty/errored results.\n\nWhen rejecting, provide a corrected tool-call hint or a placeholder like `{\"name\": \"no_function\"}` or `{\"name\": \"no_function_redundant\"}`.\n\n---\n\n#### Stringency\n\nJudge leniently when the call is information-gathering or exploratory \u2014 reads, lookups, searches, descriptions, probes \u2014 and it is plausibly on-path toward the user's stated or implied goal. Reserve rejection (output 1-2) for calls that are clearly wrong given the inventory, semantically unrelated to the conversation, or that violate an explicit documented constraint. Do not reject a call solely because a different function might also have worked.\n\n---\n\n#### Actionable Recommendations\n\nWhen the call reveals a fixable gap in the agent's prompts or tool specifications, emit one or more `actionable_recommendations` as unified-diff patches. Each recommendation specifies: `target` (which artifact to change: `system_prompt`, `tool_description`, `parameter_description`, `parameter_examples`), an optional `tool_name` / `parameter_name` for tool-scoped targets, a `diff` in unified-diff format, a short `rationale`, and an `importance` score in [0, 1] (0 = cosmetic, 1 = critical). When the call has no fixable artifact gap \u2014 or the call is already correct \u2014 return an empty array.", + "jsonschema": { + "title": "function_selection_appropriateness", + "description": "Assessment of whether the selected function is an appropriate next step based on the user's request and the provided tool descriptions.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Cite exact quotes from the conversation history or tool inventory that support your rating. 
Do not paraphrase or interpret; include only directly quoted material that supports or challenges the function's appropriateness."
+ },
+ "explanation": {
+ "type": "string",
+ "description": "Explain why this function call does or does not match the user's immediate intent. Refer to specific conversation turns and tool descriptions. If inappropriate, state whether a better function exists, the function is unnecessary, or none in the inventory are suitable."
+ },
+ "output": {
+ "type": "integer",
+ "minimum": 1,
+ "maximum": 5,
+ "threshold_low": 3,
+ "threshold_high": 5,
+ "description": "Function Selection Appropriateness Score (1-5). Judge ONLY whether the picked function is a reasonable next step. Do NOT grade its arguments.\n\n- **5**: The function directly addresses the user's stated or implied goal given the inventory. No redundancy with prior calls.\n- **4**: The function is a reasonable next step \u2014 including a read-only probe, a discovery lookup, or a prerequisite-gathering call on the path to a later mutating action. The connection to the user's goal is clear even if indirect.\n- **3**: The function's connection to the goal is ambiguous \u2014 another function in the inventory might be a better fit, but this one is not clearly wrong and could contribute to progress.\n- **2**: The function likely does not help; a different function in the inventory is an obvious better match for the user's stated request, AND the call is not a reasonable exploratory probe.\n- **1**: The function is unrelated to the user's request, clearly redundant with a prior SAME-function-SAME-arguments call whose result is still valid, or clearly harmful (a mutation that contradicts user intent)."
+ },
+ "confidence": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1,
+ "threshold_low": 0,
+ "threshold_high": 1,
+ "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
+ },
+ "correction": {
+ "type": "object",
+ "description": "Required when there are issues. Leave empty ({}) when there are no issues. For invalid function calls, specify the issue(s), explanation, and corrected tool call or placeholder.",
+ "properties": {
+ "reason_types": {
+ "type": "array",
+ "description": "Categories of issues with this function selection. Use one or more of: IRRELEVANT_FUNCTION, BETTER_FUNCTION_EXISTS, MISSING_FUNCTION, REDUNDANT_CALL, OTHER.",
+ "items": {
+ "type": "string",
+ "enum": [
+ "IRRELEVANT_FUNCTION",
+ "BETTER_FUNCTION_EXISTS",
+ "MISSING_FUNCTION",
+ "REDUNDANT_CALL",
+ "OTHER"
+ ]
+ }
+ },
+ "reasons": {
+ "type": "string",
+ "description": "Brief justification for why this function is inappropriate. State which criteria it violates and why."
+ },
+ "corrected_function_name": {
+ "type": "string",
+ "description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary."
+ }
+ },
+ "required": []
+ },
+ "actionable_recommendations": {
+ "type": "array",
+ "description": "List of actionable recommendations as unified-diff patches. Emit only when the call reveals a fixable gap in the agent's prompts or tool specifications. 
Empty array when there is nothing to fix (including when the call is already correct).", + "items": { "type": "object", "additionalProperties": false, "properties": { - "evidence": { - "type": "string", - "description": "Cite exact quotes from the conversation history or tool inventory that support your rating. Do not paraphrase or interpret-include only directly quoted material that supports or challenges the function's appropriateness." - }, - "explanation": { - "type": "string", - "description": "Explain why this function call does or does not match the user's immediate intent. Refer to specific conversation turns and tool descriptions. If inappropriate, state whether a better function exists, the function is unnecessary, or none in the inventory are suitable." - }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 3, - "threshold_high": 5, - "description": "Function Selection Appropriateness Score (1-5):\n\n- 5: Clearly appropriate and well-justified\n The selected function directly aligns with the user's request, is grounded in the conversation history, and matches the documented purpose in the tool inventory.\n Example:\n User: \"Can you check the weather in Rome tomorrow?\"\nSelected function: get_weather(city=\"Rome\", date=\"2025-08-06\")\n\n- 4: Reasonable choice with minor gaps\n The function is a plausible and helpful next step based on the conversation, though some details (e.g., goal inference or chaining rationale) may not be fully explicit. Example:\n User: \"What's my email address?\"\n Selected function: get_user_id()\n (A precondition for get_user_email(user_id=...) is this get_user_id())\n\n- 3: Ambiguous or weakly justified\n The function may relate to the user's goal, but there is not enough evidence in the conversation or function description to clearly support it.\n Example:\n User: \"I'm planning a trip to Tokyo.\"\n Selected function: get_weather(city=\"Tokyo\")\n (User didn't request weather, but it's a useful proactive step)\n\n- 2: Likely inappropriate\n The function does not clearly support the user's request or appears redundant given earlier tool calls.\n Example:\n User: \"Book me a restaurant in Berlin.\"\n Selected function: get_weather(city=\"Berlin\")\n\n- 1: Clearly inappropriate or unjustified\n The function is unrelated to the conversation, violates tool constraints, or invents a user goal.\n Example:\n User: \"Remind me to call Alex.\"\n Selected function: get_weather(city=\"New York\")" - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." - }, - "correction": { - "type": "object", - "description": "Required when there are issues. Leave empty ({}) when there are no issues. For invalid function calls, specify the issue(s), explanation, and corrected tool call or placeholder.", - "properties": { - "reason_types": { - "type": "array", - "description": "Categories of issues with this function selection. 
Use one or more of: IRRELEVANT_FUNCTION, BETTER_FUNCTION_EXISTS, MISSING_FUNCTION, REDUNDANT_CALL, OTHER.", - "items": { - "type": "string", - "enum": [ - "IRRELEVANT_FUNCTION", - "BETTER_FUNCTION_EXISTS", - "MISSING_FUNCTION", - "REDUNDANT_CALL", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Brief justification for why this function is inappropriate. State which criteria it violates and why." - }, - "corrected_function_name": { - "type": "string", - "description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary." - } - }, - "required": [] - }, - "actionable_recommendations": { - "type": "array", - "description": "Provide actionable recommendations to help the agent developer prevent function selection issues. Only include recommendations when the function selection was incorrect and the advice can genuinely help. Recommend only items that are important and can significantly improve the agent, such as:\n\n1. TOOL_DOCUMENTATION: Update the tool description to clarify requirements, usage details, connections to other tools (e.g., prerequisites or authentication steps that must occur before this tool call), edge cases, and other relevant information. Suggest the exact fix—addition, deletion, or adjustment.\n\n2. TOOL_USAGE_EXAMPLES: Add concrete usage examples to the tool description to help the agent understand when and how to use this function correctly. Include examples of user queries and, if relevant, prerequisite tool calls.\n\n3. INSTRUCTIONS_ADDITIONS: Add general instructions that can help the agent avoid such mistakes. Only include instructions that are essential and do not repeat existing ones. If an existing instruction (usually in the system prompt) can be improved, recommend updating it. Use this recommendation only if a clear, general instruction will significantly improve the agent in preventing this specific type of mistake.\n\n4. OTHER: If none of the above categories apply or if you have a unique recommendation, use this category. Be specific and actionable in your suggestions.", - "items": { - "type": "object", - "properties": { - "recommendation": { - "type": "string", - "description": "A specific, actionable recommendation to improve the agent's function selection process.", - "enum": [ - "TOOL_DOCUMENTATION", - "TOOL_USAGE_EXAMPLES", - "INSTRUCTIONS_ADDITIONS", - "OTHER" - ] - }, - "tool_name": { - "type": "string", - "description": "The name of the tool to which the recommendation applies." - }, - "details": { - "type": "string", - "description": "A detailed explanation of the recommendation, including what specific changes should be made, how they will improve function selection, and any relevant examples or best practices." - }, - "quote": { - "type": "string", - "description": "The specific quote of the additions made to the tool documentation (description in the tool specification), examples (that will be added to the tool documentation), or instructions that will be added to the system prompt of the agent." - } - }, - "required": [ - "recommendation", - "tool_name", - "details", - "quote" - ] - } - } + "target": { + "type": "string", + "enum": [ + "system_prompt", + "tool_description", + "parameter_description", + "parameter_examples" + ], + "description": "Which artifact the diff applies to. `system_prompt` changes the agent's top-level policy; the other three target a specific tool (and possibly one of its parameters)." 
+ }, + "tool_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is tool_description, parameter_description, or parameter_examples. Use the exact function/tool name. Must be null when target is system_prompt." + }, + "parameter_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is parameter_description or parameter_examples. Null otherwise." + }, + "diff": { + "type": "string", + "description": "Unified git-diff-format patch of the proposed change. Format: `--- a/\\n+++ b/\\n@@ ... @@\\n-\\n+`. When there is no line-number context, use `@@` as the single hunk header. Keep diffs minimal \u2014 only the lines that meaningfully change." + }, + "rationale": { + "type": "string", + "description": "One-sentence reason why this change addresses the observed issue. <= 200 chars." + }, + "importance": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "How impactful this recommendation is at preventing similar issues in future calls (0 = cosmetic, 1 = critical). Calibrate consistently across metrics so downstream aggregation is comparable." + } }, "required": [ - "explanation", - "evidence", - "output", - "confidence", - "correction", - "actionable_recommendations" + "target", + "diff", + "rationale", + "importance" ] + } + } + }, + "required": [ + "explanation", + "evidence", + "output", + "confidence", + "correction", + "actionable_recommendations" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "What time is it in Tokyo?" + } + ], + "tools_inventory": [ + { + "name": "translate_text", + "description": "Translate text to a target language", + "parameters": { + "text": "string", + "target": "string" + } + }, + { + "name": "get_time", + "description": "Retrieve current local time", + "parameters": { + "timezone": "string" + } + } + ], + "selected_function": "translate_text", + "proposed_tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "translate_text", + "arguments": "{ \"text\": \"What time is it in Tokyo?\" }" + } + } }, - "examples": [ + "output": { + "evidence": "User request: \"What time is it in Tokyo?\"\nFunction called: translate_text\nBetter tool available: get_time", + "explanation": "The function translate_text is completely inappropriate for the user's request. The user clearly asked for current time information, which should be handled by get_time.", + "output": 1, + "confidence": 0.95, + "correction": { + "reason_types": [ + "IRRELEVANT_FUNCTION", + "BETTER_FUNCTION_EXISTS" + ], + "reasons": "translate_text translates text, not time. get_time directly answers the request.", + "corrected_function_name": "get_time" + }, + "actionable_recommendations": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "What time is it in Tokyo?" 
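To make the recommendation contract above concrete, here is a small validation sketch; the helper, its checks, and the sample entry are illustrative assumptions, not code from this patch:

```python
REQUIRED_KEYS = {"target", "diff", "rationale", "importance"}


def check_recommendation(rec: dict) -> None:
    """Hypothetical sanity check for one actionable_recommendations entry."""
    missing = REQUIRED_KEYS - rec.keys()
    if missing:
        raise ValueError(f"missing required keys: {missing}")
    # Tool-scoped targets must name the tool; system_prompt must not.
    if rec["target"] != "system_prompt" and not rec.get("tool_name"):
        raise ValueError("tool-scoped targets need a tool_name")
    # The schema asks for git-style unified diffs: headers, then hunks.
    if not rec["diff"].startswith("--- a/"):
        raise ValueError("diff must start with a '--- a/' header")
    if not 0.0 <= rec["importance"] <= 1.0:
        raise ValueError("importance must be in [0, 1]")


rec = {
    "target": "tool_description",
    "tool_name": "get_time",  # hypothetical tool name, for illustration only
    "parameter_name": None,
    "diff": "--- a/tool/get_time\n+++ b/tool/get_time\n@@\n+Also accepts city names.",
    "rationale": "Clarifies that get_time handles city names as well as timezones.",
    "importance": 0.6,
}
check_recommendation(rec)  # passes silently for a well-formed entry
```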
- } - ], - "tools_inventory": [ - { - "name": "translate_text", - "description": "Translate text to a target language", - "parameters": { - "text": "string", - "target": "string" - } - }, - { - "name": "get_time", - "description": "Retrieve current local time", - "parameters": { - "timezone": "string" - } - } - ], - "selected_function": "translate_text", - "proposed_tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "translate_text", - "arguments": "{ \"text\": \"What time is it in Tokyo?\" }" - } - } - }, - "output": { - "evidence": "User request: \"What time is it in Tokyo?\"\nFunction called: translate_text\nBetter tool available: get_time", - "explanation": "The function translate_text is completely inappropriate for the user's request. The user clearly asked for current time information, which should be handled by get_time.", - "output": 1, - "confidence": 0.95, - "correction": { - "reason_types": [ - "IRRELEVANT_FUNCTION", - "BETTER_FUNCTION_EXISTS" - ], - "reasons": "translate_text translates text, not time. get_time directly answers the request.", - "corrected_function_name": "get_time" - }, - "actionable_recommendations": [ - { - "recommendation": "TOOL_DOCUMENTATION", - "tool_name": "get_time", - "details": "Update the get_time description to make it clear it handles city names as well as timezones.", - "quote": "Retrieve the current local time for a given timezone or city. Use this for all time-related queries." - }, - { - "recommendation": "OTHER", - "tool_name": "get_time", - "details": "Consider renaming get_time to get_local_time to make its purpose clearer and reduce confusion with other potential time-related tools.", - "quote": "Rename get_time to get_local_time." - } - ] - } + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "What is the weather like in Paris?" - } - ], - "tools_inventory": [ - { - "name": "get_weather", - "description": "Retrieve current weather conditions for a specified city.", - "parameters": { - "city": "string" - } - }, - { - "name": "get_time", - "description": "Retrieve current local time for a specified timezone.", - "parameters": { - "timezone": "string" - } - } - ], - "selected_function": "get_weather", - "proposed_tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{ \"city\": \"London\" }" - } - } - }, - "output": { - "evidence": "User request for weather in London is clear and unambiguous.", - "explanation": "The function selection is appropriate as it directly addresses the user's request for weather information. Note: The mistake in the 'city' parameter is non relevant to the selection of the function.", - "output": 5, - "confidence": 0.95, - "correction": {} - } + "target": "tool_description", + "tool_name": "translate_text", + "parameter_name": null, + "diff": "--- a/tool/translate_text\n+++ b/tool/translate_text\n@@\n+Call this only after you have gathered the inputs listed in `required`. 
If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "What is the weather like in Paris?" + } + ], + "tools_inventory": [ + { + "name": "get_weather", + "description": "Retrieve current weather conditions for a specified city.", + "parameters": { + "city": "string" + } }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Can you help me understand what 'Bonjour' means?" - } - ], - "tools_inventory": [ - { - "name": "detect_language", - "description": "Detect the language of the input text", - "parameters": { - "text": "string" - } - }, - { - "name": "translate_text", - "description": "Translate text to a target language. If the source language is not specified or unclear, use detect_language first.", - "parameters": { - "text": "string", - "target": "string" - } - } - ], - "selected_function": "translate_text", - "proposed_tool_call": { - "id": "call_003", - "type": "function", - "function": { - "name": "translate_text", - "arguments": "{ \"text\": \"Bonjour\", \"target\": \"en\" }" - } - } - }, - "output": { - "evidence": "User request: 'Can you help me understand what Bonjour means?'\nFunction called: translate_text\nAmbiguity: user might want translation OR language detection.", - "explanation": "The assistant chose translate_text, which is plausible, but the user might also be asking for language identification. A better approach would be to clarify intent or first run detect_language.", - "output": 2, - "confidence": 0.94, - "correction": { - "reason_types": [ - "BETTER_FUNCTION_EXISTS" - ], - "reasons": "The user may want language identification, not just translation.", - "corrected_function_name": "detect_language" - }, - "actionable_recommendations": [ - { - "recommendation": "TOOL_DOCUMENTATION", - "tool_name": "translate_text", - "details": "Clarify in translate_text description that it requires knowing the target language, and if unknown, detect_language must be called first.", - "quote": "This tool should get the target language, therefore if you don't know the target language, you must use detect_language first." - }, - { - "recommendation": "TOOL_USAGE_EXAMPLES", - "tool_name": "translate_text", - "details": "Add example to translate_text usage that demonstrates the need for detect_language.", - "quote": "Example: User: \"What does 'Bonjour' mean?\" -> Tool call: detect_language with text: \"Bonjour\" and then translate_text with text: \"Bonjour\" and target: \"en\"." - }, - { - "recommendation": "INSTRUCTIONS_ADDITIONS", - "tool_name": "translate_text", - "details": "Add an instruction to clarify the use of detect_language in ambiguous cases.", - "quote": "When you encounter a phrase in an unknown language, you must first use detect_language to identify the language. In ambiguous user requests, always clarify with the user before proceeding with the task." 
- } - ] + "name": "get_time", + "description": "Retrieve current local time for a specified timezone.", + "parameters": { + "timezone": "string" + } + } + ], + "selected_function": "get_weather", + "proposed_tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{ \"city\": \"London\" }" + } + } + }, + "output": { + "evidence": "User request for weather in London is clear and unambiguous.", + "explanation": "The function selection is appropriate as it directly addresses the user's request for weather information. Note: The mistake in the 'city' parameter is non relevant to the selection of the function.", + "output": 5, + "confidence": 0.95, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Can you help me understand what 'Bonjour' means?" + } + ], + "tools_inventory": [ + { + "name": "detect_language", + "description": "Detect the language of the input text", + "parameters": { + "text": "string" + } + }, + { + "name": "translate_text", + "description": "Translate text to a target language. If the source language is not specified or unclear, use detect_language first.", + "parameters": { + "text": "string", + "target": "string" + } + } + ], + "selected_function": "translate_text", + "proposed_tool_call": { + "id": "call_003", + "type": "function", + "function": { + "name": "translate_text", + "arguments": "{ \"text\": \"Bonjour\", \"target\": \"en\" }" + } + } + }, + "output": { + "evidence": "User request: 'Can you help me understand what Bonjour means?'\nFunction called: translate_text\nAmbiguity: user might want translation OR language detection.", + "explanation": "The assistant chose translate_text, which is plausible, but the user might also be asking for language identification. A better approach would be to clarify intent or first run detect_language.", + "output": 2, + "confidence": 0.94, + "correction": { + "reason_types": [ + "BETTER_FUNCTION_EXISTS" + ], + "reasons": "The user may want language identification, not just translation.", + "corrected_function_name": "detect_language" + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 + }, + { + "target": "tool_description", + "tool_name": "translate_text", + "parameter_name": null, + "diff": "--- a/tool/translate_text\n+++ b/tool/translate_text\n@@\n+Call this only after you have gathered the inputs listed in `required`. If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "What is my email address?" + }, + { + "role": "assistant", + "tool_call": { + "name": "get_user_email", + "arguments": { + "user_id": "your_user_id" } + } }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "What is my email address?" 
- }, - { - "role": "assistant", - "tool_call": { - "name": "get_user_email", - "arguments": { - "user_id": "your_user_id" - } - } - }, - { - "role": "assistant", - "tool_response": {"error": "User ID not found"} - } - ], - "tools_inventory": [ - { - "name": "get_user_id", - "description": "Retrieve the user's unique identifier", - "parameters": {} - }, - { - "name": "get_user_email", - "description": "Retrieve the user's email address using their user ID", - "parameters": { - "user_id": "string" - } - } - ], - "selected_function": "get_user_email", - "proposed_tool_call": { - "name": "get_user_email", - "arguments": { - "user_id": "your_user_id" - } - } + "role": "assistant", + "tool_response": { + "error": "User ID not found" + } + } + ], + "tools_inventory": [ + { + "name": "get_user_id", + "description": "Retrieve the user's unique identifier", + "parameters": {} + }, + { + "name": "get_user_email", + "description": "Retrieve the user's email address using their user ID", + "parameters": { + "user_id": "string" + } + } + ], + "selected_function": "get_user_email", + "proposed_tool_call": { + "name": "get_user_email", + "arguments": { + "user_id": "your_user_id" + } + } + }, + "output": { + "evidence": "", + "explanation": "The function get_user_email is inappropriate because the previous attempt to call it failed due to a missing user ID. The correct next step is to call get_user_id to obtain the necessary identifier before retrying get_user_email.", + "output": 1, + "confidence": 0.96, + "correction": { + "reason_types": [ + "REDUNDANT_CALL" + ], + "reasons": "This is a redundant call to get_user_email without having the user_id and is a duplication of the previous incorrect call.", + "corrected_function_name": "no_function_redundant" + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 + }, + { + "target": "tool_description", + "tool_name": "get_user_email", + "parameter_name": null, + "diff": "--- a/tool/get_user_email\n+++ b/tool/get_user_email\n@@\n+Call this only after you have gathered the inputs listed in `required`. If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 + } + ] + } + } + ] + }, + { + "name": "agentic_constraints_satisfaction", + "task_description": "Evaluate whether the tool call satisfies the agentic constraints required for safe execution. Focus ONLY on prerequisites, sequencing, and confirmation \u2014 do NOT grade parameter correctness.\n\n---\n\n### 1. Explicit Prerequisites\n- Required prior actions (authentication, ID lookup, etc.) must be satisfied when the tool description or system policy documents them. Accept any of: a successful prior tool call that produced the needed input; an explicit authorization statement; or a documented default.\n- Do NOT fabricate prerequisites the tool description or policy does not state.\n\n### 2. Tool Sequencing\n- Enforce ordering only when the tool description explicitly requires it. 
Otherwise treat calls as order-independent.\n\n### 3. The Current Call May Be a Prerequisite\n- A read-only lookup that gathers information for a later mutating call is itself a valid step; do NOT reject it as \"premature\".\n\n---\n\n#### Stringency\n\nJudge leniently when the call is information-gathering or exploratory \u2014 reads, lookups, searches, descriptions, probes \u2014 and it is plausibly on-path toward the user's stated or implied goal. Reserve rejection (output 1-2) for calls that are clearly wrong given the inventory, semantically unrelated to the conversation, or that violate an explicit documented constraint. Do not reject a call solely because a different function might also have worked.\n\n#### Confirmation Scope\n\nExplicit user confirmation is required only before actions that change persistent state on behalf of the user (creating, modifying, or deleting records; sending messages; committing transactions). Inspect-only or information-gathering calls \u2014 reads, lookups, searches, descriptions, status checks \u2014 do NOT require prior user confirmation, even when a later mutating action in the same flow will.\n\n---\n\n#### Actionable Recommendations\n\nWhen the call reveals a fixable gap in the agent's prompts or tool specifications, emit one or more `actionable_recommendations` as unified-diff patches. Each recommendation specifies: `target` (which artifact to change: `system_prompt`, `tool_description`, `parameter_description`, `parameter_examples`), an optional `tool_name` / `parameter_name` for tool-scoped targets, a `diff` in unified-diff format, a short `rationale`, and an `importance` score in [0, 1] (0 = cosmetic, 1 = critical). When the call has no fixable artifact gap \u2014 or the call is already correct \u2014 return an empty array.", + "jsonschema": { + "title": "agentic_constraints_satisfaction", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Quote SPECIFIC lines from the conversation history or tool specifications that demonstrate whether prerequisites are satisfied or violated. Include exact text showing completions or absences of required steps." + }, + "explanation": { + "type": "string", + "description": "Provide a detailed explanation of whether all prerequisite constraints are satisfied for this tool call. Address explicit dependencies, required sequence, parameter relationships, and potential redundancy. Base your explanation SOLELY on evidence from the provided conversation and tool specifications." + }, + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 3, + "threshold_high": 5, + "description": "Agentic Constraint Satisfaction Score (1-5). Judge ONLY prerequisites, sequencing, and confirmation-before-mutation.\n\n- **5**: All documented prerequisites are satisfied, OR this call is itself gathering a prerequisite. Sequencing respects any explicit ordering in the tool description. 
For mutating calls, there is evidence of user confirmation when the policy requires it.\n- **4**: Minor uncertainty about a single prerequisite \u2014 the documented signal is indirect but plausible given the earlier turns.\n- **3**: One prerequisite is undetermined \u2014 not clearly met, not clearly missing.\n- **2**: A required prior step was documented in the tool description or system policy but is missing from the trajectory.\n- **1**: A critical ordering rule is violated (explicit documented sequence broken), the call is a redundant SAME-function-SAME-arguments repeat, OR a mutating action proceeds without a confirmation signal that policy explicitly requires."
+        },
+        "confidence": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "threshold_low": 0,
+          "threshold_high": 1,
+          "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
+        },
+        "correction": {
+          "type": "object",
+          "description": "For satisfied constraints: Provide an empty object {}. For constraint violations: Provide an object with reason_types, reasons, and either prerequisite_tool_calls (for missing prerequisites) or corrected_function_name (for parameter issues).",
+          "properties": {
+            "reason_types": {
+              "type": "array",
+              "description": "Types of constraint violations identified",
+              "items": {
+                "type": "string",
+                "enum": [
+                  "MISSING_PREREQUISITE",
+                  "REDUNDANT_CALL",
+                  "OTHER"
+                ]
+              }
+            },
+            "reasons": {
+              "type": "string",
+              "description": "Brief explanation of the specific constraint violations"
+            },
+            "prerequisite_tool_calls": {
+              "type": "array",
+              "description": "List of prerequisite tool calls that must be executed before this one",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "name": {
+                    "type": "string",
+                    "description": "Name of the prerequisite function"
+                  }
                },
                "required": []
              }
            },
            "corrected_function_name": {
              "type": "string",
              "description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary."
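As a sketch of what the correction schema above admits, a missing-prerequisite violation could be reported with a payload like the following (Python; the tool names are illustrative assumptions, not taken from this patch):

    # Hypothetical correction payload for a MISSING_PREREQUISITE violation,
    # shaped to match the schema above: reason_types drawn from the enum,
    # free-text reasons, and the prerequisite calls to run first.
    correction = {
        "reason_types": ["MISSING_PREREQUISITE"],
        "reasons": "translate_text requires a prior detect_language call.",
        "prerequisite_tool_calls": [{"name": "detect_language"}],
    }

For a satisfied constraint, the same field would simply carry an empty object.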
} - ] - }, - { - "name": "agentic_constraints_satisfaction", - "task_description": "Evaluate whether the proposed tool call satisfies all agentic constraints required for execution.\n\nYour judgment must be based strictly on the provided conversation history and tools inventory. Do not infer or assume any constraints or workflows not explicitly documented.\nIMPORTANT: Evaluate only the agentic constraints satisfaction. Do not assess the function's execution, parameter values correctness, or any other aspect.\n\n---\n\n### 1. Explicit Prerequisites\n- Confirm that required prior actions (e.g., authentication, file upload) have been completed.\n- Accept only direct evidence such as: \"Auth token acquired\" or \"File uploaded successfully.\"\n\n---\n\n### 2. Tool Sequencing\n- Enforce sequencing only when the tool description explicitly states a required order.\n- If no such requirement is documented, assume calls can be executed independently.\n\n---\n\n### 3. Call Independence\n- Calls that are not explicitly marked as dependent may be treated as parallel or order-independent.\n- Do not impose workflow constraints unless specified.\n\n---\n\n### 4. Redundancy\n- Flag the tool call as redundant only if the same function with the same parameters was already executed and no contextual change justifies repetition.\n\n---\n\n### Final Rule\nYour evaluation must be conservative: flag constraint violations only when there is clear, explicit evidence. Do not assume or invent requirements not documented in the conversation or tool descriptions.", - "jsonschema": { - "title": "agentic_constraints_satisfaction", + }, + "required": [] + }, + "actionable_recommendations": { + "type": "array", + "description": "List of actionable recommendations as unified-diff patches. Emit only when the call reveals a fixable gap in the agent's prompts or tool specifications. Empty array when there is nothing to fix (including when the call is already correct).", + "items": { "type": "object", "additionalProperties": false, "properties": { - "evidence": { - "type": "string", - "description": "Quote SPECIFIC lines from the conversation history or tool specifications that demonstrate whether prerequisites are satisfied or violated. Include exact text showing completions or absences of required steps." - }, - "explanation": { - "type": "string", - "description": "Provide a detailed explanation of whether all prerequisite constraints are satisfied for this tool call. Address explicit dependencies, required sequence, parameter relationships, and potential redundancy. Base your explanation SOLELY on evidence from the provided conversation and tool specifications." 
- }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 3, - "threshold_high": 5, - "description": "Agentic Constraints Satisfaction Score (1-5):\n\n- 5: Fully compliant with all known constraints\n The tool call satisfies all documented prerequisites, sequencing rules, and redundancy checks based on explicit evidence in the conversation and tool inventory.\n Example:\n User: \"Here is my file.\"\n Assistant: (File upload confirmed)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 4: Likely compliant, with minor uncertainty\n All major constraints are satisfied, but some minor dependency or precondition may not be fully confirmed, though it is likely met based on adjacent context.\n Example:\n User: \"I've uploaded the file already.\"\n (No explicit upload confirmation, but no contradiction)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 3: Ambiguous or undetermined\n It is unclear whether all required conditions or dependencies are met. The call might be valid, but confirmation is lacking.\n Example:\n User: \"Can you summarize the document?\"\n (No evidence of file upload or confirmation)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 2: Likely violates one or more constraints\n At least one key constraint-such as missing authentication, a required prior step, or improper sequencing-appears unsatisfied.\n Example:\n User: \"Please delete my account.\"\n (No authentication step documented)\n -> Tool call: delete_account(user_id=\"456\")\n\n- 1: Clearly violates agentic constraints\n The tool call ignores an explicit precondition, repeats a redundant action, or breaks an ordering rule described in the tool documentation.\n Example:\n Assistant already called: upload_file(name=\"report.pdf\")\n -> Immediately repeats: upload_file(name=\"report.pdf\") without any contextual change" - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." - }, - "correction": { - "type": "object", - "description": "For satisfied constraints: Provide an empty object {}. For constraint violations: Provide an object with reason_types, reasons, and either prerequisite_tool_calls (for missing prerequisites) or corrected_function (for parameter issues).", - "properties": { - "reason_types": { - "type": "array", - "description": "Types of constraint violations identified", - "items": { - "type": "string", - "enum": [ - "MISSING_PREREQUISITE", - "REDUNDANT_CALL", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Brief explanation of the specific constraint violations" - }, - "prerequisite_tool_calls": { - "type": "array", - "description": "List of prerequisite tool calls that must be executed before this one", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string", - "description": "Name of the prerequisite function" - } - }, - "required": [] - } - }, - "corrected_function_name": { - "type": "string", - "description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary." 
- } - }, - "required": [] - }, - "actionable_recommendations": { - "type": "array", - "description": "Provide specific, actionable recommendations for the agent developer to address identified prerequisite or constraint satisfaction issues. Only provide recommendations when problems are detected. Include multiple detailed suggestions where appropriate, such as:\n\n1. PREREQUISITE_TRACKING: Suggest specific mechanisms for tracking and verifying completion of prerequisite steps, such as 'Implement a prerequisite tracking system that maintains a state object {\"authentication\": true, \"language_detected\": false, etc.} and checks this before making dependent calls'.\n\n2. TOOL_DEPENDENCY_DOCUMENTATION: Recommend explicit documentation of tool dependencies in function descriptions, like 'Update the translate_text function description to state: \"REQUIRES: Prior successful call to detect_language for the same text input. Example workflow: 1) detect_language -> 2) translate_text\"'.\n\n3. TOOL_CALL_SEQUENCING: Propose flowchart or decision tree implementations for proper tool sequencing, like 'Create a predefined workflow for file operations that enforces the sequence: 1) check_permissions -> 2) upload_file -> 3) process_file'.\n\n4. SYSTEM_PROMPT_ADDITIONS: Suggest specific instructions in the system prompt to enforce dependencies, such as 'Add to system prompt: \"Always perform language detection before translation. Never skip prerequisite steps, even when the source language seems obvious\"'.\n\n5. TOOL_SPECIFICATION_IMPROVEMENTS: Recommend enhancements to tool descriptions that highlight dependencies, like 'Add a \"Prerequisites\" section to each tool's description that lists required prior successful calls and their parameters'.\n\n6. ERROR_HANDLING_IMPROVEMENTS: Suggest specific error recovery procedures, such as 'If authentication fails, implement an automatic retry with explicit user notification: \"I'm having trouble authenticating. Let me try again with [authentication method]\"'.\n\n7. REDUNDANCY_DETECTION: Recommend logic for preventing redundant calls, like 'Implement a result caching mechanism that stores outputs from previous calls and checks if an identical call has already been made in the current session'.\n\n8. CONTEXTUAL_AWARENESS: Suggest improvements for maintaining awareness of conversation state, such as 'Create a context tracker that monitors which entities (files, queries, data objects) have been successfully processed and are available for subsequent operations'.\n\n9. MULTI_STEP_OPERATION_DOCUMENTATION: Recommend clearer documentation for operations requiring multiple steps, like 'Create a developer guide section on \"Chained Operations\" that details common sequences of tool calls with sample code and conversation examples'.\n\n10. AGENT_ARCHITECTURE_IMPROVEMENTS: Suggest architectural changes to better support dependencies, such as 'Implement a prerequisite validation middleware layer that intercepts all tool calls and verifies that required prior calls exist in the conversation history before proceeding'.\n\n11. OTHER: If none of the above categories fit or if you have a unique recommendation, use this category. 
Be specific and actionable in your suggestions.\n\nEnsure recommendations are thorough, specific, and directly address the root cause of the constraint satisfaction issue.", - "items": { - "type": "object", - "properties": { - "recommendation": { - "type": "string", - "description": "A specific, actionable recommendation to improve the agent's ability to satisfy agentic constraints in future tool calls.", - "enum": [ - "PREREQUISITE_TRACKING", - "TOOL_DEPENDENCY_DOCUMENTATION", - "TOOL_CALL_SEQUENCING", - "SYSTEM_PROMPT_ADDITIONS", - "TOOL_SPECIFICATION_IMPROVEMENTS", - "ERROR_HANDLING_IMPROVEMENTS", - "REDUNDANCY_DETECTION", - "CONTEXTUAL_AWARENESS", - "MULTI_STEP_OPERATION_DOCUMENTATION", - "AGENT_ARCHITECTURE_IMPROVEMENTS", - "OTHER" - ] - }, - "tool_name": { - "type": "string", - "description": "The name of the tool to which the recommendation applies." - }, - "details": { - "type": "string", - "description": "Detailed explanation of the recommendation, including what specific changes should be made, how they will improve the agent's ability to satisfy agentic constraints, and any relevant examples or best practices." - }, - "quote": { - "type": "string", - "description": "The specific quote of the additions made to the documentation, examples, or instructions." - } - }, - "required": [ - "recommendation", - "tool_name", - "details", - "quote" - ] - } - } + "target": { + "type": "string", + "enum": [ + "system_prompt", + "tool_description", + "parameter_description", + "parameter_examples" + ], + "description": "Which artifact the diff applies to. `system_prompt` changes the agent's top-level policy; the other three target a specific tool (and possibly one of its parameters)." + }, + "tool_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is tool_description, parameter_description, or parameter_examples. Use the exact function/tool name. Must be null when target is system_prompt." + }, + "parameter_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is parameter_description or parameter_examples. Null otherwise." + }, + "diff": { + "type": "string", + "description": "Unified git-diff-format patch of the proposed change. Format: `--- a/\\n+++ b/\\n@@ ... @@\\n-\\n+`. When there is no line-number context, use `@@` as the single hunk header. Keep diffs minimal \u2014 only the lines that meaningfully change." + }, + "rationale": { + "type": "string", + "description": "One-sentence reason why this change addresses the observed issue. <= 200 chars." + }, + "importance": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "How impactful this recommendation is at preventing similar issues in future calls (0 = cosmetic, 1 = critical). Calibrate consistently across metrics so downstream aggregation is comparable." + } }, "required": [ - "explanation", - "evidence", - "output", - "confidence", - "correction", - "actionable_recommendations" + "target", + "diff", + "rationale", + "importance" ] + } + } + }, + "required": [ + "explanation", + "evidence", + "output", + "confidence", + "correction", + "actionable_recommendations" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "assistant", + "content": "Auth token acquired for user 42." 
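The `diff` field described above uses a bare `@@` hunk header with no line-number context, which keeps recommendation patches easy to apply mechanically. A minimal sketch of one way to apply such a context-free additive diff to an artifact string, assuming only the conventions stated in the schema (the helper is hypothetical, not defined anywhere in this patch):

    def apply_additive_diff(artifact: str, diff: str) -> str:
        # Keep only the "+" body lines; skip the "+++" file header.
        added = [
            line[1:]
            for line in diff.splitlines()
            if line.startswith("+") and not line.startswith("+++")
        ]
        return artifact.rstrip("\n") + "\n" + "\n".join(added) + "\n"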
+ } + ], + "tools_inventory": [ + { + "name": "get_order_history", + "description": "Retrieve past orders", + "parameters": { + "user_id": "integer" + } + } + ], + "selected_function": "get_order_history", + "proposed_tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "get_order_history", + "arguments": "{ \"user_id\": 42 }" + } + } }, - "examples": [ + "output": { + "evidence": "Assistant message explicitly states: \"Auth token acquired for user 42.\" The tool specification for get_order_history does not mention any additional prerequisites beyond authentication.", + "explanation": "This tool call satisfies all agentic constraints. The prerequisite of authentication is explicitly confirmed in the conversation history with \"Auth token acquired for user 42.\" No additional sequencing requirements or redundant calls are indicated.", + "output": 5, + "confidence": 0.95, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "assistant", - "content": "Auth token acquired for user 42." - } - ], - "tools_inventory": [ - { - "name": "get_order_history", - "description": "Retrieve past orders", - "parameters": { - "user_id": "integer" - } - } - ], - "selected_function": "get_order_history", - "proposed_tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "get_order_history", - "arguments": "{ \"user_id\": 42 }" - } - } - }, - "output": { - "evidence": "Assistant message explicitly states: \"Auth token acquired for user 42.\" The tool specification for get_order_history does not mention any additional prerequisites beyond authentication.", - "explanation": "This tool call satisfies all agentic constraints. The prerequisite of authentication is explicitly confirmed in the conversation history with \"Auth token acquired for user 42.\" No additional sequencing requirements or redundant calls are indicated.", - "output": 5, - "confidence": 0.95, - "correction": {}, - "actionable_recommendations": [] - } + "role": "user", + "content": "Translate 'Hola' to English." + } + ], + "tools_inventory": [ + { + "name": "translate_text", + "description": "Translate text to target language. REQUIRES: Prior successful call to detect_language for the input text.", + "parameters": { + "text": "string", + "target": "string" + } }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Translate 'Hola' to English." - } - ], - "tools_inventory": [ - { - "name": "translate_text", - "description": "Translate text to target language. REQUIRES: Prior successful call to detect_language for the input text.", - "parameters": { - "text": "string", - "target": "string" - } - }, - { - "name": "detect_language", - "description": "Detect the language of the input text", - "parameters": { - "text": "string" - } - } - ], - "selected_function": "translate_text", - "proposed_tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "translate_text", - "arguments": "{ \"text\": \"Bonjour\", \"target\": \"en\" }" - } - } - }, - "output": { - "evidence": "Tool description explicitly states: \"translate_text: Translate text to target language. REQUIRES: Prior successful call to detect_language for the input text.\" The conversation history contains no detect_language call or language detection result.", - "explanation": "This tool call violates an agentic constraint clearly stated in the tool description. 
The translate_text function explicitly requires a prior successful call to detect_language for the input text, as stated in its description. The conversation history shows no evidence of any detect_language call being made before this translate_text call.",
-            "output": 1,
-            "confidence": 0.95,
-            "correction": {
-              "reason_types": [
-                "MISSING_PREREQUISITE"
-              ],
-              "reasons": "Missing required prerequisite detect_language call as explicitly stated in translate_text description",
-              "prerequisite_tool_calls": [
-                {
-                  "name": "detect_language"
-                }
-              ]
-            },
-            "actionable_recommendations": [
-              {
-                "recommendation": "PREREQUISITE_TRACKING",
-                "tool_name": "translate_text",
-                "details": "Implement a prerequisite tracking system that maintains a state object {\"language_detected\": false} and checks this before making dependent calls like translate_text.",
-                "quote": "Implement a prerequisite tracking system that maintains a state object {\"language_detected\": false} and checks this before making dependent calls."
-              },
-              {
-                "recommendation": "TOOL_DEPENDENCY_DOCUMENTATION",
-                "tool_name": "translate_text",
-                "details": "Update the translate_text function description to state: \"REQUIRES: Prior successful call to detect_language for the same text input. Example workflow: 1) detect_language -> 2) translate_text\".",
-                "quote": "Update the translate_text function description to state: \"REQUIRES: Prior successful call to detect_language for the same text input. Example workflow: 1) detect_language -> 2) translate_text\"."
-              },
-              {
-                "recommendation": "SYSTEM_PROMPT_ADDITIONS",
-                "tool_name": "translate_text",
-                "details": "Add to system prompt an explicit requirement for language detection before translation.",
-                "quote": "Add to system prompt: \"Always perform language detection before translation. Never skip prerequisite steps, even when the source language seems obvious.\""
-              },
-              {
-                "recommendation": "TOOL_CALL_SEQUENCING",
-                "tool_name": "translate_text",
-                "details": "Propose flowchart or decision tree implementations for proper tool sequencing, like 'Create a predefined workflow for translation that enforces the sequence: 1) detect_language -> 2) translate_text'.",
-                "quote": "Propose flowchart or decision tree implementations for proper tool sequencing, like 'Create a predefined workflow for translation that enforces the sequence: 1) detect_language -> 2) translate_text'."
-              }
-            ]
-          }
+          "name": "detect_language",
+          "description": "Detect the language of the input text",
+          "parameters": {
+            "text": "string"
+          }
+        }
+      ],
+      "selected_function": "translate_text",
+      "proposed_tool_call": {
+        "id": "call_002",
+        "type": "function",
+        "function": {
+          "name": "translate_text",
+          "arguments": "{ \"text\": \"Hola\", \"target\": \"en\" }"
+        }
+      }
+    },
+    "output": {
+      "evidence": "Tool description explicitly states: \"translate_text: Translate text to target language. REQUIRES: Prior successful call to detect_language for the input text.\" The conversation history contains no detect_language call or language detection result.",
+      "explanation": "This tool call violates an agentic constraint clearly stated in the tool description: translate_text explicitly requires a prior successful call to detect_language for the input text. 
The conversation history shows no evidence of any detect_language call being made before this translate_text call.", + "output": 1, + "confidence": 0.95, + "correction": { + "reason_types": [ + "MISSING_PREREQUISITE" + ], + "reasons": "Missing required prerequisite detect_language call as explicitly stated in translate_text description", + "prerequisite_tool_calls": [ + { + "name": "detect_language" + } + ] + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 + }, + { + "target": "tool_description", + "tool_name": "translate_text", + "parameter_name": null, + "diff": "--- a/tool/translate_text\n+++ b/tool/translate_text\n@@\n+Call this only after you have gathered the inputs listed in `required`. If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 } - ] - } + ] + } + } + ] + } ] diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection_metrics_runtime.json b/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection_metrics_runtime.json index cc44a6e9..2658fdef 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +++ b/altk/pre_tool/sparc/function_calling/metrics/function_selection/function_selection_metrics_runtime.json @@ -1,477 +1,477 @@ [ - { - "name": "function_selection_appropriateness", - "task_description": "You are an evaluator assessing whether a specific function is an appropriate next step in a user-assistant conversation.\n\nTASK: Determine if the selected function aligns with the user's current request and is justified by the conversation history and tool inventory.\nIMPORTANT: Evaluate only the function selection appropriateness. Do not assess the function's execution, parameter values correctness, or any other aspect.\n\nEVALUATION CRITERIA:\n\n1. EVIDENCE SOURCES:\n- Use only the conversation history (system prompts, user and assistant turns, prior tool calls and outputs).\n- Use only the tool inventory, which includes function names and descriptions.\n- Do not infer capabilities, tool logic, or user goals beyond what is explicitly provided.\n\n2. DIALOGUE CONTEXT:\n- Conversations alternate between user and assistant.\n- Assistant turns may include multiple tool calls.\n- Tool calls may be:\n - INDEPENDENT: order does not matter\n - CHAINED: later calls depend on earlier ones\n- Consider the evolving context of the user's goal.\n\n3. APPROPRIATENESS DEFINITION:\nA function is appropriate if:\n- It addresses the user's stated or implied request.\n- It contributes meaningfully to the immediate goal.\n- It aligns with the documented function description.\n- It is not redundant with earlier tool calls (e.g. a duplication of the same tool call with the same parameters that is unnecessary).\n\nNote: More than one function may be appropriate. If this call satisfies a relevant part of the user's request, it is valid.\n\n4. 
WHEN INAPPROPRIATE:\nFlag the function as inappropriate only if:\n- A better function exists in the inventory\n- No valid function matches the request\n- The call is redundant or unnecessary\n\nProvide a corrected tool call if applicable, or use a placeholder such as:\n- {\"name\": \"no_function\"}\n- {\"name\": \"no_function_redundant\"}\n\n5. CONSERVATIVE JUDGMENT:\n- Do not assume tool behavior not stated in descriptions.\n- Do not reference tools or logic outside the inventory.\n- Do not invent user intent.\n- Do not penalize a valid call just because alternatives exist.\n\nOnly mark the function as inappropriate if clear, explicit evidence from the conversation or tool descriptions supports that judgment.\nOnly evaluate the function selection - e.g. if the function selection is appropriate, but the parameter values are incorrect, mark it as a correct function selection with a maximum score - you MUST ignore parameter values mistakes.", - "jsonschema": { - "title": "function_selection_appropriateness", - "description": "Assessment of whether the selected function is an appropriate next step based on the user's request and the provided tool descriptions.", - "type": "object", - "additionalProperties": false, - "properties": { - "evidence": { - "type": "string", - "description": "Provide direct quotes from the conversation and tool descriptions that justify your assessment. Only include grounded evidence-do not paraphrase or infer." - }, - "explanation": { - "type": "string", - "description": "Briefly explain why the selected function is or isn't appropriate for the user's request. Reference the user's message and the relevant tool description. If the function is inappropriate, state whether: (1) a better function exists, (2) no function is suitable, or (3) the call is redundant or incorrectly ordered." - }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 3, - "threshold_high": 5, - "description": "Function Selection Appropriateness Score (1-5):\n\n- 5: Clearly appropriate and well-justified\n The selected function directly aligns with the user's request, is grounded in the conversation history, and matches the documented purpose in the tool inventory.\n Example:\n User: \"Can you check the weather in Rome tomorrow?\"\nSelected function: get_weather(city=\"Rome\", date=\"2025-08-06\")\n\n- 4: Reasonable choice with minor gaps\n The function is a plausible and helpful next step based on the conversation, though some details (e.g., goal inference or chaining rationale) may not be fully explicit. Example:\n User: \"What's my email address?\"\n Selected function: get_user_id()\n (A precondition for get_user_email(user_id=...) 
is this get_user_id())\n\n- 3: Ambiguous or weakly justified\n The function may relate to the user's goal, but there is not enough evidence in the conversation or function description to clearly support it.\n Example:\n User: \"I'm planning a trip to Tokyo.\"\n Selected function: get_weather(city=\"Tokyo\")\n (User didn't request weather, but it's a useful proactive step)\n\n- 2: Likely inappropriate\n The function does not clearly support the user's request or appears redundant given earlier tool calls.\n Example:\n User: \"Book me a restaurant in Berlin.\"\n Selected function: get_weather(city=\"Berlin\")\n\n- 1: Clearly inappropriate or unjustified\n The function is unrelated to the conversation, violates tool constraints, or invents a user goal.\n Example:\n User: \"Remind me to call Alex.\"\n Selected function: get_weather(city=\"New York\")" - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." - }, - "correction": { - "type": "object", - "description": "Required when there are issues. Leave empty ({}) when there are no issues. For invalid function calls, specify the issue(s), explanation, and corrected tool call or placeholder.", - "properties": { - "reason_types": { - "type": "array", - "description": "Categories of issues with this function selection. Use one or more of: IRRELEVANT_FUNCTION, BETTER_FUNCTION_EXISTS, MISSING_FUNCTION, REDUNDANT_CALL, OTHER.", - "items": { - "type": "string", - "enum": [ - "IRRELEVANT_FUNCTION", - "BETTER_FUNCTION_EXISTS", - "MISSING_FUNCTION", - "REDUNDANT_CALL", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Short explanation of the problem with the function selection." - }, - "corrected_function_name": { - "type": "string", - "description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary." - } - }, - "required": [] - } + { + "name": "function_selection_appropriateness", + "task_description": "You are evaluating whether the selected function is an appropriate next step. Focus ONLY on function selection \u2014 do NOT grade parameter values or execution correctness.\n\n---\n\n### Appropriateness\n\nA function is appropriate if it:\n1. Addresses the user's stated or implied request.\n2. Contributes meaningfully to the immediate goal OR sets up a subsequent call. Preliminary probes, exploratory searches, and information-gathering calls that precede a later mutation are valid.\n3. Aligns with its documented function description.\n\nMultiple functions may be appropriate. 
If this call satisfies a relevant part of the user's request \u2014 or is a reasonable probe toward it \u2014 it is valid.\n\n---\n\n### When Inappropriate\n\nFlag inappropriate only when:\n- No function in the inventory can serve the user's request.\n- The call repeats a prior call with the SAME function AND SAME arguments AND the prior result is still valid.\n- The call is clearly harmful (mutates state against user intent) or semantically unrelated to the conversation.\n\nDo NOT flag as inappropriate merely because a different function MIGHT also work, because the call is preliminary, or because the agent is exploring alternatives after earlier empty/errored results.\n\nWhen rejecting, provide a corrected tool-call hint or a placeholder like `{\"name\": \"no_function\"}` or `{\"name\": \"no_function_redundant\"}`.\n\n---\n\n#### Stringency\n\nJudge leniently when the call is information-gathering or exploratory \u2014 reads, lookups, searches, descriptions, probes \u2014 and it is plausibly on-path toward the user's stated or implied goal. Reserve rejection (output 1-2) for calls that are clearly wrong given the inventory, semantically unrelated to the conversation, or that violate an explicit documented constraint. Do not reject a call solely because a different function might also have worked.",
+    "jsonschema": {
+      "title": "function_selection_appropriateness",
+      "description": "Assessment of whether the selected function is an appropriate next step based on the user's request and the provided tool descriptions.",
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "evidence": {
+          "type": "string",
+          "description": "Provide direct quotes from the conversation and tool descriptions that justify your assessment. Only include grounded evidence; do not paraphrase or infer."
+        },
+        "explanation": {
+          "type": "string",
+          "description": "Briefly explain why the selected function is or isn't appropriate for the user's request. Reference the user's message and the relevant tool description. If the function is inappropriate, state whether: (1) a better function exists, (2) no function is suitable, or (3) the call is redundant or incorrectly ordered."
+        },
+        "output": {
+          "type": "integer",
+          "minimum": 1,
+          "maximum": 5,
+          "threshold_low": 3,
+          "threshold_high": 5,
+          "description": "Function Selection Appropriateness Score (1-5). Judge ONLY whether the picked function is a reasonable next step. Do NOT grade its arguments.\n\n- **5**: The function directly addresses the user's stated or implied goal given the inventory. No redundancy with prior calls.\n- **4**: The function is a reasonable next step \u2014 including a read-only probe, a discovery lookup, or a prerequisite-gathering call on the path to a later mutating action. The connection to the user's goal is clear even if indirect.\n- **3**: The function's connection to the goal is ambiguous \u2014 another function in the inventory might be a better fit, but this one is not clearly wrong and could contribute to progress.\n- **2**: The function likely does not help; a different function in the inventory is an obvious better match for the user's stated request, AND the call is not a reasonable exploratory probe.\n- **1**: The function is unrelated to the user's request, clearly redundant with a prior SAME-function-SAME-arguments call whose result is still valid, or clearly harmful (a mutation that contradicts user intent)."
+ }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." + }, + "correction": { + "type": "object", + "description": "Required when there are issues. Leave empty ({}) when there are no issues. For invalid function calls, specify the issue(s), explanation, and corrected tool call or placeholder.", + "properties": { + "reason_types": { + "type": "array", + "description": "Categories of issues with this function selection. Use one or more of: IRRELEVANT_FUNCTION, BETTER_FUNCTION_EXISTS, MISSING_FUNCTION, REDUNDANT_CALL, OTHER.", + "items": { + "type": "string", + "enum": [ + "IRRELEVANT_FUNCTION", + "BETTER_FUNCTION_EXISTS", + "MISSING_FUNCTION", + "REDUNDANT_CALL", + "OTHER" + ] + } }, - "required": [ - "explanation", - "evidence", - "output", - "confidence", - "correction" - ] + "reasons": { + "type": "string", + "description": "Short explanation of the problem with the function selection." + }, + "corrected_function_name": { + "type": "string", + "description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary." + } + }, + "required": [] + } + }, + "required": [ + "explanation", + "evidence", + "output", + "confidence", + "correction" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "What time is it in Tokyo?" + } + ], + "tools_inventory": [ + { + "name": "translate_text", + "description": "Translate text to a target language", + "parameters": { + "text": "string", + "target": "string" + } + }, + { + "name": "get_time", + "description": "Retrieve current local time", + "parameters": { + "timezone": "string" + } + } + ], + "selected_function": "translate_text", + "proposed_tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "translate_text", + "arguments": "{ \"text\": \"What time is it in Tokyo?\" }" + } + } }, - "examples": [ + "output": { + "evidence": "User request: \"What time is it in Tokyo?\"\nFunction called: translate_text with description: \"Translate text to a target language\"\nMore appropriate function available: get_time with description: \"Retrieve current local time\"", + "explanation": "The translate_text function is misused, as the user's request for the current time in Tokyo should have been handled by the get_time function instead.", + "output": 1, + "confidence": 0.95, + "correction": { + "reason_types": [ + "IRRELEVANT_FUNCTION", + "BETTER_FUNCTION_EXISTS" + ], + "reasons": "translate_text performs text translation, not time retrieval; get_time function directly addresses user's request for current time in Tokyo", + "corrected_function_name": "get_time" + } + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "What time is it in Tokyo?" 
- } - ], - "tools_inventory": [ - { - "name": "translate_text", - "description": "Translate text to a target language", - "parameters": { - "text": "string", - "target": "string" - } - }, - { - "name": "get_time", - "description": "Retrieve current local time", - "parameters": { - "timezone": "string" - } - } - ], - "selected_function": "translate_text", - "proposed_tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "translate_text", - "arguments": "{ \"text\": \"What time is it in Tokyo?\" }" - } - } - }, - "output": { - "evidence": "User request: \"What time is it in Tokyo?\"\nFunction called: translate_text with description: \"Translate text to a target language\"\nMore appropriate function available: get_time with description: \"Retrieve current local time\"", - "explanation": "The translate_text function is misused, as the user's request for the current time in Tokyo should have been handled by the get_time function instead.", - "output": 1, - "confidence": 0.95, - "correction": { - "reason_types": [ - "IRRELEVANT_FUNCTION", - "BETTER_FUNCTION_EXISTS" - ], - "reasons": "translate_text performs text translation, not time retrieval; get_time function directly addresses user's request for current time in Tokyo", - "corrected_function_name": "get_time" - } - } + "role": "user", + "content": "Show me my calendar events from July 17 2024 to July 21 2024." }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Show me my calendar events from July 17 2024 to July 21 2024." - }, - { - "role": "assistant", - "content": "{\"id\":\"call_001\",\"type\":\"function\",\"function\":{\"name\":\"get_calendar_events\",\"arguments\":{\"start_date\":\"2024-07-17\",\"end_date\":\"2024-07-21\"}}}" - } - ], - "tools_inventory": [ - { - "name": "get_calendar_events", - "description": "Retrieve calendar events for a given time range (e.g., today, upcoming, past).", - "parameters": { - "start_date": "string", - "end_date": "string" - } - } - ], - "selected_function": "get_calendar_events", - "proposed_tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "get_calendar_events", - "arguments": "{ \"start_date\": \"2024-07-17\", \"end_date\": \"2024-07-21\" }" - } - } - }, - "output": { - "evidence": "The same function call (get_calendar_events with start_date=2024-07-17 and end_date=2024-07-21) was already executed earlier and produced a valid response. There was no user message requesting updated information.", - "explanation": "This tool call is redundant. The assistant re-issued the same call with identical parameters and without any new conversational signal. Such repetition adds no value.", - "output": 1, - "confidence": 0.97, - "correction": { - "reason_types": [ - "REDUNDANT_CALL" - ], - "reasons": "Identical tool call was already made and completed. 
No user input suggests the need for an update.", - "corrected_function_name": "no_function_redundant" - } - } + "role": "assistant", + "content": "{\"id\":\"call_001\",\"type\":\"function\",\"function\":{\"name\":\"get_calendar_events\",\"arguments\":{\"start_date\":\"2024-07-17\",\"end_date\":\"2024-07-21\"}}}" + } + ], + "tools_inventory": [ + { + "name": "get_calendar_events", + "description": "Retrieve calendar events for a given time range (e.g., today, upcoming, past).", + "parameters": { + "start_date": "string", + "end_date": "string" + } + } + ], + "selected_function": "get_calendar_events", + "proposed_tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "get_calendar_events", + "arguments": "{ \"start_date\": \"2024-07-17\", \"end_date\": \"2024-07-21\" }" + } + } + }, + "output": { + "evidence": "The same function call (get_calendar_events with start_date=2024-07-17 and end_date=2024-07-21) was already executed earlier and produced a valid response. There was no user message requesting updated information.", + "explanation": "This tool call is redundant. The assistant re-issued the same call with identical parameters and without any new conversational signal. Such repetition adds no value.", + "output": 1, + "confidence": 0.97, + "correction": { + "reason_types": [ + "REDUNDANT_CALL" + ], + "reasons": "Identical tool call was already made and completed. No user input suggests the need for an update.", + "corrected_function_name": "no_function_redundant" + } + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Remind me to call Mom tomorrow at 6 PM." + } + ], + "tools_inventory": [ + { + "name": "get_reminders", + "description": "Retrieve your scheduled reminders.", + "parameters": {} }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Remind me to call Mom tomorrow at 6 PM." - } - ], - "tools_inventory": [ - { - "name": "get_reminders", - "description": "Retrieve your scheduled reminders.", - "parameters": {} - }, - { - "name": "delete_reminder", - "description": "Remove a reminder by its ID.", - "parameters": { - "reminder_id": "string" - } - } - ], - "selected_function": "get_reminders", - "proposed_tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_reminders", - "arguments": "{}" - } - } - }, - "output": { - "evidence": "User request: \"Remind me to call Mom tomorrow at 6 PM.\" Available tools are get_reminders and delete_reminder, neither of which supports creating new reminders.", - "explanation": "The user's intent is to schedule a reminder, but the available tools do not support this. get_reminders retrieves existing reminders, and delete_reminder removes them. 
There is no tool for creating a reminder.", - "output": 1, - "confidence": 0.93, - "correction": { - "reason_types": [ - "MISSING_FUNCTION" - ], - "reasons": "User request requires a reminder creation tool, which is not present in the tool inventory.", - "corrected_function_name": "no_function" - } - } + "name": "delete_reminder", + "description": "Remove a reminder by its ID.", + "parameters": { + "reminder_id": "string" + } + } + ], + "selected_function": "get_reminders", + "proposed_tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_reminders", + "arguments": "{}" + } + } + }, + "output": { + "evidence": "User request: \"Remind me to call Mom tomorrow at 6 PM.\" Available tools are get_reminders and delete_reminder, neither of which supports creating new reminders.", + "explanation": "The user's intent is to schedule a reminder, but the available tools do not support this. get_reminders retrieves existing reminders, and delete_reminder removes them. There is no tool for creating a reminder.", + "output": 1, + "confidence": 0.93, + "correction": { + "reason_types": [ + "MISSING_FUNCTION" + ], + "reasons": "User request requires a reminder creation tool, which is not present in the tool inventory.", + "corrected_function_name": "no_function" + } + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "What is the weather like in Paris?" + } + ], + "tools_inventory": [ + { + "name": "get_weather", + "description": "Retrieve current weather conditions for a specified city.", + "parameters": { + "city": "string" + } }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "What is the weather like in Paris?" - } - ], - "tools_inventory": [ - { - "name": "get_weather", - "description": "Retrieve current weather conditions for a specified city.", - "parameters": { - "city": "string" - } - }, - { - "name": "get_time", - "description": "Retrieve current local time for a specified timezone.", - "parameters": { - "timezone": "string" - } - } - ], - "selected_function": "get_weather", - "proposed_tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{ \"city\": \"London\" }" - } - } - }, - "output": { - "evidence": "User request for weather in London is clear and unambiguous.", - "explanation": "The function selection is appropriate as it directly addresses the user's request for weather information. Note: The mistake in the 'city' parameter is non relevant to the selection of the function.", - "output": 5, - "confidence": 0.95, - "correction": {} - } + "name": "get_time", + "description": "Retrieve current local time for a specified timezone.", + "parameters": { + "timezone": "string" + } } - ] - }, - { - "name": "agentic_constraints_satisfaction", - "task_description": "Evaluate whether the proposed tool call satisfies all agentic constraints required for safe execution.\n\nYour judgment must be based strictly on the provided conversation history and tools inventory. Do not infer or assume any constraints or workflows not explicitly documented.\nIMPORTANT: Evaluate only the agentic constraints satisfaction. Do not assess the function's execution, parameter values correctness, or any other aspect.\n\n---\n\n### 1. 
Explicit Prerequisites\n- Confirm that required prior actions (e.g., authentication, file upload) have been completed.\n- Accept only direct evidence such as: \"Auth token acquired\" or \"File uploaded successfully.\"\n\n---\n\n### 2. Tool Sequencing\n- Enforce sequencing only when the tool description explicitly states a required order.\n- If no such requirement is documented, assume calls can be executed independently.\n\n---\n\n### 3. Call Independence\n- Calls that are not explicitly marked as dependent may be treated as parallel or order-independent.\n- Do not impose workflow constraints unless specified.\n\n---\n\n### 4. Redundancy\n- Flag the tool call as redundant only if the same function with the same parameters was already executed and no contextual change justifies repetition.\n\n---\n\n### Final Rule\nYour evaluation must be conservative: flag constraint violations only when there is clear, explicit evidence. Do not assume or invent requirements not documented in the conversation or tool descriptions.",
-        "jsonschema": {
-            "title": "agentic_constraints_satisfaction",
-            "description": "Assessment of whether the tool call satisfies all agentic constraints as defined in the provided conversation and tool inventory.",
-            "type": "object",
-            "additionalProperties": false,
-            "properties": {
-                "evidence": {
-                    "type": "string",
-                    "description": "Quote specific lines from the conversation or tool specifications showing whether constraints (e.g., prerequisites, sequence, parameter relationships) are satisfied or violated. Include exact wording, not paraphrased summaries."
-                },
-                "explanation": {
+        ],
+        "selected_function": "get_weather",
+        "proposed_tool_call": {
+          "id": "call_001",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "arguments": "{ \"city\": \"London\" }"
+          }
+        }
+      },
+      "output": {
+        "evidence": "User request for weather in Paris is clear and unambiguous.",
+        "explanation": "The function selection is appropriate as it directly addresses the user's request for weather information. Note: The mistake in the 'city' parameter is not relevant to the selection of the function.",
+        "output": 5,
+        "confidence": 0.95,
+        "correction": {}
+      }
+    }
+  ]
+},
+{
+  "name": "agentic_constraints_satisfaction",
+  "task_description": "Evaluate whether the tool call satisfies the agentic constraints required for safe execution. Focus ONLY on prerequisites, sequencing, and confirmation \u2014 do NOT grade parameter correctness.\n\n---\n\n### 1. Explicit Prerequisites\n- Required prior actions (authentication, ID lookup, etc.) must be satisfied when the tool description or system policy documents them. Accept any of: a successful prior tool call that produced the needed input; an explicit authorization statement; or a documented default.\n- Do NOT fabricate prerequisites the tool description or policy does not state.\n\n### 2. Tool Sequencing\n- Enforce ordering only when the tool description explicitly requires it. Otherwise treat calls as order-independent.\n\n### 3. The Current Call May Be a Prerequisite\n- A read-only lookup that gathers information for a later mutating call is itself a valid step; do NOT reject it as \"premature\".\n\n---\n\n#### Stringency\n\nJudge leniently when the call is information-gathering or exploratory \u2014 reads, lookups, searches, descriptions, probes \u2014 and it is plausibly on-path toward the user's stated or implied goal. 
Reserve rejection (output 1-2) for calls that are clearly wrong given the inventory, semantically unrelated to the conversation, or that violate an explicit documented constraint. Do not reject a call solely because a different function might also have worked.\n\n#### Confirmation Scope\n\nExplicit user confirmation is required only before actions that change persistent state on behalf of the user (creating, modifying, or deleting records; sending messages; committing transactions). Inspect-only or information-gathering calls \u2014 reads, lookups, searches, descriptions, status checks \u2014 do NOT require prior user confirmation, even when a later mutating action in the same flow will.",
+    "jsonschema": {
+      "title": "agentic_constraints_satisfaction",
+      "description": "Assessment of whether the tool call satisfies all agentic constraints as defined in the provided conversation and tool inventory.",
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "evidence": {
+          "type": "string",
+          "description": "Quote specific lines from the conversation or tool specifications showing whether constraints (e.g., prerequisites, sequence, parameter relationships) are satisfied or violated. Include exact wording, not paraphrased summaries."
+        },
+        "explanation": {
+          "type": "string",
+          "description": "Explain clearly whether all agentic constraints are satisfied. Address prerequisite completion, sequencing logic, redundancy, and required confirmations, based solely on the provided context."
+        },
+        "output": {
+          "type": "integer",
+          "minimum": 1,
+          "maximum": 5,
+          "threshold_low": 3,
+          "threshold_high": 5,
+          "description": "Agentic Constraint Satisfaction Score (1-5). Judge ONLY prerequisites, sequencing, and confirmation-before-mutation.\n\n- **5**: All documented prerequisites are satisfied, OR this call is itself gathering a prerequisite. Sequencing respects any explicit ordering in the tool description. For mutating calls, there is evidence of user confirmation when the policy requires it.\n- **4**: Minor uncertainty about a single prerequisite \u2014 the documented signal is indirect but plausible given the earlier turns.\n- **3**: One prerequisite is undetermined \u2014 not clearly met, not clearly missing.\n- **2**: A required prior step was documented in the tool description or system policy but is missing from the trajectory.\n- **1**: A critical ordering rule is violated (explicit documented sequence broken), the call is a redundant SAME-function-SAME-arguments repeat, OR a mutating action proceeds without a confirmation signal that policy explicitly requires."
+        },
+        "confidence": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "threshold_low": 0,
+          "threshold_high": 1,
+          "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
+        },
+        "correction": {
+          "type": "object",
+          "description": "For satisfied constraints: Provide an empty object {}. 
For constraint violations: Provide an object with reason_types, reasons, and either prerequisite_tool_calls (for missing prerequisites) or corrected_function_name (when a different function, or no function at all, should be called).",
+          "properties": {
+            "reason_types": {
+              "type": "array",
+              "description": "Categories of constraint violations, if any.",
+              "items": {
+                "type": "string",
+                "enum": [
+                  "MISSING_PREREQUISITE",
+                  "REDUNDANT_CALL",
+                  "OTHER"
+                ]
+              }
+            },
+            "reasons": {
+              "type": "string",
+              "description": "Concise justification for the detected constraint violations, tied to specific evidence."
+            },
+            "prerequisite_tool_calls": {
+              "type": "array",
+              "description": "If the issue is missing prerequisites, list the tool calls that should precede this one.",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "name": {
                    "type": "string",
-                        "description": "Explain clearly whether all agentic constraints are satisfied. Address prerequisite completions, sequencing logic, redundancy, parameter presence, and parameter consistency-based solely on the provided context."
-                    },
-                    "output": {
-                        "type": "integer",
-                        "minimum": 1,
-                        "maximum": 5,
-                        "threshold_low": 3,
-                        "threshold_high": 5,
-                        "description": "Agentic Constraints Satisfaction Score (1-5):\n\n- 5: Fully compliant with all known constraints\n The tool call satisfies all documented prerequisites, sequencing rules, and redundancy checks based on explicit evidence in the conversation and tool inventory.\n Example:\n User: \"Here is my file.\"\n Assistant: (File upload confirmed)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 4: Likely compliant, with minor uncertainty\n All major constraints are satisfied, but some minor dependency or precondition may not be fully confirmed, though it is likely met based on adjacent context.\n Example:\n User: \"I've uploaded the file already.\"\n (No explicit upload confirmation, but no contradiction)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 3: Ambiguous or undetermined\n It is unclear whether all required conditions or dependencies are met. The call might be valid, but confirmation is lacking.\n Example:\n User: \"Can you summarize the document?\"\n (No evidence of file upload or confirmation)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 2: Likely violates one or more constraints\n At least one key constraint-such as missing authentication, a required prior step, or improper sequencing-appears unsatisfied.\n Example:\n User: \"Please delete my account.\"\n (No authentication step documented)\n -> Tool call: delete_account(user_id=\"456\")\n\n- 1: Clearly violates agentic constraints\n The tool call ignores an explicit precondition, repeats a redundant action, or breaks an ordering rule described in the tool documentation.\n Example:\n Assistant already called: upload_file(name=\"report.pdf\")\n -> Immediately repeats: upload_file(name=\"report.pdf\") without any contextual change"
-                    },
-                    "confidence": {
-                        "type": "number",
-                        "minimum": 0,
-                        "maximum": 1,
-                        "threshold_low": 0,
-                        "threshold_high": 1,
-                        "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
                    },
-                    "correction": {
-                        "type": "object",
-                        "description": "For satisfied constraints: Provide an empty object {}. 
For constraint violations: Provide an object with reason_types, reasons, and either prerequisite_tool_calls (for missing prerequisites) or corrected_function (for parameter issues).", - "properties": { - "reason_types": { - "type": "array", - "description": "Categories of constraint violations, if any.", - "items": { - "type": "string", - "enum": [ - "MISSING_PREREQUISITE", - "REDUNDANT_CALL", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Concise justification for the detected constraint violations, tied to specific evidence." - }, - "prerequisite_tool_calls": { - "type": "array", - "description": "If the issue is missing prerequisites, list the tool calls that should precede this one.", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string", - "description": "Name of the required function" - } - }, - "required": [] - } - }, - "corrected_function_name": { - "type": "string", - "description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary." - } - }, - "required": [] - } + "required": [] + } }, - "required": [ - "evidence", - "explanation", - "output", - "confidence", - "correction" - ] + "corrected_function_name": { + "type": "string", + "description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary." + } + }, + "required": [] + } + }, + "required": [ + "evidence", + "explanation", + "output", + "confidence", + "correction" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "assistant", + "content": "Auth token acquired for user 42." + } + ], + "tools_inventory": [ + { + "name": "get_order_history", + "description": "Retrieve past orders", + "parameters": { + "user_id": "integer" + } + } + ], + "selected_function": "get_order_history", + "proposed_tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_order_history", + "arguments": "{ \"user_id\": 42 }" + } + } }, - "examples": [ + "output": { + "evidence": "Assistant message: \"Auth token acquired for user 42.\"", + "explanation": "Authentication is confirmed by assistant's message. No other prerequisites are required by the tool description. All agentic constraints are satisfied.", + "output": 5, + "confidence": 0.95, + "correction": {} + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "assistant", - "content": "Auth token acquired for user 42." - } - ], - "tools_inventory": [ - { - "name": "get_order_history", - "description": "Retrieve past orders", - "parameters": { - "user_id": "integer" - } - } - ], - "selected_function": "get_order_history", - "proposed_tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_order_history", - "arguments": "{ \"user_id\": 42 }" - } - } - }, - "output": { - "evidence": "Assistant message: \"Auth token acquired for user 42.\"", - "explanation": "Authentication is confirmed by assistant's message. No other prerequisites are required by the tool description. All agentic constraints are satisfied.", - "output": 5, - "confidence": 0.95, - "correction": {} - } + "role": "user", + "content": "Translate 'Hola' to English." + } + ], + "tools_inventory": [ + { + "name": "translate_text", + "description": "Translate text to target language. 
REQUIRES: Prior successful call to detect_language for the input text.", + "parameters": { + "text": "string", + "target": "string" + } }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Translate 'Hola' to English." - } - ], - "tools_inventory": [ - { - "name": "translate_text", - "description": "Translate text to target language. REQUIRES: Prior successful call to detect_language for the input text.", - "parameters": { - "text": "string", - "target": "string" - } - }, - { - "name": "detect_language", - "description": "Detect the language of the input text.", - "parameters": { - "text": "string" - } - } - ], - "selected_function": "translate_text", - "proposed_tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "translate_text", - "arguments": "{ \"text\": \"Hola\", \"target\": \"en\" }" - } - } - }, - "output": { - "evidence": "Tool description requires prior detect_language call. No such call appears in conversation.", - "explanation": "translate_text was used without satisfying its documented prerequisite, violating agentic constraints.", - "output": 1, - "confidence": 0.95, - "correction": { - "reason_types": [ - "MISSING_PREREQUISITE" - ], - "reasons": "translate_text was called before detect_language despite a required sequence.", - "prerequisite_tool_calls": [ - { - "name": "detect_language" - } - ] - } - } + "name": "detect_language", + "description": "Detect the language of the input text.", + "parameters": { + "text": "string" + } + } + ], + "selected_function": "translate_text", + "proposed_tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "translate_text", + "arguments": "{ \"text\": \"Hola\", \"target\": \"en\" }" + } + } + }, + "output": { + "evidence": "Tool description requires prior detect_language call. No such call appears in conversation.", + "explanation": "translate_text was used without satisfying its documented prerequisite, violating agentic constraints.", + "output": 1, + "confidence": 0.95, + "correction": { + "reason_types": [ + "MISSING_PREREQUISITE" + ], + "reasons": "translate_text was called before detect_language despite a required sequence.", + "prerequisite_tool_calls": [ + { + "name": "detect_language" + } + ] + } + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Please translate 'Bonjour' to English." + } + ], + "tools_inventory": [ + { + "name": "translate_text", + "description": "Translate input text to a specified target language.", + "parameters": { + "text": "string", + "target": "string" + } }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Please translate 'Bonjour' to English." - } - ], - "tools_inventory": [ - { - "name": "translate_text", - "description": "Translate input text to a specified target language.", - "parameters": { - "text": "string", - "target": "string" - } - }, - { - "name": "detect_language", - "description": "Detect the language of the input text.", - "parameters": { - "text": "string" - } - } - ], - "selected_function": "translate_text", - "proposed_tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "translate_text", - "arguments": "{ \"text\": \"Bonjour\", \"target\": \"en\" }" - } - } - }, - "output": { - "evidence": "The translate_text tool description does not specify any prerequisite or dependency on detect_language.", - "explanation": "There is no constraint violation. 
The tool can be used independently since its description does not document any required sequencing or prerequisite.", - "output": 5, - "confidence": 0.94, - "correction": {} - } + "name": "detect_language", + "description": "Detect the language of the input text.", + "parameters": { + "text": "string" + } } - ] - } + ], + "selected_function": "translate_text", + "proposed_tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "translate_text", + "arguments": "{ \"text\": \"Bonjour\", \"target\": \"en\" }" + } + } + }, + "output": { + "evidence": "The translate_text tool description does not specify any prerequisite or dependency on detect_language.", + "explanation": "There is no constraint violation. The tool can be used independently since its description does not document any required sequencing or prerequisite.", + "output": 5, + "confidence": 0.94, + "correction": {} + } + } + ] + } ] diff --git a/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter.py b/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter.py index e13d2657..9f8742e8 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter.py +++ b/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter.py @@ -4,6 +4,7 @@ ) _parameter_system: str = ( + "{{ common_principles }}\n\n" "### Task Description:\n\n" "{{ task_description }}\n\n" "Your output must conform to the following JSON schema, in the same order as the fields appear in the schema:\n" diff --git a/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter_metrics.json b/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter_metrics.json index 0527f4a4..03cfd736 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter_metrics.json +++ b/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter_metrics.json @@ -1,613 +1,673 @@ [ - { - "name": "parameter_hallucination_check", - "task_description": "You are an expert evaluator assessing whether a **specific parameter value**-identified as `parameter_name`-in a tool call is **grounded** in the provided conversation history or tool specification, or whether it is **hallucinated**.\n\nYour task is to make a **strictly evidence-based** judgment. Evaluate this parameter alone-ignore other parameters or external knowledge. The value must:\n- Be explicitly supported by the dialogue, prior tool calls, or tool specification\n- Be appropriate for the context and user intent\n- Respect constraint rules defined in the specification\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided. Do not evaluate the entire tool call correctness, other parameters, or the function itself. 
Focus solely on whether this parameter value is grounded in the conversation or specification.\n---\n\n### Rating Scale\n\nAssign a score from 1 to 5 based on how clearly the value is grounded:\n\n**5 - Perfectly Grounded** \nExplicitly stated in conversation or matches a clearly documented and contextually appropriate default.\n\n**4 - Mostly Grounded** \nNot directly quoted but clearly follows from the conversation or is a minor variation on a documented default.\n\n**3 - Ambiguous** \nRequires non-trivial inference or transformation to relate the value to the dialogue or specification.\n\n**2 - Mostly Ungrounded** \nLoosely related, incorrectly formatted, or partially contradicts the spec or context.\n\n**1 - Completely Ungrounded** \nInvented, unrelated, or clearly in conflict with the dialogue or specification.\n\nBe conservative in your scoring: Use 1-3 for weak or unverified grounding, and 4-5 only when evidence is explicit or clearly reliable.\n\n---\n\n### Acceptable Sources for Grounding\nA value may be considered grounded if it comes from:\n- Explicit user input\n- Clearly implied user intent (minimal inference)\n- Assistant statements explicitly confirmed by the user\n- Outputs of previous tool calls\n- Documented default values in the tool specification\n\n---\n\n### Ungrounded Patterns\nMark a value ungrounded if:\n- It is absent from both conversation and tool spec\n- It depends on vague or missing input (e.g., \"tomorrow\" without a date format)\n- It uses the wrong type, unit, or format\n- It contradicts other parameters or tool constraints\n- It inaccurately rephrases or transforms the user's input\n\n---\n\n### Handling Defaults\nA default value may be accepted only if:\n- It is explicitly documented in the tool spec\n- It fits the context of the user's request\n- It does not contradict stated or implied intent\nOtherwise, treat the value as ungrounded.\n\n---\n\n### Parameter Relationships\nEnsure this parameter does not contradict other parameters in the same call.\nExample: If `add_day=false` and `day=\"Sunday\"` but the spec says `add_day` must be true to use `day`, the `day` value is ungrounded.\n\n---\n\n### Final Guideline\nLarge language models frequently hallucinate parameter values or apply defaults inappropriately. Your careful and conservative evaluation prevents propagation of such errors.", - "jsonschema": { - "title": "parameter_hallucination_check", - "description": "Assessment of tool call parameter hallucination, following the rubric defined above.", + { + "name": "parameter_hallucination_check", + "task_description": "You are an expert evaluator assessing whether a **specific parameter value**-identified as `parameter_name`-in a tool call is **grounded** in the provided conversation history or tool specification, or whether it is **hallucinated**.\n\nYour task is to make a **strictly evidence-based** judgment. Evaluate this parameter alone-ignore other parameters or external knowledge. The value must:\n- Be explicitly supported by the dialogue, prior tool calls, or tool specification\n- Be appropriate for the context and user intent\n- Respect constraint rules defined in the specification\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided. Do not evaluate the entire tool call correctness, other parameters, or the function itself. 
Focus solely on whether this parameter value is grounded in the conversation or specification.\n---\n\n### Rating Scale\n\nAssign a score from 1 to 5 based on how clearly the value is grounded:\n\n**5 - Perfectly Grounded** \nExplicitly stated in conversation or matches a clearly documented and contextually appropriate default.\n\n**4 - Mostly Grounded** \nNot directly quoted but clearly follows from the conversation or is a minor variation on a documented default.\n\n**3 - Ambiguous** \nRequires non-trivial inference or transformation to relate the value to the dialogue or specification.\n\n**2 - Mostly Ungrounded** \nLoosely related, incorrectly formatted, or partially contradicts the spec or context.\n\n**1 - Completely Ungrounded** \nInvented, unrelated, or clearly in conflict with the dialogue or specification.\n\nBe conservative in your scoring: Use 1-3 for weak or unverified grounding, and 4-5 only when evidence is explicit or clearly reliable.\n\n---\n\n### Acceptable Sources for Grounding\nA value may be considered grounded if it comes from:\n- Explicit user input\n- Clearly implied user intent (minimal inference)\n- Assistant statements explicitly confirmed by the user\n- Outputs of previous tool calls\n- Documented default values in the tool specification\n\n---\n\n### Ungrounded Patterns\nMark a value ungrounded if:\n- It is absent from both conversation and tool spec\n- It depends on vague or missing input (e.g., \"tomorrow\" without a date format)\n- It uses the wrong type, unit, or format\n- It contradicts other parameters or tool constraints\n- It inaccurately rephrases or transforms the user's input\n\n---\n\n### Handling Defaults\nA default value may be accepted only if:\n- It is explicitly documented in the tool spec\n- It fits the context of the user's request\n- It does not contradict stated or implied intent\nOtherwise, treat the value as ungrounded.\n\n---\n\n### Parameter Relationships\nEnsure this parameter does not contradict other parameters in the same call.\nExample: If `add_day=false` and `day=\"Sunday\"` but the spec says `add_day` must be true to use `day`, the `day` value is ungrounded.\n\n---\n\n### Final Guideline\nLarge language models frequently hallucinate parameter values or apply defaults inappropriately. Your careful and conservative evaluation prevents propagation of such errors.\n\n---\n\n#### Grounding Paths\n\nA parameter value is grounded via any of these paths:\n- **Explicit statement** in a user, assistant, tool, or system message.\n- **Documented default** in the API spec for this parameter.\n- **Pick-one** \u2014 the user enumerated multiple valid values and the agent picked one.\n- **Specialization** \u2014 the user named a set/range/category and the agent supplied a concrete value inside that scope. A value that falls OUTSIDE the named scope is a mismatch, not missing information.\n- **Plausible optional value** \u2014 if this parameter is optional (not in `required`), any spec-consistent value is grounded unless it contradicts explicit user intent.\n\n---\n\n#### Required-Parameter Omission\n\nA parameter listed in the spec's `required` array that is completely absent from the arguments is ALWAYS MISSING_INFORMATION \u2014 no grounding path (explicit statement, documented default, pick-one, specialization, or plausible-optional) substitutes for actually providing a value. 
The grounding paths only apply to values the agent ACTUALLY PROVIDED; they never excuse omission of a required parameter.\n\n#### User-Context Discarded\n\nWhen a parameter is spec-optional but the user (or prior tool output / system prompt) provided an explicit narrowing value for it \u2014 a search term, a filter, a category, an identifier \u2014 and the agent OMITS the parameter or sets it to an \"empty\" sentinel (empty string, null, default wildcard), this is NOT a \"plausible optional value\". It is USER_CONTEXT_DISCARDED and should be labeled CONTRADICTORY_VALUES (the call contradicts explicit user intent). Score the call 2-3 accordingly. This rule applies regardless of whether the spec marks the parameter as required; it turns on whether the user supplied narrowing context for it.\n\n---\n\n#### Stringency\n\nApply this rubric strictly for REQUIRED parameters whose value is missing, fabricated, or inconsistent with the spec. Apply it leniently for OPTIONAL parameters (not in the spec's `required` array) and for values reasonably inferable from visible context. A value is NOT an issue unless it contradicts explicit user intent, a documented constraint, or higher-priority evidence.\n\n---\n\n#### Actionable Recommendations\n\nWhen the call reveals a fixable gap in the agent's prompts or tool specifications, emit one or more `actionable_recommendations` as unified-diff patches. Each recommendation specifies: `target` (which artifact to change: `system_prompt`, `tool_description`, `parameter_description`, `parameter_examples`), an optional `tool_name` / `parameter_name` for tool-scoped targets, a `diff` in unified-diff format, a short `rationale`, and an `importance` score in [0, 1] (0 = cosmetic, 1 = critical). When the call has no fixable artifact gap \u2014 or the call is already correct \u2014 return an empty array.", + "jsonschema": { + "title": "parameter_hallucination_check", + "description": "Assessment of tool call parameter hallucination, following the rubric defined above.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Provide EXACT quotes from the conversation history (user messages, assistant responses, previous tool outputs) or tool specification that directly support your assessment. For grounded values, cite the specific text where this value originates. For ungrounded values, demonstrate the absence of supporting evidence or cite contradictory information." + }, + "explanation": { + "type": "string", + "description": "Explain in detail why the specific parameter value is grounded or ungrounded. Reference exact sources from the conversation or tool specification that either support or contradict the parameter value. Address: 1) Where the value originated from, 2) Whether this source is sufficient for grounding, 3) Any format or type compliance issues, 4) Any contradictions with other parameters or specifications, and 5) For default values, whether they're documented and appropriate for this context." + }, + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 4, + "threshold_high": 5, + "description": "Parameter Grounding Score for this single parameter (1-5). Judge ONLY the named parameter's value; other parameters are out of scope.\n\n- **5**: Value is grounded via explicit statement, documented default, pick-one from user-provided options, or in-scope specialization. 
For optional parameters, a plausible spec-consistent value is CORRECT.\n- **4**: Value is grounded by a clear but unstated inference a careful reader would accept, or by a concrete specialization inside the user's abstract intent.\n- **3**: Grounding is weak \u2014 the value is plausible given the context but there is real ambiguity about whether it matches the user's intent.\n- **2**: The value is not supported by any grounding path; it appears fabricated or falls outside the user's named scope.\n- **1**: The value clearly contradicts explicit evidence, a documented constraint, or the user's named scope." + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." + }, + "correction": { + "type": "object", + "properties": { + "reason_types": { + "type": "array", + "description": "Types of issues with the parameter value, if any. Use one or more of these values: FORMAT_ERROR (wrong format or type), MISSING_INFORMATION (needs more data), PARAMETER_CONTRADICTION (conflicts with other parameters), DEFAULT_ISSUE (inappropriate default), OTHER (explain in reasons).", + "items": { + "type": "string", + "enum": [ + "FORMAT_ERROR", + "MISSING_INFORMATION", + "PARAMETER_CONTRADICTION", + "DEFAULT_ISSUE", + "OTHER" + ] + } + }, + "reasons": { + "type": "string", + "description": "Concise explanation of the specific issues with the parameter value." + }, + "parameter": { + "type": "object", + "description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.", + "additionalProperties": true + } + }, + "description": "For well-grounded values: Provide an empty object {}. For ungrounded values: Provide an object with reason_types, reasons, and parameter correction.", + "required": [] + }, + "actionable_recommendations": { + "type": "array", + "description": "List of actionable recommendations as unified-diff patches. Emit only when the call reveals a fixable gap in the agent's prompts or tool specifications. Empty array when there is nothing to fix (including when the call is already correct).", + "items": { "type": "object", "additionalProperties": false, "properties": { - "evidence": { - "type": "string", - "description": "Provide EXACT quotes from the conversation history (user messages, assistant responses, previous tool outputs) or tool specification that directly support your assessment. For grounded values, cite the specific text where this value originates. For ungrounded values, demonstrate the absence of supporting evidence or cite contradictory information." - }, - "explanation": { - "type": "string", - "description": "Explain in detail why the specific parameter value is grounded or ungrounded. Reference exact sources from the conversation or tool specification that either support or contradict the parameter value. 
Address: 1) Where the value originated from, 2) Whether this source is sufficient for grounding, 3) Any format or type compliance issues, 4) Any contradictions with other parameters or specifications, and 5) For default values, whether they're documented and appropriate for this context." - }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "Parameter Hallucination Score (1-5):\n\n- 5: Perfectly grounded\n The parameter value is directly quoted in the conversation or exactly matches a documented default that fits the user's intent.\n Example:\n User: \"Book a hotel in Rome.\"\n → Tool call: search_hotels(city=\"Rome\")\n\n- 4: Grounded via logical reasoning\n The value is not directly quoted but clearly follows from the conversation through straightforward, logic-based inference.\n Example:\n User: \"I'd like a room for me and my wife.\"\n → Tool call: search_hotels(guests=2)\n\n- 3: Ambiguous\n The value might relate to the conversation or domain context, but grounding requires assumptions or nontrivial interpretation not clearly supported by evidence.\n Example:\n User: \"Let's go somewhere warm.\"\n → Tool call: search_hotels(city=\"Barcelona\")\n\n- 2: Likely hallucinated\n The value is loosely connected to the topic but lacks grounding in the conversation or tool specification, or only partially aligns.\n Example:\n User: \"Find hotels.\"\n → Tool call: search_hotels(city=\"Berlin\")\n (No city mentioned)\n\n- 1: Clearly hallucinated\n The parameter value is invented, irrelevant, or contradicts the dialogue or tool specification.\n Example:\n User: \"Book a hotel.\"\n → Tool call: search_hotels(city=\"Mars\")" - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." - }, - "correction": { - "type": "object", - "properties": { - "reason_types": { - "type": "array", - "description": "Types of issues with the parameter value, if any. Use one or more of these values: FORMAT_ERROR (wrong format or type), MISSING_INFORMATION (needs more data), PARAMETER_CONTRADICTION (conflicts with other parameters), DEFAULT_ISSUE (inappropriate default), OTHER (explain in reasons).", - "items": { - "type": "string", - "enum": [ - "FORMAT_ERROR", - "MISSING_INFORMATION", - "PARAMETER_CONTRADICTION", - "DEFAULT_ISSUE", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Concise explanation of the specific issues with the parameter value." - }, - "parameter": { - "type": "object", - "description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.", - "additionalProperties": true - } - }, - "description": "For well-grounded values: Provide an empty object {}. 
For ungrounded values: Provide an object with reason_types, reasons, and parameter correction.", - "required": [] - }, - "actionable_recommendations": { - "type": "array", - "description": "Provide specific, actionable recommendations for the agent developer to prevent parameter value hallucination. Only provide recommendations when problems are detected. Include multiple detailed suggestions where appropriate, such as:\n\n1. PARAMETER_DOCUMENTATION: Suggest specific clarifications to parameter descriptions that would help prevent hallucination, e.g., 'Update the \"username\" parameter description to: \"Must be explicitly provided by the user in conversation. No default value is permitted. Do not infer or guess usernames.\"'\n\n2. EXPLICIT_VALUE_EXTRACTION: Recommend concrete techniques for extracting parameter values reliably from user input, like 'Implement a structured extraction pattern for the \"count\" parameter that requires an explicit number from the user before proceeding, such as \"How many tweets would you like to see? (default is 10)\"'\n\n3. DEFAULT_VALUE_HANDLING: Propose specific parameter documentation improvements for default values, e.g., 'Add to the \"count\" parameter description: \"Default: 10 if not specified by user. Never invent a count value; use default or ask user.\"'\n\n4. FORMAT_SPECIFICATION_IMPROVEMENTS: Suggest clearer format requirements in parameter documentation, like 'For the \"date\" parameter, specify: \"Must be in ISO format YYYY-MM-DD. Example: 2024-06-22. Do not use other date formats.\"'\n\n5. TYPE_VALIDATION_RULES: Propose validation logic for parameter types, such as 'Implement pre-call validation that ensures integer parameters like \"count\" are provided as numbers without quotes or text.'\n\n6. SYSTEM_PROMPT_ADDITIONS: Recommend specific additions to the system prompt that would help the agent avoid hallucination, e.g., 'Add to the system prompt: \"For parameters like 'count', always use explicitly provided values from the user. If not specified, use documented defaults or ask for clarification.\"'\n\n7. PARAMETER_NAMING_CONVENTIONS: Suggest improvements to parameter naming that would make their purpose clearer, e.g., 'Rename \"target_lang\" to \"language_code\" to clarify it should be an ISO code, not a full language name.'\n\n8. PRE_CALL_REFLECTION_STRATEGIES: Recommend strategies for the agent to reflect on parameter values before making tool calls, such as 'Implement a pre-call reflection step that checks if all required parameters are grounded in user input before proceeding with the function call.'\n\n9. PARAMETER_SOURCE_TRACKING: Suggest implementing a mechanism to track where each parameter value originated from (e.g., user input, default value) to improve transparency and debugging.\n\n10. CLARIFICATION_PROMPTS: Recommend adding clarification prompts when parameter values are ambiguous or missing, e.g., 'If the \"count\" parameter is not specified, ask the user: \"How many items would you like to retrieve? (default is 10)\"'\n\n11. 
OTHER: Any other specific recommendations that would help prevent similar parameter hallucination issues in the future.", - "items": { - "type": "object", - "properties": { - "recommendation": { - "type": "string", - "description": "A specific, actionable recommendation to improve the agent's parameter handling process.", - "enum": [ - "PARAMETER_DOCUMENTATION", - "EXPLICIT_VALUE_EXTRACTION", - "DEFAULT_VALUE_HANDLING", - "FORMAT_SPECIFICATION_IMPROVEMENTS", - "TYPE_VALIDATION_RULES", - "SYSTEM_PROMPT_ADDITIONS", - "PARAMETER_NAMING_CONVENTIONS", - "PRE_CALL_REFLECTION_STRATEGIES", - "PARAMETER_SOURCE_TRACKING", - "CLARIFICATION_PROMPTS", - "OTHER" - ] - }, - "details": { - "type": "string", - "description": "Detailed explanation of the recommendation, including what specific changes should be made, how they will improve parameter handling, and any relevant examples or best practices." - } - }, - "required": [ - "recommendation", - "details" - ] - } - } + "target": { + "type": "string", + "enum": [ + "system_prompt", + "tool_description", + "parameter_description", + "parameter_examples" + ], + "description": "Which artifact the diff applies to. `system_prompt` changes the agent's top-level policy; the other three target a specific tool (and possibly one of its parameters)." + }, + "tool_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is tool_description, parameter_description, or parameter_examples. Use the exact function/tool name. Must be null when target is system_prompt." + }, + "parameter_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is parameter_description or parameter_examples. Null otherwise." + }, + "diff": { + "type": "string", + "description": "Unified git-diff-format patch of the proposed change. Format: `--- a/\\n+++ b/\\n@@ ... @@\\n-\\n+`. When there is no line-number context, use `@@` as the single hunk header. Keep diffs minimal \u2014 only the lines that meaningfully change." + }, + "rationale": { + "type": "string", + "description": "One-sentence reason why this change addresses the observed issue. <= 200 chars." + }, + "importance": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "How impactful this recommendation is at preventing similar issues in future calls (0 = cosmetic, 1 = critical). Calibrate consistently across metrics so downstream aggregation is comparable." + } }, "required": [ - "explanation", - "evidence", - "output", - "confidence", - "correction", - "actionable_recommendations" + "target", + "diff", + "rationale", + "importance" ] - }, - "examples": [ + } + } + }, + "required": [ + "explanation", + "evidence", + "output", + "confidence", + "correction", + "actionable_recommendations" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Translate 'hello' to Spanish." + } + ], + "tool_inventory": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Translate 'hello' to Spanish." 
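For readers wiring these metrics into an agent: the jsonschema blocks in this file are ordinary JSON Schema, plus the extra threshold_low/threshold_high keys, which standard validators simply ignore, so a judge's raw output can be checked before its score is trusted. A minimal sketch of that check, assuming the third-party jsonschema package and a trimmed copy of the shared output shape; the helper names and the inline schema are illustrative, not part of this patch:

    import json

    from jsonschema import ValidationError, validate  # assumed dependency

    # Trimmed copy of the output shape shared by the metrics in this file;
    # the authoritative schemas are the jsonschema entries above.
    METRIC_OUTPUT_SCHEMA = {
        "type": "object",
        "properties": {
            "evidence": {"type": "string"},
            "explanation": {"type": "string"},
            "output": {"type": "integer", "minimum": 1, "maximum": 5},
            "confidence": {"type": "number", "minimum": 0, "maximum": 1},
            "correction": {"type": "object"},
        },
        "required": ["evidence", "explanation", "output", "confidence", "correction"],
    }

    def score_passes(raw_judge_output: str, threshold_low: int = 4) -> bool:
        """Parse and validate a judge response, then apply the metric's low threshold."""
        try:
            result = json.loads(raw_judge_output)
            validate(instance=result, schema=METRIC_OUTPUT_SCHEMA)
        except (json.JSONDecodeError, ValidationError):
            return False  # malformed judge output is treated as a failed check
        return result["output"] >= threshold_low

Treating schema violations as failures keeps a malformed judge response from silently passing a gate, which is the conservative default these rubrics ask for.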
- } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "translate_text", - "description": "Translate given text", - "parameters": { - "type": "object", - "properties": { - "text": { - "type": "string", - "description": "The text to be translated" - }, - "target_lang": { - "type": "string", - "description": "The target language code, e.g., 'en' for English" - } - }, - "required": [ - "text", - "target_lang" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "translate_text", - "arguments": "{ \"text\": \"hello\", \"target_lang\": \"es\" }" - } + "type": "function", + "function": { + "name": "translate_text", + "description": "Translate given text", + "parameters": { + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to be translated" }, - "parameter_name": "target_lang", - "parameter_value": "es" - }, - "output": { - "evidence": "User request: \"Translate 'hello' to Spanish.\" Tool specification for translate_text includes parameter target_lang of type string.", - "explanation": "The parameter value target_lang='es' is properly grounded in the conversation. The user explicitly requested translation \"to Spanish\" and the agent correctly converted this to the ISO language code 'es' for Spanish. This conversion from the common language name to its standard code is a reasonable and expected transformation that requires minimal inference. The tool specification indicates target_lang should be a string, and 'es' is the appropriate ISO code for Spanish.", - "output": 5, - "confidence": 0.96, - "correction": {}, - "actionable_recommendations": [] + "target_lang": { + "type": "string", + "description": "The target language code, e.g., 'en' for English" + } + }, + "required": [ + "text", + "target_lang" + ] } - }, + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "translate_text", + "arguments": "{ \"text\": \"hello\", \"target_lang\": \"es\" }" + } + }, + "parameter_name": "target_lang", + "parameter_value": "es" + }, + "output": { + "evidence": "User request: \"Translate 'hello' to Spanish.\" Tool specification for translate_text includes parameter target_lang of type string.", + "explanation": "The parameter value target_lang='es' is properly grounded in the conversation. The user explicitly requested translation \"to Spanish\" and the agent correctly converted this to the ISO language code 'es' for Spanish. This conversion from the common language name to its standard code is a reasonable and expected transformation that requires minimal inference. The tool specification indicates target_lang should be a string, and 'es' is the appropriate ISO code for Spanish.", + "output": 5, + "confidence": 0.96, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Fetch my latest tweets." 
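The correction objects in these parameter metrics follow a small contract: an empty object means the value is well grounded; otherwise the parameter map either carries corrected values keyed by parameter name, or one of the sentinel keys need_more_information (ask the user) or need_more_tool_calls (run prerequisites first). A hedged sketch of one way a caller might act on that contract; the function name and return shape are assumptions, not ALTK API:

    from typing import Optional, Tuple

    def apply_parameter_correction(
        args: dict, correction: dict
    ) -> Tuple[dict, Optional[str]]:
        """Return (possibly updated tool arguments, clarification question or None)."""
        if not correction:
            return args, None  # empty object: the value was well grounded
        params = correction.get("parameter", {})
        if "need_more_information" in params:
            # Surface the judge's clarification question instead of calling the tool.
            return args, params["need_more_information"]
        if "need_more_tool_calls" in params:
            # The caller should schedule prerequisite tool calls before retrying.
            return args, None
        # Otherwise the object maps parameter names to corrected values.
        return {**args, **params}, None

    args, question = apply_parameter_correction(
        {"username": "elonmusk", "count": 20},
        {
            "reason_types": ["MISSING_INFORMATION"],
            "reasons": "Count value 20 is invented without user specification.",
            "parameter": {"need_more_information": "How many tweets would you like to retrieve?"},
        },
    )
    assert question is not None  # the agent should ask the user before retrying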
- } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "get_tweets", - "description": "Retrieve recent tweets", - "parameters": { - "type": "object", - "properties": { - "username": { - "type": "string", - "description": "The Twitter username to fetch tweets for" - }, - "count": { - "type": "integer", - "description": "The number of recent tweets to retrieve" - } - }, - "required": [ - "username", - "count" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_tweets", - "arguments": "{ \"username\": \"elonmusk\", \"count\": 20 }" - } - }, - "parameter_name": "count", - "parameter_value": 20 - }, - "output": { - "evidence": "User request: \"Fetch my latest tweets.\" - contains no mention of any specific count or number of tweets. Tool specification does not document a default value for the count parameter.", - "explanation": "The parameter value count=20 is ungrounded and appears to be hallucinated. The user requested \"latest tweets\" but did not specify any count or quantity. The tool specification defines count as an integer parameter but does not document a default value. The agent appears to have arbitrarily chosen the value 20 without any basis in the conversation or tool specification. Without user specification or a documented default, the agent should either ask the user how many tweets they want to see or use a reasonable, documented default value if one exists.", - "output": 1, - "confidence": 0.95, - "correction": { - "reason_types": [ - "MISSING_INFORMATION" - ], - "reasons": "Count value 20 is invented without user specification or documented default.", - "parameter": { - "need_more_information": "How many tweets would you like to retrieve? Please specify a number." - } + "role": "user", + "content": "Fetch my latest tweets." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "get_tweets", + "description": "Retrieve recent tweets", + "parameters": { + "type": "object", + "properties": { + "username": { + "type": "string", + "description": "The Twitter username to fetch tweets for" }, - "actionable_recommendations": [ - { - "recommendation": "PARAMETER_DOCUMENTATION", - "details": "Update the 'count' parameter description to: 'Number of tweets to retrieve. Default: 10 if not specified. Do not invent count values; use the default if not explicitly provided by the user.'" - }, - { - "recommendation": "SYSTEM_PROMPT_ADDITIONS", - "details": "Add to the system prompt: 'For the get_tweets function, only use explicitly provided count values from the user. When count is not specified, always use the default value of 10. Never invent arbitrary count values.'" - }, - { - "recommendation": "PRE_CALL_REFLECTION_STRATEGIES", - "details": "Implement a pre-call reflection step that checks if all required parameters are grounded in user input before proceeding with the function call." - } - ] + "count": { + "type": "integer", + "description": "The number of recent tweets to retrieve" + } + }, + "required": [ + "username", + "count" + ] } + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_tweets", + "arguments": "{ \"username\": \"elonmusk\", \"count\": 20 }" + } + }, + "parameter_name": "count", + "parameter_value": 20 + }, + "output": { + "evidence": "User request: \"Fetch my latest tweets.\" - contains no mention of any specific count or number of tweets. 
Tool specification does not document a default value for the count parameter.", + "explanation": "The parameter value count=20 is ungrounded and appears to be hallucinated. The user requested \"latest tweets\" but did not specify any count or quantity. The tool specification defines count as an integer parameter but does not document a default value. The agent appears to have arbitrarily chosen the value 20 without any basis in the conversation or tool specification. Without user specification or a documented default, the agent should either ask the user how many tweets they want to see or use a reasonable, documented default value if one exists.", + "output": 1, + "confidence": 0.95, + "correction": { + "reason_types": [ + "MISSING_INFORMATION" + ], + "reasons": "Count value 20 is invented without user specification or documented default.", + "parameter": { + "need_more_information": "How many tweets would you like to retrieve? Please specify a number." + } + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Can you show me just a few of my latest messages?" - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "get_messages", - "description": "Retrieve user's messages", - "parameters": { - "type": "object", - "properties": { - "user_id": { - "type": "integer", - "description": "The ID of the user whose messages are being retrieved" - }, - "limit": { - "type": "integer", - "description": "The maximum number of messages to retrieve" - } - }, - "required": [ - "user_id", - "limit" - ] - } - } - } - ], - "tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "get_messages", - "arguments": "{ \"user_id\": 456, \"limit\": 3 }" - } - }, - "parameter_name": "limit", - "parameter_value": 3 - }, - "output": { - "evidence": "User said: 'just a few of my latest messages.' The phrase 'a few' is vague and subjective, and while 3 is a reasonable interpretation, it is not explicitly stated.", - "explanation": "The value limit=3 is a plausible interpretation of the user's request for 'a few' messages. However, the term is ambiguous and does not unambiguously support the specific value of 3. The assistant made a judgment call based on conversational context, but without explicit grounding or a documented default, the score is 3.", - "output": 3, - "confidence": 0.79, - "correction": { - "reason_types": [ - "MISSING_INFORMATION" - ], - "reasons": "The user's phrasing is vague and does not explicitly support the value 3. Clarification is needed to ensure the correct value is used.", - "parameter": { - "need_more_information": "You asked for 'a few' messages. Could you specify how many exactly you'd like to see?" - } + "target": "tool_description", + "tool_name": "get_tweets", + "parameter_name": null, + "diff": "--- a/tool/get_tweets\n+++ b/tool/get_tweets\n@@\n+Call this only after you have gathered the inputs listed in `required`. 
If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Can you show me just a few of my latest messages?" + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "get_messages", + "description": "Retrieve user's messages", + "parameters": { + "type": "object", + "properties": { + "user_id": { + "type": "integer", + "description": "The ID of the user whose messages are being retrieved" }, - "actionable_recommendations": [ - { - "recommendation": "PRE_CALL_REFLECTION_STRATEGIES", - "details": "Encourage the assistant to clarify vague quantifiers like 'a few' with a specific number before making a tool call." - }, - { - "recommendation": "CLARIFICATION_PROMPTS", - "details": "Prompt the user for an exact number when their request includes ambiguous terms such as 'some' or 'a few.'" - } - ] + "limit": { + "type": "integer", + "description": "The maximum number of messages to retrieve" + } + }, + "required": [ + "user_id", + "limit" + ] } + } } - ] - }, - { - "name": "parameter_value_format_alignment", - "task_description": "Evaluate whether the specific parameter value (named in \"parameter_name\") conforms exactly to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter value. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided to ensure it meets the exact type, format, and unit requirements. Do not evaluate the entire tool call correctness, other parameters, or the function itself. Focus solely on whether this parameter value aligns with the specification.\n---\n\n### 1. Data Type Compliance (Output = 4-5)\n- Check that the value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2. Format Specification Compliance\n- Validate adherence to documented format constraints, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. 
Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility with other parameter values in the same call\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: Value exactly matches required type, format, and units\n- **4 - Minor Deviation**: Value is fundamentally correct but with minor formatting issues\n- **3 - Moderate Deviation**: Type or format is partially incorrect; may cause issues\n- **2 - Major Deviation**: Type and format are significantly wrong; likely to fail\n- **1 - Complete Mismatch**: Value is entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and the value seems plausible, prefer a higher score (4-5)\n- If the spec is clear and the value deviates, assign a lower score (1-3)\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.", - "jsonschema": { - "title": "parameter_value_format_alignment", - "description": "Assessment of a parameter value's compliance with required type, format, and unit specifications, based on the rubric above.", + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "get_messages", + "arguments": "{ \"user_id\": 456, \"limit\": 3 }" + } + }, + "parameter_name": "limit", + "parameter_value": 3 + }, + "output": { + "evidence": "User said: 'just a few of my latest messages.' The phrase 'a few' is vague and subjective, and while 3 is a reasonable interpretation, it is not explicitly stated.", + "explanation": "The value limit=3 is a plausible interpretation of the user's request for 'a few' messages. However, the term is ambiguous and does not unambiguously support the specific value of 3. The assistant made a judgment call based on conversational context, but without explicit grounding or a documented default, the score is 3.", + "output": 3, + "confidence": 0.79, + "correction": { + "reason_types": [ + "MISSING_INFORMATION" + ], + "reasons": "The user's phrasing is vague and does not explicitly support the value 3. Clarification is needed to ensure the correct value is used.", + "parameter": { + "need_more_information": "You asked for 'a few' messages. Could you specify how many exactly you'd like to see?" + } + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 + }, + { + "target": "tool_description", + "tool_name": "get_messages", + "parameter_name": null, + "diff": "--- a/tool/get_messages\n+++ b/tool/get_messages\n@@\n+Call this only after you have gathered the inputs listed in `required`. 
If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 + } + ] + } + } + ] + }, + { + "name": "parameter_value_format_alignment", + "task_description": "Evaluate whether the specific parameter value (named in \"parameter_name\") conforms exactly to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter value. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided to ensure it meets the exact type, format, and unit requirements. Do not evaluate the entire tool call correctness, other parameters, or the function itself. Focus solely on whether this parameter value aligns with the specification.\n---\n\n### 1. Data Type Compliance (Output = 4-5)\n- Check that the value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2. Format Specification Compliance\n- Validate adherence to documented format constraints, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility with other parameter values in the same call\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: Value exactly matches required type, format, and units\n- **4 - Minor Deviation**: Value is fundamentally correct but with minor formatting issues\n- **3 - Moderate Deviation**: Type or format is partially incorrect; may cause issues\n- **2 - Major Deviation**: Type and format are significantly wrong; likely to fail\n- **1 - Complete Mismatch**: Value is entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and the value seems plausible, prefer a higher score (4-5)\n- If the spec is clear and the value deviates, assign a lower score (1-3)\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.\n\n---\n\n#### Grounding Paths\n\nA parameter value is grounded via any of these paths:\n- **Explicit statement** in a user, assistant, tool, or system message.\n- **Documented default** in the API spec for this parameter.\n- **Pick-one** \u2014 the user enumerated multiple valid values and the agent picked one.\n- **Specialization** \u2014 the user named a set/range/category and the agent supplied a concrete value inside that scope. 
A value that falls OUTSIDE the named scope is a mismatch, not missing information.\n- **Plausible optional value** \u2014 if this parameter is optional (not in `required`), any spec-consistent value is grounded unless it contradicts explicit user intent.\n\n---\n\n#### Stringency\n\nApply this rubric strictly for REQUIRED parameters whose value is missing, fabricated, or inconsistent with the spec. Apply it leniently for OPTIONAL parameters (not in the spec's `required` array) and for values reasonably inferable from visible context. A value is NOT an issue unless it contradicts explicit user intent, a documented constraint, or higher-priority evidence.\n\n---\n\n#### Actionable Recommendations\n\nWhen the call reveals a fixable gap in the agent's prompts or tool specifications, emit one or more `actionable_recommendations` as unified-diff patches. Each recommendation specifies: `target` (which artifact to change: `system_prompt`, `tool_description`, `parameter_description`, `parameter_examples`), an optional `tool_name` / `parameter_name` for tool-scoped targets, a `diff` in unified-diff format, a short `rationale`, and an `importance` score in [0, 1] (0 = cosmetic, 1 = critical). When the call has no fixable artifact gap \u2014 or the call is already correct \u2014 return an empty array.", + "jsonschema": { + "title": "parameter_value_format_alignment", + "description": "Assessment of a parameter value's compliance with required type, format, and unit specifications, based on the rubric above.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Quote the specification's type/format definition and include the actual parameter value provided. Cite EXACT text from the tool specification that supports your judgment." + }, + "explanation": { + "type": "string", + "description": "Explain clearly why the parameter value conforms or does not conform to the specification's requirements. Address data type, format, units, and any pattern or constraint violations. For incorrect values, describe exactly what is wrong." + }, + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 4, + "threshold_high": 5, + "description": "Parameter Type/Format/Unit Score for this single parameter (1-5). Judge ONLY the serialized representation against the spec's type/format/pattern/unit. Semantic correctness is out of scope here.\n\n- **5**: Value exactly matches the spec's type, format, pattern, and unit. If this parameter is optional and omitted, return 5 (no format issue exists).\n- **4**: Fundamentally correct with a minor quirk a lax parser accepts.\n- **3**: A partial format issue that may cause a strict parser to reject.\n- **2**: A significant type/format/unit violation likely to fail at the API boundary.\n- **1**: The value is fundamentally incompatible with the spec (e.g. free-form text where a boolean is required)." + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." + }, + "correction": { + "type": "object", + "description": "If output >= 4, return {}. 
If output <= 3, provide reason_types, explanation, and a corrected parameter.", + "properties": { + "reason_types": { + "type": "array", + "description": "List of format issues identified. Use one or more of: TYPE_ERROR, FORMAT_ERROR, UNIT_ERROR, PATTERN_ERROR, CONSISTENCY_ERROR, OTHER.", + "items": { + "type": "string", + "enum": [ + "TYPE_ERROR", + "FORMAT_ERROR", + "UNIT_ERROR", + "PATTERN_ERROR", + "CONSISTENCY_ERROR", + "OTHER" + ] + } + }, + "reasons": { + "type": "string", + "description": "Short explanation of the specific issue(s) with the value's format, type, or unit." + }, + "parameter": { + "type": "object", + "description": "An object containing the corrected parameter value in the form: { \"<parameter_name>\": <corrected_value> }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.", + "additionalProperties": true + } + }, + "required": [] + }, + "actionable_recommendations": { + "type": "array", + "description": "List of actionable recommendations as unified-diff patches. Emit only when the call reveals a fixable gap in the agent's prompts or tool specifications. Empty array when there is nothing to fix (including when the call is already correct).", + "items": { "type": "object", "additionalProperties": false, "properties": { - "evidence": { - "type": "string", - "description": "Quote the specification's type/format definition and include the actual parameter value provided. Cite EXACT text from the tool specification that supports your judgment." - }, - "explanation": { - "type": "string", - "description": "Explain clearly why the parameter value conforms or does not conform to the specification's requirements. Address data type, format, units, and any pattern or constraint violations. For incorrect values, describe exactly what is wrong." - }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "An integer from 1 to 5 indicating how well the parameter value conforms to the required type, format, and unit as defined in the API specification.\n\nScore meanings:\n\n5 - Perfect Alignment: The value exactly matches the expected type, format, and unit. Example: expected date in 'YYYY-MM-DD', value is '2025-08-05'.\n\n4 - Minor Deviation: The value has small formatting issues (e.g., missing leading zeros) but is still likely to be accepted by the API or parser unless strict validation is enforced. Example: expected '2025-08-05', value is '2025-8-5'.\n\n3 - Moderate Deviation: The value partially matches the expected format but is likely to be rejected or misinterpreted by automatic processing. Example: expected 'YYYY-MM-DD', value is 'August 5, 2025'.\n\n2 - Major Deviation: The value significantly violates the expected type, format, or unit and is very likely to fail. Example: expected 'YYYY-MM-DD', value is '08/05/2025'.\n\n1 - Complete Mismatch: The value is entirely incompatible with the required format or type. Example: expected boolean, value is 'maybe'." - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
- }, - "correction": { - "type": "object", - "description": "If output >= 4, return {}. If output <= 3, provide reason_types, explanation, and a corrected parameter.", - "properties": { - "reason_types": { - "type": "array", - "description": "List of format issues identified. Use one or more of: TYPE_ERROR, FORMAT_ERROR, UNIT_ERROR, PATTERN_ERROR, CONSISTENCY_ERROR, OTHER.", - "items": { - "type": "string", - "enum": [ - "TYPE_ERROR", - "FORMAT_ERROR", - "UNIT_ERROR", - "PATTERN_ERROR", - "CONSISTENCY_ERROR", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Short explanation of the specific issue(s) with the value's format, type, or unit." - }, - "parameter": { - "type": "object", - "description": "An object containing the corrected parameter value in the form: { \"\": }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.", - "additionalProperties": true - } - }, - "required": [] - }, - "actionable_recommendations": { - "type": "array", - "description": "Provide actionable suggestions for avoiding this type of error in the future, only if there is an issue. Address root causes like unclear format specs, missing examples, or lack of validation. Include suggestions such as:\n\n1. PARAMETER_FORMAT_DOCUMENTATION: Clarify or expand parameter format expectations in the API documentation.\n2. PARAMETER_EXAMPLES: Add example values with correct formatting in the parameter documentation.\n3. PARAMETER_VALIDATION: Introduce regex or rule-based format validation in the API to catch errors before tool calls.\n4. PARAMETER_CONVERSION: Add conversion helpers for user input to ensure correct formatting.\n5. UNIT_STANDARDS: Standardize unit expectations and flag missing or incorrect units in the tools implementation.\n\n6. PARAMETER_NAMING_CONVENTIONS: Suggest clearer parameter names that indicate expected formats or units.\n7. SYSTEM_PROMPT_ADDITIONS: Recommend specific additions to the system prompt that would help the agent avoid similar format issues in the future.\n8. PRE_CALL_REFLECTION_STRATEGIES: Propose strategies for the agent to reflect on parameter values before making tool calls, such as checking if all required parameters are grounded in user input.\n9. PARAMETER_SOURCE_TRACKING: Suggest implementing a mechanism to track where each parameter value originated from (e.g., user input, default value) to improve transparency and debugging.\n10. CLARIFICATION_PROMPTS: Recommend adding clarification prompts when parameter values are ambiguous or missing, e.g., asking the user for specific formats or units.\n\n11. 
OTHER: Any other specific recommendations that would help prevent similar format issues in the future.", - "items": { - "type": "object", - "properties": { - "recommendation": { - "type": "string", - "description": "A specific, actionable recommendation to improve the agent's parameter handling process.", - "enum": [ - "PARAMETER_FORMAT_DOCUMENTATION", - "PARAMETER_EXAMPLES", - "PARAMETER_VALIDATION", - "PARAMETER_CONVERSION", - "UNIT_STANDARDS", - "PARAMETER_NAMING_CONVENTIONS", - "SYSTEM_PROMPT_ADDITIONS", - "PRE_CALL_REFLECTION_STRATEGIES", - "PARAMETER_SOURCE_TRACKING", - "CLARIFICATION_PROMPTS", - "OTHER" - ] - }, - "details": { - "type": "string", - "description": "Detailed explanation of the recommendation, including what specific changes should be made, how they will improve parameter handling, and any relevant examples or best practices." - } - }, - "required": [ - "recommendation", - "details" - ] - } - } + "target": { + "type": "string", + "enum": [ + "system_prompt", + "tool_description", + "parameter_description", + "parameter_examples" + ], + "description": "Which artifact the diff applies to. `system_prompt` changes the agent's top-level policy; the other three target a specific tool (and possibly one of its parameters)." + }, + "tool_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is tool_description, parameter_description, or parameter_examples. Use the exact function/tool name. Must be null when target is system_prompt." + }, + "parameter_name": { + "type": [ + "string", + "null" + ], + "description": "Required when target is parameter_description or parameter_examples. Null otherwise." + }, + "diff": { + "type": "string", + "description": "Unified git-diff-format patch of the proposed change. Format: `--- a/<artifact>\\n+++ b/<artifact>\\n@@ ... @@\\n-<old line>\\n+<new line>`. When there is no line-number context, use `@@` as the single hunk header. Keep diffs minimal \u2014 only the lines that meaningfully change." + }, + "rationale": { + "type": "string", + "description": "One-sentence reason why this change addresses the observed issue. <= 200 chars." + }, + "importance": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "How impactful this recommendation is at preventing similar issues in future calls (0 = cosmetic, 1 = critical). Calibrate consistently across metrics so downstream aggregation is comparable." + } }, "required": [ - "explanation", - "evidence", - "output", - "confidence", - "correction", - "actionable_recommendations" + "target", + "diff", + "rationale", + "importance" ] + } + } + }, + "required": [ + "explanation", + "evidence", + "output", + "confidence", + "correction", + "actionable_recommendations" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Set a short countdown for 12 minutes." 
+ } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "set_timer", + "description": "Set a countdown timer", + "parameters": { + "type": "object", + "properties": { + "duration": { + "type": "string", + "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" + } + }, + "required": [ + "duration" + ] + } + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "set_timer", + "arguments": "{ \"duration\": \"12 minutes\" }" + } + }, + "parameter_name": "duration", + "parameter_value": "12 minutes" }, - "examples": [ + "output": { + "evidence": "The tool spec states that 'duration' must be a string in the format 'number + unit', with examples such as '5 minutes' and '30 seconds'. The value '12 minutes' follows this format precisely: it consists of a number followed by a space and a valid time unit.", + "explanation": "The parameter value '12 minutes' aligns perfectly with the required type, structure, and unit conventions defined in the spec. While not explicitly given as an example, it is fully consistent with the format and requires no normalization or interpretation.", + "output": 5, + "confidence": 0.99, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Set a short countdown for 12 minutes." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "set_timer", - "description": "Set a countdown timer", - "parameters": { - "type": "object", - "properties": { - "duration": { - "type": "string", - "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" - } - }, - "required": [ - "duration" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "set_timer", - "arguments": "{ \"duration\": \"12 minutes\" }" - } - }, - "parameter_name": "duration", - "parameter_value": "12 minutes" - }, - "output": { - "evidence": "The tool spec states that 'duration' must be a string in the format 'number + unit', with examples such as '5 minutes' and '30 seconds'. The value '12 minutes' follows this format precisely: it consists of a number followed by a space and a valid time unit.", - "explanation": "The parameter value '12 minutes' aligns perfectly with the required type, structure, and unit conventions defined in the spec. While not explicitly given as an example, it is fully consistent with the format and requires no normalization or interpretation.", - "output": 5, - "confidence": 0.99, - "correction": {}, - "actionable_recommendations": [] + "role": "user", + "content": "Start a countdown for 5 minutes." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "set_timer", + "description": "Set a countdown timer", + "parameters": { + "type": "object", + "properties": { + "duration": { + "type": "string", + "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" + } + }, + "required": [ + "duration" + ] } + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "set_timer", + "arguments": "{ \"duration\": \"300000\" }" + } + }, + "parameter_name": "duration", + "parameter_value": "300000" + }, + "output": { + "evidence": "The tool specification defines 'duration' as a string following the 'number + unit' format (e.g., '5 minutes'). 
The value '300000' is a string, but it lacks a time unit and does not follow the documented pattern.", + "explanation": "The value is likely intended to represent milliseconds, but the format is both undocumented and inconsistent with the specification. It fails both unit and pattern requirements, and cannot be reliably interpreted as valid input without assumptions or external conventions.", + "output": 2, + "confidence": 0.95, + "correction": { + "reason_types": [ + "FORMAT_ERROR", + "UNIT_ERROR" + ], + "reasons": "Missing explicit time unit and nonconformant format. Specification requires 'number + unit' format, which this value does not follow.", + "parameter": { + "duration": "5 minutes" + } + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Start a countdown for 5 minutes." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "set_timer", - "description": "Set a countdown timer", - "parameters": { - "type": "object", - "properties": { - "duration": { - "type": "string", - "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" - } - }, - "required": [ - "duration" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "set_timer", - "arguments": "{ \"duration\": \"300000\" }" - } - }, - "parameter_name": "duration", - "parameter_value": "300000" - }, - "output": { - "evidence": "The tool specification defines 'duration' as a string following the 'number + unit' format (e.g., '5 minutes'). The value '300000' is a string, but it lacks a time unit and does not follow the documented pattern.", - "explanation": "The value is likely intended to represent milliseconds, but the format is both undocumented and inconsistent with the specification. It fails both unit and pattern requirements, and cannot be reliably interpreted as valid input without assumptions or external conventions.", - "output": 2, - "confidence": 0.95, - "correction": { - "reason_types": [ - "FORMAT_ERROR", - "UNIT_ERROR" - ], - "reasons": "Missing explicit time unit and nonconformant format. Specification requires 'number + unit' format, which this value does not follow.", - "parameter": { - "duration": "5 minutes" - } - }, - "actionable_recommendations": [ - { - "recommendation": "PARAMETER_FORMAT_DOCUMENTATION", - "details": "Update the 'duration' parameter documentation to be more explicit: 'Duration must be provided as a string with a numeric value followed by a space and then a time unit (e.g., \"5 minutes\", \"30 seconds\"). Millisecond values without units are not supported. Raw numeric values without units will be rejected.'" - }, - { - "recommendation": "PARAMETER_VALIDATION", - "details": "Implement regex validation for the 'duration' parameter that enforces the 'number + unit' format with a required space between them, e.g., /^\\d+\\s+[a-z]+$/i, and clearly rejects pure numeric values with an informative error message." 
- }, - { - "recommendation": "PARAMETER_CONVERSION", - "details": "Add a conversion helper that automatically formats numeric-only values to the required format by appending appropriate units based on the magnitude of the number (e.g., convert '300000' to '5 minutes'). This would prevent tool call failures while maintaining the intended functionality." - } - ] + "target": "tool_description", + "tool_name": "set_timer", + "parameter_name": null, + "diff": "--- a/tool/set_timer\n+++ b/tool/set_timer\n@@\n+Call this only after you have gathered the inputs listed in `required`. If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Set a timer for five minutes please." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "set_timer", + "description": "Set a countdown timer", + "parameters": { + "type": "object", + "properties": { + "duration": { + "type": "string", + "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" + } + }, + "required": [ + "duration" + ] } + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "set_timer", + "arguments": "{ \"duration\": \"5Minutes\" }" + } + }, + "parameter_name": "duration", + "parameter_value": "5Minutes" + }, + "output": { + "evidence": "The specification explicitly defines the duration format as 'number + unit' with examples like '5 minutes'. The given value '5Minutes' omits the required space, resulting in a concatenated string.", + "explanation": "The input conveys the intended meaning and includes all required semantic elements (number and unit), but the lack of a space between them breaks the documented pattern. This deviation is moderate-it introduces a risk of failure in strict parsers that expect a space-separated format.", + "output": 3, + "confidence": 0.89, + "correction": { + "reason_types": [ + "FORMAT_ERROR" + ], + "reasons": "Value contains both components but violates the explicit formatting requirement of space separation.", + "parameter": { + "duration": "5 minutes" + } + }, + "actionable_recommendations": [ + { + "target": "system_prompt", + "tool_name": null, + "parameter_name": null, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Before you call a tool, verify every required argument is explicitly grounded in the conversation, a prior tool output, or a documented default. If a required argument is unknown, ask the user instead of guessing.", + "rationale": "Reinforces evidence-based argument selection so the agent does not fabricate required values.", + "importance": 0.8 }, { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Set a timer for five minutes please." 
- } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "set_timer", - "description": "Set a countdown timer", - "parameters": { - "type": "object", - "properties": { - "duration": { - "type": "string", - "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" - } - }, - "required": [ - "duration" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "set_timer", - "arguments": "{ \"duration\": \"5Minutes\" }" - } - }, - "parameter_name": "duration", - "parameter_value": "5Minutes" - }, - "output": { - "evidence": "The specification explicitly defines the duration format as 'number + unit' with examples like '5 minutes'. The given value '5Minutes' omits the required space, resulting in a concatenated string.", - "explanation": "The input conveys the intended meaning and includes all required semantic elements (number and unit), but the lack of a space between them breaks the documented pattern. This deviation is moderate-it introduces a risk of failure in strict parsers that expect a space-separated format.", - "output": 3, - "confidence": 0.89, - "correction": { - "reason_types": [ - "FORMAT_ERROR" - ], - "reasons": "Value contains both components but violates the explicit formatting requirement of space separation.", - "parameter": { - "duration": "5 minutes" - } - }, - "actionable_recommendations": [ - { - "recommendation": "PARAMETER_FORMAT_DOCUMENTATION", - "details": "Enhance the documentation for the 'duration' parameter to explicitly highlight the importance of the space separator: 'Duration MUST include a space between the numeric value and the unit (e.g., \"5 minutes\" not \"5minutes\"). The space separator is required for proper parsing.'" - }, - { - "recommendation": "PARAMETER_VALIDATION", - "details": "Implement regex-based validation that specifically checks for the presence of a space between the number and unit, e.g., /^(\\d+)\\s+(\\w+)$/, and provides a clear error message when this pattern is violated: 'Duration format error: Please ensure there is a space between the number and unit (e.g., \"5 minutes\").'" - }, - { - "recommendation": "PARAMETER_CONVERSION", - "details": "Add automatic reformatting that can detect and correct common format errors like missing spaces between numbers and units, ensuring the API remains robust against minor formatting issues." - } - ] - } + "target": "tool_description", + "tool_name": "set_timer", + "parameter_name": null, + "diff": "--- a/tool/set_timer\n+++ b/tool/set_timer\n@@\n+Call this only after you have gathered the inputs listed in `required`. 
If any required input is missing, use a discovery tool first.", + "rationale": "Makes the tool's prerequisites explicit so the agent does not call it with under-grounded arguments.", + "importance": 0.7 } - ] - } + ] + } + } + ] + } ] diff --git a/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter_metrics_runtime.json b/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter_metrics_runtime.json index d89a4534..b531cb47 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter_metrics_runtime.json +++ b/altk/pre_tool/sparc/function_calling/metrics/parameter/parameter_metrics_runtime.json @@ -1,489 +1,489 @@ [ - { - "name": "parameter_hallucination_check", - "task_description": "You are an expert evaluator assessing whether a **specific parameter value**-identified as `parameter_name`-in a tool call is **grounded** in the provided conversation history or tool specification, or whether it is **hallucinated**.\n\nYour task is to make a **strictly evidence-based** judgment. Evaluate this parameter alone-ignore other parameters or external knowledge. The value must:\n- Be explicitly supported by the dialogue, prior tool calls, or tool specification\n- Be appropriate for the context and user intent\n- Respect constraint rules defined in the specification\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided. Do not evaluate the entire tool call correctness, other parameters, or the function itself. Focus solely on whether this parameter value is grounded in the conversation or specification.\n---\n\n### Rating Scale\n\nAssign a score from 1 to 5 based on how clearly the value is grounded:\n\n**5 - Perfectly Grounded** \nExplicitly stated in conversation or matches a clearly documented and contextually appropriate default.\n\n**4 - Mostly Grounded** \nNot directly quoted but clearly follows from the conversation or is a minor variation on a documented default.\n\n**3 - Ambiguous** \nRequires non-trivial inference or transformation to relate the value to the dialogue or specification.\n\n**2 - Mostly Ungrounded** \nLoosely related, incorrectly formatted, or partially contradicts the spec or context.\n\n**1 - Completely Ungrounded** \nInvented, unrelated, or clearly in conflict with the dialogue or specification.\n\nBe conservative in your scoring: Use 1-3 for weak or unverified grounding, and 4-5 only when evidence is explicit or clearly reliable.\n\n---\n\n### Acceptable Sources for Grounding\nA value may be considered grounded if it comes from:\n- Explicit user input\n- Clearly implied user intent (minimal inference)\n- Assistant statements explicitly confirmed by the user\n- Outputs of previous tool calls\n- Documented default values in the tool specification\n\n---\n\n### Ungrounded Patterns\nMark a value ungrounded if:\n- It is absent from both conversation and tool spec\n- It depends on vague or missing input (e.g., \"tomorrow\" without a date format)\n- It uses the wrong type, unit, or format\n- It contradicts other parameters or tool constraints\n- It inaccurately rephrases or transforms the user's input\n\n---\n\n### Handling Defaults\nA default value may be accepted only if:\n- It is explicitly documented in the tool spec\n- It fits the context of the user's request\n- It does not contradict stated or implied intent\nOtherwise, treat the value as ungrounded.\n\n---\n\n### Parameter Relationships\nEnsure this parameter does not contradict other parameters in the same call.\nExample: If `add_day=false` and `day=\"Sunday\"` but the 
spec says `add_day` must be true to use `day`, the `day` value is ungrounded.\n\n---\n\n### Final Guideline\nLarge language models frequently hallucinate parameter values or apply defaults inappropriately. Your careful and conservative evaluation prevents propagation of such errors.", - "jsonschema": { - "title": "parameter_hallucination_check", - "description": "Assessment of tool call parameter hallucination, following the rubric defined above.", - "type": "object", - "additionalProperties": false, - "properties": { - "evidence": { - "type": "string", - "description": "Provide a short sentence or two with EXACT quotes from the conversation history (user messages, assistant responses, previous tool outputs) or tool specification that directly support your assessment. For grounded values, cite the specific text where this value originates. For ungrounded values, demonstrate the absence of supporting evidence or cite contradictory information." - }, - "explanation": { - "type": "string", - "description": "Explain in one or two sentences why the specific parameter value is grounded or ungrounded. Reference exact sources from the conversation or tool specification that either support or contradict the parameter value. Address: 1) Where the value originated from, 2) Whether this source is sufficient for grounding, 3) Any format or type compliance issues, 4) Any contradictions with other parameters or specifications, and 5) For default values, whether they're documented and appropriate for this context." - }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "Parameter Hallucination Score (1-5):\n\n- 5: Perfectly grounded\n The parameter value is directly quoted in the conversation or exactly matches a documented default that fits the users intent.\n Example:\n User: \"Book a hotel in Rome.\"\n → Tool call: search_hotels(city=\"Rome\")\n\n- 4: Grounded via logical reasoning\n The value is not directly quoted but clearly follows from the conversation through straightforward, logic-based inference.\n Example:\n User: \"I'd like a room for me and my wife.\"\n → Tool call: search_hotels(guests=2)\n\n- 3: Ambiguous\n The value might relate to the conversation or domain context, but grounding requires assumptions or nontrivial interpretation not clearly supported by evidence.\n Example:\n User: \"Let's go somewhere warm.\"\n → Tool call: search_hotels(city=\"Barcelona\")\n\n- 2: Likely hallucinated\n The value is loosely connected to the topic but lacks grounding in the conversation or tool specification, or only partially aligns.\n Example:\n User: \"Find hotels.\"\n → Tool call: search_hotels(city=\"Berlin\")\n (No city mentioned)\n\n- 1: Clearly hallucinated\n The parameter value is invented, irrelevant, or contradicts the dialogue or tool specification.\n Example:\n User: \"Book a hotel.\"\n → Tool call: search_hotels(city=\"Mars\")" - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." - }, - "correction": { - "type": "object", - "properties": { - "reason_types": { - "type": "array", - "description": "A short sentence or two with types of issues with the parameter value, if any. 
Use one or more of these values: FORMAT_ERROR (wrong format or type), MISSING_INFORMATION (needs more data), PARAMETER_CONTRADICTION (conflicts with other parameters), DEFAULT_ISSUE (inappropriate default), OTHER (explain in reasons).", - "items": { - "type": "string", - "enum": [ - "FORMAT_ERROR", - "MISSING_INFORMATION", - "PARAMETER_CONTRADICTION", - "DEFAULT_ISSUE", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Concise explanation of the specific issues with the parameter value." - }, - "parameter": { - "type": "object", - "description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.", - "additionalProperties": true - } - }, - "description": "For well-grounded values: Provide an empty object {}. For ungrounded values: Provide an object with reason_types, reasons, and parameter correction.", - "required": [] - } - }, - "required": [ - "explanation", - "evidence", - "output", - "confidence", - "correction" - ] + { + "name": "parameter_hallucination_check", + "task_description": "You are an expert evaluator assessing whether a **specific parameter value**-identified as `parameter_name`-in a tool call is **grounded** in the provided conversation history or tool specification, or whether it is **hallucinated**.\n\nYour task is to make a **strictly evidence-based** judgment. Evaluate this parameter alone-ignore other parameters or external knowledge. The value must:\n- Be explicitly supported by the dialogue, prior tool calls, or tool specification\n- Be appropriate for the context and user intent\n- Respect constraint rules defined in the specification\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided. Do not evaluate the entire tool call correctness, other parameters, or the function itself. 
Focus solely on whether this parameter value is grounded in the conversation or specification.\n---\n\n### Rating Scale\n\nAssign a score from 1 to 5 based on how clearly the value is grounded:\n\n**5 - Perfectly Grounded** \nExplicitly stated in conversation or matches a clearly documented and contextually appropriate default.\n\n**4 - Mostly Grounded** \nNot directly quoted but clearly follows from the conversation or is a minor variation on a documented default.\n\n**3 - Ambiguous** \nRequires non-trivial inference or transformation to relate the value to the dialogue or specification.\n\n**2 - Mostly Ungrounded** \nLoosely related, incorrectly formatted, or partially contradicts the spec or context.\n\n**1 - Completely Ungrounded** \nInvented, unrelated, or clearly in conflict with the dialogue or specification.\n\nBe conservative in your scoring: Use 1-3 for weak or unverified grounding, and 4-5 only when evidence is explicit or clearly reliable.\n\n---\n\n### Acceptable Sources for Grounding\nA value may be considered grounded if it comes from:\n- Explicit user input\n- Clearly implied user intent (minimal inference)\n- Assistant statements explicitly confirmed by the user\n- Outputs of previous tool calls\n- Documented default values in the tool specification\n\n---\n\n### Ungrounded Patterns\nMark a value ungrounded if:\n- It is absent from both conversation and tool spec\n- It depends on vague or missing input (e.g., \"tomorrow\" without a date format)\n- It uses the wrong type, unit, or format\n- It contradicts other parameters or tool constraints\n- It inaccurately rephrases or transforms the user's input\n\n---\n\n### Handling Defaults\nA default value may be accepted only if:\n- It is explicitly documented in the tool spec\n- It fits the context of the user's request\n- It does not contradict stated or implied intent\nOtherwise, treat the value as ungrounded.\n\n---\n\n### Parameter Relationships\nEnsure this parameter does not contradict other parameters in the same call.\nExample: If `add_day=false` and `day=\"Sunday\"` but the spec says `add_day` must be true to use `day`, the `day` value is ungrounded.\n\n---\n\n### Final Guideline\nLarge language models frequently hallucinate parameter values or apply defaults inappropriately. Your careful and conservative evaluation prevents propagation of such errors.\n\n---\n\n#### Grounding Paths\n\nA parameter value is grounded via any of these paths:\n- **Explicit statement** in a user, assistant, tool, or system message.\n- **Documented default** in the API spec for this parameter.\n- **Pick-one** \u2014 the user enumerated multiple valid values and the agent picked one.\n- **Specialization** \u2014 the user named a set/range/category and the agent supplied a concrete value inside that scope. A value that falls OUTSIDE the named scope is a mismatch, not missing information.\n- **Plausible optional value** \u2014 if this parameter is optional (not in `required`), any spec-consistent value is grounded unless it contradicts explicit user intent.\n\n---\n\n#### Required-Parameter Omission\n\nA parameter listed in the spec's `required` array that is completely absent from the arguments is ALWAYS MISSING_INFORMATION \u2014 no grounding path (explicit statement, documented default, pick-one, specialization, or plausible-optional) substitutes for actually providing a value. 
The grounding paths only apply to values the agent ACTUALLY PROVIDED; they never excuse omission of a required parameter.\n\n#### User-Context Discarded\n\nWhen a parameter is spec-optional but the user (or prior tool output / system prompt) provided an explicit narrowing value for it \u2014 a search term, a filter, a category, an identifier \u2014 and the agent OMITS the parameter or sets it to an \"empty\" sentinel (empty string, null, default wildcard), this is NOT a \"plausible optional value\". It is USER_CONTEXT_DISCARDED and should be labeled CONTRADICTORY_VALUES (the call contradicts explicit user intent). Score the call 2-3 accordingly. This rule applies regardless of whether the spec marks the parameter as required; it turns on whether the user supplied narrowing context for it.\n\n---\n\n#### Stringency\n\nApply this rubric strictly for REQUIRED parameters whose value is missing, fabricated, or inconsistent with the spec. Apply it leniently for OPTIONAL parameters (not in the spec's `required` array) and for values reasonably inferable from visible context. A value is NOT an issue unless it contradicts explicit user intent, a documented constraint, or higher-priority evidence.", + "jsonschema": { + "title": "parameter_hallucination_check", + "description": "Assessment of tool call parameter hallucination, following the rubric defined above.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Provide a short sentence or two with EXACT quotes from the conversation history (user messages, assistant responses, previous tool outputs) or tool specification that directly support your assessment. For grounded values, cite the specific text where this value originates. For ungrounded values, demonstrate the absence of supporting evidence or cite contradictory information." + }, + "explanation": { + "type": "string", + "description": "Explain in one or two sentences why the specific parameter value is grounded or ungrounded. Reference exact sources from the conversation or tool specification that either support or contradict the parameter value. Address: 1) Where the value originated from, 2) Whether this source is sufficient for grounding, 3) Any format or type compliance issues, 4) Any contradictions with other parameters or specifications, and 5) For default values, whether they're documented and appropriate for this context." }, - "examples": [ + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 4, + "threshold_high": 5, + "description": "Parameter Grounding Score for this single parameter (1-5). Judge ONLY the named parameter's value; other parameters are out of scope.\n\n- **5**: Value is grounded via explicit statement, documented default, pick-one from user-provided options, or in-scope specialization. For optional parameters, a plausible spec-consistent value is CORRECT.\n- **4**: Value is grounded by a clear but unstated inference a careful reader would accept, or by a concrete specialization inside the user's abstract intent.\n- **3**: Grounding is weak \u2014 the value is plausible given the context but there is real ambiguity about whether it matches the user's intent.\n- **2**: The value is not supported by any grounding path; it appears fabricated or falls outside the user's named scope.\n- **1**: The value clearly contradicts explicit evidence, a documented constraint, or the user's named scope." 
+ }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." + }, + "correction": { + "type": "object", + "properties": { + "reason_types": { + "type": "array", + "description": "A short sentence or two with types of issues with the parameter value, if any. Use one or more of these values: FORMAT_ERROR (wrong format or type), MISSING_INFORMATION (needs more data), PARAMETER_CONTRADICTION (conflicts with other parameters), DEFAULT_ISSUE (inappropriate default), OTHER (explain in reasons).", + "items": { + "type": "string", + "enum": [ + "FORMAT_ERROR", + "MISSING_INFORMATION", + "PARAMETER_CONTRADICTION", + "DEFAULT_ISSUE", + "OTHER" + ] + } + }, + "reasons": { + "type": "string", + "description": "Concise explanation of the specific issues with the parameter value." + }, + "parameter": { + "type": "object", + "description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.", + "additionalProperties": true + } + }, + "description": "For well-grounded values: Provide an empty object {}. For ungrounded values: Provide an object with reason_types, reasons, and parameter correction.", + "required": [] + } + }, + "required": [ + "explanation", + "evidence", + "output", + "confidence", + "correction" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Translate 'hello' to Spanish." + } + ], + "tool_inventory": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Translate 'hello' to Spanish." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "translate_text", - "description": "Translate given text", - "parameters": { - "type": "object", - "properties": { - "text": { - "type": "string", - "description": "The text to be translated" - }, - "target_lang": { - "type": "string", - "description": "The target language code using ISO 639-1 (e.g., 'en' for English, 'es' for Spanish, 'fr' for French)" - } - }, - "required": [ - "text", - "target_lang" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "translate_text", - "arguments": "{ \"text\": \"hello\", \"target_lang\": \"es\" }" - } + "type": "function", + "function": { + "name": "translate_text", + "description": "Translate given text", + "parameters": { + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to be translated" }, - "parameter_name": "target_lang", - "parameter_value": "es" - }, - "output": { - "evidence": "User said: \"Translate 'hello' to Spanish.\" The tool spec defines target_lang as a string. 'es' is the ISO 639-1 code for Spanish.", - "explanation": "The value 'es' is a grounded and appropriate transformation of the user's explicit request for Spanish. 
Assuming standard practice for language codes in translation APIs, the ISO code 'es' matches both the user intent and specification.", - "output": 5, - "confidence": 0.97, - "correction": {} + "target_lang": { + "type": "string", + "description": "The target language code using ISO 639-1 (e.g., 'en' for English, 'es' for Spanish, 'fr' for French)" + } + }, + "required": [ + "text", + "target_lang" + ] } - }, + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "translate_text", + "arguments": "{ \"text\": \"hello\", \"target_lang\": \"es\" }" + } + }, + "parameter_name": "target_lang", + "parameter_value": "es" + }, + "output": { + "evidence": "User said: \"Translate 'hello' to Spanish.\" The tool spec defines target_lang as a string. 'es' is the ISO 639-1 code for Spanish.", + "explanation": "The value 'es' is a grounded and appropriate transformation of the user's explicit request for Spanish. Assuming standard practice for language codes in translation APIs, the ISO code 'es' matches both the user intent and specification.", + "output": 5, + "confidence": 0.97, + "correction": {} + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Fetch my latest tweets." + } + ], + "tool_inventory": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Fetch my latest tweets." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "get_tweets", - "description": "Retrieve recent tweets", - "parameters": { - "type": "object", - "properties": { - "username": { - "type": "string", - "description": "The Twitter username to fetch tweets for" - }, - "count": { - "type": "integer", - "description": "The number of recent tweets to retrieve" - } - }, - "required": [ - "username", - "count" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "get_tweets", - "arguments": "{ \"username\": \"elonmusk\", \"count\": 20 }" - } + "type": "function", + "function": { + "name": "get_tweets", + "description": "Retrieve recent tweets", + "parameters": { + "type": "object", + "properties": { + "username": { + "type": "string", + "description": "The Twitter username to fetch tweets for" }, - "parameter_name": "count", - "parameter_value": 20 - }, - "output": { - "evidence": "User said: \"Fetch my latest tweets.\" There is no mention of a tweet count. Tool spec lists 'count' as an integer parameter but provides no default.", - "explanation": "The value 20 is ungrounded. The user did not specify how many tweets to fetch, and the tool specification does not define a default value for the 'count' parameter. The agent's choice appears arbitrary.", - "output": 1, - "confidence": 0.95, - "correction": { - "reason_types": [ - "MISSING_INFORMATION" - ], - "reasons": "The value was invented without support in the user message or spec. There is no evidence for selecting 20.", - "parameter": { - "need_more_information": "How many tweets would you like to retrieve? Please specify a number." 
- } + "count": { + "type": "integer", + "description": "The number of recent tweets to retrieve" } + }, + "required": [ + "username", + "count" + ] } - }, + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_tweets", + "arguments": "{ \"username\": \"elonmusk\", \"count\": 20 }" + } + }, + "parameter_name": "count", + "parameter_value": 20 + }, + "output": { + "evidence": "User said: \"Fetch my latest tweets.\" There is no mention of a tweet count. Tool spec lists 'count' as an integer parameter but provides no default.", + "explanation": "The value 20 is ungrounded. The user did not specify how many tweets to fetch, and the tool specification does not define a default value for the 'count' parameter. The agent's choice appears arbitrary.", + "output": 1, + "confidence": 0.95, + "correction": { + "reason_types": [ + "MISSING_INFORMATION" + ], + "reasons": "The value was invented without support in the user message or spec. There is no evidence for selecting 20.", + "parameter": { + "need_more_information": "How many tweets would you like to retrieve? Please specify a number." + } + } + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Can you show me just a few of my latest messages?" + } + ], + "tool_inventory": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Can you show me just a few of my latest messages?" - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "get_messages", - "description": "Retrieve user's messages", - "parameters": { - "type": "object", - "properties": { - "user_id": { - "type": "integer", - "description": "The ID of the user whose messages are being retrieved" - }, - "limit": { - "type": "integer", - "description": "The maximum number of messages to retrieve" - } - }, - "required": [ - "user_id", - "limit" - ] - } - } - } - ], - "tool_call": { - "id": "call_002", - "type": "function", - "function": { - "name": "get_messages", - "arguments": "{ \"user_id\": 456, \"limit\": 3 }" - } + "type": "function", + "function": { + "name": "get_messages", + "description": "Retrieve user's messages", + "parameters": { + "type": "object", + "properties": { + "user_id": { + "type": "integer", + "description": "The ID of the user whose messages are being retrieved" }, - "parameter_name": "limit", - "parameter_value": 3 - }, - "output": { - "evidence": "User said: 'just a few of my latest messages.' The phrase 'a few' is vague and subjective, and while 3 is a reasonable interpretation, it is not explicitly stated.", - "explanation": "The value limit=3 is a plausible interpretation of the user's request for 'a few' messages. However, the term is ambiguous and does not unambiguously support the specific value of 3. The assistant made a judgment call based on conversational context, but without explicit grounding or a documented default, the score is 3.", - "output": 3, - "confidence": 0.79, - "correction": { - "reason_types": [ - "MISSING_INFORMATION" - ], - "reasons": "The user's phrasing is vague and does not explicitly support the value 3. Clarification is needed to ensure the correct value is used.", - "parameter": { - "need_more_information": "You asked for 'a few' messages. Could you specify how many exactly you'd like to see?" 
- } + "limit": { + "type": "integer", + "description": "The maximum number of messages to retrieve" } + }, + "required": [ + "user_id", + "limit" + ] } + } + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "get_messages", + "arguments": "{ \"user_id\": 456, \"limit\": 3 }" + } + }, + "parameter_name": "limit", + "parameter_value": 3 + }, + "output": { + "evidence": "User said: 'just a few of my latest messages.' The phrase 'a few' is vague and subjective, and while 3 is a reasonable interpretation, it is not explicitly stated.", + "explanation": "The value limit=3 is a plausible interpretation of the user's request for 'a few' messages. However, the term is ambiguous and does not unambiguously support the specific value of 3. The assistant made a judgment call based on conversational context, but without explicit grounding or a documented default, the score is 3.", + "output": 3, + "confidence": 0.79, + "correction": { + "reason_types": [ + "MISSING_INFORMATION" + ], + "reasons": "The user's phrasing is vague and does not explicitly support the value 3. Clarification is needed to ensure the correct value is used.", + "parameter": { + "need_more_information": "You asked for 'a few' messages. Could you specify how many exactly you'd like to see?" + } + } + } + } + ] + }, + { + "name": "parameter_value_format_alignment", + "task_description": "Evaluate whether the specific parameter value (named in \"parameter_name\") conforms exactly to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter value. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided to ensure it meets the exact type, format, and unit requirements. Do not evaluate the entire tool call correctness, other parameters, or the function itself. Focus solely on whether this parameter value aligns with the specification.\n---\n\n### 1. Data Type Compliance (Output = 4-5)\n- Check that the value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2. Format Specification Compliance\n- Validate adherence to documented format constraints, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. 
Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility with other parameter values in the same call\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: Value exactly matches required type, format, and units\n- **4 - Minor Deviation**: Value is fundamentally correct but with minor formatting issues\n- **3 - Moderate Deviation**: Type or format is partially incorrect; may cause issues\n- **2 - Major Deviation**: Type and format are significantly wrong; likely to fail\n- **1 - Complete Mismatch**: Value is entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and the value seems plausible, prefer a higher score (4-5)\n- If the spec is clear and the value deviates, assign a lower score (1-3)\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.\n\n---\n\n#### Grounding Paths\n\nA parameter value is grounded via any of these paths:\n- **Explicit statement** in a user, assistant, tool, or system message.\n- **Documented default** in the API spec for this parameter.\n- **Pick-one** \u2014 the user enumerated multiple valid values and the agent picked one.\n- **Specialization** \u2014 the user named a set/range/category and the agent supplied a concrete value inside that scope. A value that falls OUTSIDE the named scope is a mismatch, not missing information.\n- **Plausible optional value** \u2014 if this parameter is optional (not in `required`), any spec-consistent value is grounded unless it contradicts explicit user intent.\n\n---\n\n#### Stringency\n\nApply this rubric strictly for REQUIRED parameters whose value is missing, fabricated, or inconsistent with the spec. Apply it leniently for OPTIONAL parameters (not in the spec's `required` array) and for values reasonably inferable from visible context. A value is NOT an issue unless it contradicts explicit user intent, a documented constraint, or higher-priority evidence.", "jsonschema": { "title": "parameter_value_format_alignment", "description": "Assessment of tool call parameter value format alignment, following the rubric defined above.", "type": "object", "additionalProperties": false, "properties": { "evidence": { "type": "string", "description": "Give a short sentence or two quoting the spec's type/format definition and the actual provided value that supports your judgment. Include EXACT text from the tool specification regarding this parameter's requirements." }, "explanation": { "type": "string", "description": "Explain in one or two sentences precisely why the parameter value does or does not match the specification's type, format, and unit requirements. Reference specific details from the parameter specification including data type, format requirements, unit conventions, and any pattern constraints. For non-conforming values, explain exactly what is incorrect and how it deviates from requirements." }, "output": { "type": "integer", "minimum": 1, "maximum": 5, "threshold_low": 4, "threshold_high": 5, "description": "Parameter Type/Format/Unit Score for this single parameter (1-5). Judge ONLY the serialized representation against the spec's type/format/pattern/unit. Semantic correctness is out of scope here.\n\n- **5**: Value exactly matches the spec's type, format, pattern, and unit.
If this parameter is optional and omitted, return 5 (no format issue exists).\n- **4**: Fundamentally correct with a minor quirk that a lax parser accepts.\n- **3**: A partial format issue that may cause a strict parser to reject.\n- **2**: A significant type/format/unit violation likely to fail at the API boundary.\n- **1**: The value is fundamentally incompatible with the spec (e.g. free-form text where a boolean is required)." }, "confidence": { "type": "number", "minimum": 0, "maximum": 1, "threshold_low": 0, "threshold_high": 1, "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." }, "correction": { "type": "object", "properties": { "reason_types": { "type": "array", "description": "The types of format issues with the parameter value, if any. Use one or more of these values: TYPE_ERROR (wrong data type), FORMAT_ERROR (incorrect format), UNIT_ERROR (wrong/missing units), PATTERN_ERROR (doesn't match required pattern), CONSISTENCY_ERROR (inconsistent with related parameters), OTHER (explain in reasons).", "items": { "type": "string", "enum": [ "TYPE_ERROR", "FORMAT_ERROR", "UNIT_ERROR", "PATTERN_ERROR", "CONSISTENCY_ERROR", "OTHER" ] } }, "reasons": { "type": "string", "description": "Concise explanation of the specific format issues with the parameter value." }, "parameter": { "type": "object", "description": "An object containing the corrected parameter value in the form: { \"<parameter_name>\": <corrected_value> }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.", "additionalProperties": true } }, "description": "For correctly formatted values: Provide an empty object {}. For format issues: Provide an object with reason_types, reasons, and parameter correction.", "required": [] } }, "required": [ "explanation", "evidence", "output", "confidence", "correction" ] }, - { - "name": "parameter_value_format_alignment", - "task_description": "Evaluate whether the specific parameter value (named in \"parameter_name\") conforms exactly to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter value. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ONLY the **specific parameter value** provided to ensure it meets the exact type, format, and unit requirements. Do not evaluate the entire tool call correctness, other parameters, or the function itself. Focus solely on whether this parameter value aligns with the specification.\n---\n\n### 1. Data Type Compliance (Output = 4-5)\n- Check that the value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2.
Format Specification Compliance\n- Validate adherence to documented format constraints, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility with other parameter values in the same call\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: Value exactly matches required type, format, and units\n- **4 - Minor Deviation**: Value is fundamentally correct but with minor formatting issues\n- **3 - Moderate Deviation**: Type or format is partially incorrect; may cause issues\n- **2 - Major Deviation**: Type and format are significantly wrong; likely to fail\n- **1 - Complete Mismatch**: Value is entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and the value seems plausible, prefer a higher score (4-5)\n- If the spec is clear and the value deviates, assign a lower score (1-3)\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.", - "jsonschema": { - "title": "parameter_value_format_alignment", - "description": "Assessment of tool call parameter value format alignment, following the rubric defined above.", - "type": "object", - "additionalProperties": false, - "properties": { - "evidence": { - "type": "string", - "description": "Give a short sentence or two with quotes of the spec's type/format definition and the actual provided value that supports your judgment. Include EXACT text from the tool specification regarding this parameter's requirements." - }, - "explanation": { - "type": "string", - "description": "Explain in one or two sentence precisely why the parameter value does or does not match the specification's type, format, and unit requirements. Reference specific details from the parameter specification including data type, format requirements, unit conventions, and any pattern constraints. For non-conforming values, explain exactly what is incorrect and how it deviates from requirements." - }, - "output": { - "type": "integer", - "minimum": 1, - "maximum": 5, - "threshold_low": 4, - "threshold_high": 5, - "description": "An integer from 1 to 5 indicating how well the parameter value conforms to the required type, format, and unit as defined in the API specification.\n\nScore meanings:\n\n5 - Perfect Alignment: The value exactly matches the expected type, format, and unit. Example: expected date in 'YYYY-MM-DD', value is '2025-08-05'.\n\n4 - Minor Deviation: The value has small formatting issues (e.g., missing leading zeros) but is still likely to be accepted by the API or parser unless strict validation is enforced. Example: expected '2025-08-05', value is '2025-8-5'.\n\n3 - Moderate Deviation: The value partially matches the expected format but is likely to be rejected or misinterpreted by automatic processing. 
Example: expected 'YYYY-MM-DD', value is 'August 5, 2025'.\n\n2 - Major Deviation: The value significantly violates the expected type, format, or unit and is very likely to fail. Example: expected 'YYYY-MM-DD', value is '08/05/2025'.\n\n1 - Complete Mismatch: The value is entirely incompatible with the required format or type. Example: expected boolean, value is 'maybe'." - }, - "confidence": { - "type": "number", - "minimum": 0, - "maximum": 1, - "threshold_low": 0, - "threshold_high": 1, - "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting." - }, - "correction": { - "type": "object", - "properties": { - "reason_types": { - "type": "array", - "description": "A short sentence or two with types of format issues with the parameter value, if any. Use one or more of these values: TYPE_ERROR (wrong data type), FORMAT_ERROR (incorrect format), UNIT_ERROR (wrong/missing units), PATTERN_ERROR (doesn't match required pattern), CONSISTENCY_ERROR (inconsistent with related parameters), OTHER (explain in reasons).", - "items": { - "type": "string", - "enum": [ - "TYPE_ERROR", - "FORMAT_ERROR", - "UNIT_ERROR", - "PATTERN_ERROR", - "CONSISTENCY_ERROR", - "OTHER" - ] - } - }, - "reasons": { - "type": "string", - "description": "Concise explanation of the specific format issues with the parameter value." - }, - "parameter": { - "type": "object", - "description": "An object containing the corrected parameter value in the form: { \"\": }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.", - "additionalProperties": true - } - }, - "description": "For correctly formatted values: Provide an empty object {}. For format issues: Provide an object with reason_types, reasons, and parameter correction.", - "required": [] + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Set a short countdown for 12 minutes." + } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "set_timer", + "description": "Set a countdown timer", + "parameters": { + "type": "object", + "properties": { + "duration": { + "type": "string", + "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" + } + }, + "required": [ + "duration" + ] } - }, - "required": [ - "explanation", - "evidence", - "output", - "confidence", - "correction" - ] + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "set_timer", + "arguments": "{ \"duration\": \"12 minutes\" }" + } + }, + "parameter_name": "duration", + "parameter_value": "12 minutes" }, - "examples": [ + "output": { + "evidence": "The tool spec states that 'duration' must be a string in the format 'number + unit', with examples such as '5 minutes' and '30 seconds'. The value '12 minutes' follows this format precisely: it consists of a number followed by a space and a valid time unit.", + "explanation": "The parameter value '12 minutes' aligns perfectly with the required type, structure, and unit conventions defined in the spec. 
While not explicitly given as an example, it is fully consistent with the format and requires no normalization or interpretation.", + "output": 5, + "confidence": 0.99, + "correction": {} + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Set a short countdown for 12 minutes." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "set_timer", - "description": "Set a countdown timer", - "parameters": { - "type": "object", - "properties": { - "duration": { - "type": "string", - "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" - } - }, - "required": [ - "duration" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "set_timer", - "arguments": "{ \"duration\": \"12 minutes\" }" - } - }, - "parameter_name": "duration", - "parameter_value": "12 minutes" - }, - "output": { - "evidence": "The tool spec states that 'duration' must be a string in the format 'number + unit', with examples such as '5 minutes' and '30 seconds'. The value '12 minutes' follows this format precisely: it consists of a number followed by a space and a valid time unit.", - "explanation": "The parameter value '12 minutes' aligns perfectly with the required type, structure, and unit conventions defined in the spec. While not explicitly given as an example, it is fully consistent with the format and requires no normalization or interpretation.", - "output": 5, - "confidence": 0.99, - "correction": {} - } - }, + "role": "user", + "content": "Start a countdown for 5 minutes." + } + ], + "tool_inventory": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Start a countdown for 5 minutes." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "set_timer", - "description": "Set a countdown timer", - "parameters": { - "type": "object", - "properties": { - "duration": { - "type": "string", - "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" - } - }, - "required": [ - "duration" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "set_timer", - "arguments": "{ \"duration\": \"300000\" }" - } - }, - "parameter_name": "duration", - "parameter_value": "300000" - }, - "output": { - "evidence": "The tool specification defines 'duration' as a string following the 'number + unit' format (e.g., '5 minutes'). The value '300000' is a string, but it lacks a time unit and does not follow the documented pattern.", - "explanation": "The value is likely intended to represent milliseconds, but the format is both undocumented and inconsistent with the specification. It fails both unit and pattern requirements, and cannot be reliably interpreted as valid input without assumptions or external conventions.", - "output": 2, - "confidence": 0.95, - "correction": { - "reason_types": [ - "FORMAT_ERROR", - "UNIT_ERROR" - ], - "reasons": "Missing explicit time unit and nonconformant format. 
Specification requires 'number + unit' format, which this value does not follow.", - "parameter": { - "duration": "5 minutes" - } + "type": "function", + "function": { + "name": "set_timer", + "description": "Set a countdown timer", + "parameters": { + "type": "object", + "properties": { + "duration": { + "type": "string", + "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" } + }, + "required": [ + "duration" + ] } - }, + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "set_timer", + "arguments": "{ \"duration\": \"300000\" }" + } + }, + "parameter_name": "duration", + "parameter_value": "300000" + }, + "output": { + "evidence": "The tool specification defines 'duration' as a string following the 'number + unit' format (e.g., '5 minutes'). The value '300000' is a string, but it lacks a time unit and does not follow the documented pattern.", + "explanation": "The value is likely intended to represent milliseconds, but the format is both undocumented and inconsistent with the specification. It fails both unit and pattern requirements, and cannot be reliably interpreted as valid input without assumptions or external conventions.", + "output": 2, + "confidence": 0.95, + "correction": { + "reason_types": [ + "FORMAT_ERROR", + "UNIT_ERROR" + ], + "reasons": "Missing explicit time unit and nonconformant format. Specification requires 'number + unit' format, which this value does not follow.", + "parameter": { + "duration": "5 minutes" + } + } + } + }, + { + "user_kwargs": { + "conversation_context": [ { - "user_kwargs": { - "conversation_context": [ - { - "role": "user", - "content": "Set a timer for five minutes please." - } - ], - "tool_inventory": [ - { - "type": "function", - "function": { - "name": "set_timer", - "description": "Set a countdown timer", - "parameters": { - "type": "object", - "properties": { - "duration": { - "type": "string", - "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" - } - }, - "required": [ - "duration" - ] - } - } - } - ], - "tool_call": { - "id": "call_001", - "type": "function", - "function": { - "name": "set_timer", - "arguments": "{ \"duration\": \"5Minutes\" }" - } - }, - "parameter_name": "duration", - "parameter_value": "5Minutes" - }, - "output": { - "evidence": "The specification explicitly defines the duration format as 'number + unit' with examples like '5 minutes'. The given value '5Minutes' omits the required space, resulting in a concatenated string.", - "explanation": "The input conveys the intended meaning and includes all required semantic elements (number and unit), but the lack of a space between them breaks the documented pattern. This deviation is moderate-it introduces a risk of failure in strict parsers that expect a space-separated format.", - "output": 3, - "confidence": 0.89, - "correction": { - "reason_types": [ - "FORMAT_ERROR" - ], - "reasons": "Value contains both components but violates the explicit formatting requirement of space separation.", - "parameter": { - "duration": "5 minutes" - } + "role": "user", + "content": "Set a timer for five minutes please." 
+ } + ], + "tool_inventory": [ + { + "type": "function", + "function": { + "name": "set_timer", + "description": "Set a countdown timer", + "parameters": { + "type": "object", + "properties": { + "duration": { + "type": "string", + "description": "Duration in the format 'number + unit' (e.g., '5 minutes', '30 seconds')" } + }, + "required": [ + "duration" + ] } + } + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "set_timer", + "arguments": "{ \"duration\": \"5Minutes\" }" + } + }, + "parameter_name": "duration", + "parameter_value": "5Minutes" + }, + "output": { + "evidence": "The specification explicitly defines the duration format as 'number + unit' with examples like '5 minutes'. The given value '5Minutes' omits the required space, resulting in a concatenated string.", + "explanation": "The input conveys the intended meaning and includes all required semantic elements (number and unit), but the lack of a space between them breaks the documented pattern. This deviation is moderate: it introduces a risk of failure in strict parsers that expect a space-separated format.", + "output": 3, + "confidence": 0.89, + "correction": { + "reason_types": [ + "FORMAT_ERROR" + ], + "reasons": "Value contains both components but violates the explicit formatting requirement of space separation.", + "parameter": { + "duration": "5 minutes" } + } + } + } + ] + } ] diff --git a/altk/pre_tool/sparc/function_calling/pipeline/adapters.py b/altk/pre_tool/sparc/function_calling/pipeline/adapters.py index d8b55ee3..4b7257bb 100644 --- a/altk/pre_tool/sparc/function_calling/pipeline/adapters.py +++ b/altk/pre_tool/sparc/function_calling/pipeline/adapters.py @@ -2,6 +2,7 @@ Any, Dict, List, + Literal, ) from altk.pre_tool.sparc.function_calling.pipeline.types import ( @@ -10,6 +11,18 @@ ) +CompactMode = Literal["auto", "never", "always"] +"""Compact tool-inventory rendering mode. + +- ``"auto"`` (default): use compact form (description + parameter name list only) + whenever the inventory has ``>= compact_tool_threshold`` tools, otherwise full + summary (description + {param_name: type}). Helps when the function-selection + prompt would otherwise balloon past the context window. +- ``"never"``: always use full summary (description + {param_name: type}). +- ``"always"``: always use compact form regardless of tool count. +""" + + # ──────────────────────────────────────────────────────────────────────────────── # Adapter definitions # ──────────────────────────────────────────────────────────────────────────────── @@ -41,16 +54,57 @@ def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]: class OpenAIAdapter(BaseAdapter): - """Adapter for ToolSpec + ToolCall inputs.""" - - def __init__(self, specs: List[ToolSpec], call: ToolCall): + """Adapter for ToolSpec + ToolCall inputs. + + Args: + specs: the full tool inventory for the current turn. + call: the tool call being judged. + compact_tool_schema: how to render the function-selection inventory. + ``"auto"`` (default) falls back to the compact form once the + inventory has ``>= compact_tool_threshold`` tools so the prompt + stays tractable when hundreds of tools are available. Single-tool + prompts (``get_tool_spec``) and the full dump (``get_tools_inventory``) + are unaffected. + compact_tool_threshold: tool-count threshold for ``"auto"`` mode + (default 20, matching SPARCReflectionConfig default).
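+
+    Example (illustrative sketch; ``specs`` and ``call`` stand for
+    pre-built ``ToolSpec``/``ToolCall`` objects supplied by the caller)::
+
+        adapter = OpenAIAdapter(specs, call, compact_tool_schema="auto",
+                                compact_tool_threshold=20)
+        # compact [param_name, ...] entries once len(specs) >= 20,
+        # full {param_name: type} entries otherwise
+        summary = adapter.get_tools_inventory_summary()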
+ """ + + def __init__( + self, + specs: List[ToolSpec], + call: ToolCall, + compact_tool_schema: CompactMode = "auto", + compact_tool_threshold: int = 20, + ): self.specs = specs self.call = call + self.compact_tool_schema: CompactMode = compact_tool_schema + self.compact_tool_threshold = compact_tool_threshold + + def _use_compact_summary(self) -> bool: + if self.compact_tool_schema == "always": + return True + if self.compact_tool_schema == "never": + return False + return len(self.specs) >= self.compact_tool_threshold def get_tools_inventory(self) -> List[Dict[str, Any]]: return [spec.model_dump() for spec in self.specs] def get_tools_inventory_summary(self) -> List[Dict[str, Any]]: + # Compact form: tool_description + parameter name list only. + # Drops type annotations to save tokens when many tools are present. + if self._use_compact_summary(): + return [ + { + "tool_name": spec.function.name, + "tool_description": spec.function.description, + "tool_parameters": list( + spec.function.parameters.get("properties", {}).keys() + ), + } + for spec in self.specs + ] return [ { "tool_name": spec.function.name, diff --git a/altk/pre_tool/sparc/function_calling/pipeline/pipeline.py b/altk/pre_tool/sparc/function_calling/pipeline/pipeline.py index 8ab6d8fa..ca27c1c3 100644 --- a/altk/pre_tool/sparc/function_calling/pipeline/pipeline.py +++ b/altk/pre_tool/sparc/function_calling/pipeline/pipeline.py @@ -66,6 +66,8 @@ def __init__( runtime_pipeline: Optional[bool] = True, use_examples: Optional[bool] = True, skip_static_checks: Optional[bool] = False, + compact_tool_schema: str = "auto", + compact_tool_threshold: int = 20, ): self.metrics_client = metrics_client if codegen_client is None: @@ -159,6 +161,8 @@ def __init__( metrics_client=self.metrics_client, codegen_client=self.codegen_client, transform_enabled=transform_enabled, + compact_tool_schema=compact_tool_schema, + compact_tool_threshold=compact_tool_threshold, ) # 4) Auto-detect if we should skip static checks diff --git a/altk/pre_tool/sparc/function_calling/pipeline/semantic_checker.py b/altk/pre_tool/sparc/function_calling/pipeline/semantic_checker.py index 6a08291d..651c3e9d 100644 --- a/altk/pre_tool/sparc/function_calling/pipeline/semantic_checker.py +++ b/altk/pre_tool/sparc/function_calling/pipeline/semantic_checker.py @@ -68,6 +68,8 @@ def __init__( parameter_metrics: Optional[List[Dict[str, Any]]] = None, codegen_client: Optional[ValidatingLLMClient] = None, transform_enabled: Optional[bool] = False, + compact_tool_schema: str = "auto", + compact_tool_threshold: int = 20, ) -> None: # Validate clients if not isinstance(metrics_client, ValidatingLLMClient): @@ -75,6 +77,8 @@ def __init__( self.metrics_client = metrics_client self.transform_enabled = transform_enabled + self.compact_tool_schema = compact_tool_schema + self.compact_tool_threshold = compact_tool_threshold self.codegen_client = codegen_client if not codegen_client or not isinstance(codegen_client, ValidatingLLMClient): self.codegen_client = metrics_client @@ -100,10 +104,20 @@ def __init__( def _make_adapter(self, apis_specs, tool_call): # Handle empty specs for tool-spec-free metrics if not apis_specs: - return OpenAIAdapter([], tool_call) + return OpenAIAdapter( + [], + tool_call, + compact_tool_schema=self.compact_tool_schema, + compact_tool_threshold=self.compact_tool_threshold, + ) first = apis_specs[0] if isinstance(first, ToolSpec): - return OpenAIAdapter(apis_specs, tool_call) + return OpenAIAdapter( + apis_specs, + tool_call, + 
compact_tool_schema=self.compact_tool_schema, + compact_tool_threshold=self.compact_tool_threshold, + ) raise TypeError("Unsupported spec type") def _collect_params(self, adapter: BaseAdapter) -> Dict[str, Any]: diff --git a/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py b/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py index 87017f24..e205f44f 100644 --- a/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py +++ b/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py @@ -102,7 +102,9 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: MULTI_EXTRACT_UNITS_SYSTEM: str = """\ You are an expert in natural language understanding and API specifications. Given: - 1. A user context (natural-language instructions). + 1. A user context (natural-language instructions). The context may include + a system prompt that anchors temporal references — e.g. a current + date, current year, or timezone. Treat those anchors as ground truth. 2. A JSON Schema snippet that describes **all** parameters the tool expects. 3. A list of all parameter names. @@ -115,6 +117,37 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: (If none, return an empty string `""`.) - A brief "transformation_summary" describing how to convert `user_value` to `spec_units_or_format`. +Grounding rules (apply before flagging a mismatch): + - Evidence priority when sources disagree: + system prompt > tool outputs > user messages > assistant messages. + - If the user supplies an under-specified value — one missing a + component such as a year, region, unit, or scale — and a + higher-priority source (system prompt, prior tool output) fixes + that missing component, COMPLETE the value from the anchor BEFORE + reporting `user_value`. The grounded value is a valid canonical + form; `transformation_summary` remains empty because no + code-level conversion is needed — only contextual completion. + In this case, also set `user_units_or_format` to match the + REPORTED grounded form (the value you actually emit as + `user_value`), NOT the user's colloquial phrasing — otherwise + downstream code generation will attempt a format conversion that + is not needed and will fail. + - If the user's value already matches the spec's units/format after + grounding, leave `transformation_summary` empty. Only populate + `transformation_summary` when a genuine conversion (unit change, + encoding change, reformatting) is required. + - Transformation is for VALUE-PRESERVING conversions only + (unit, encoding, format, pattern). If the agent's serialized + value and the user-stated value differ in substance (e.g. + different points, different dates, different identifiers) + rather than in representation, return an empty + `transformation_summary`. Semantic mismatches belong to the + grounding metrics, not a code-generation transformation. + - When evidence across sources is ambiguous or the value cannot be + confidently extracted, return empty strings rather than guessing. + A conservative no-op is preferable to a spurious "needs + transformation" verdict. + Respond with exactly one JSON object whose keys are the parameter names, and whose values are objects with "user_value", "user_units_or_format", and "spec_units_or_format". 
The JSON must match this schema exactly: @@ -247,6 +280,38 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: }} }} +4) Context: [{{"role":"system", "content":"The current year is 2024. All dates default to 2024 unless the user states otherwise."}}, + {{"role":"user", "content":"Book me a flight on May 20."}}, + {{"role":"assistant", "content":"{{\"id\":\"tool_call_4\",\"type\":\"function\",\"function\":{{\"name\":\"book_flight\",\"arguments\":{{\"date\":\"2024-05-20\"}}}}}}"}}] + Full Spec: + {{ + "name": "book_flight", + "description": "Book a flight on a given date.", + "parameters": {{ + "type": "object", + "properties": {{ + "date": {{ + "type": "string", + "format": "date", + "description": "Flight date in YYYY-MM-DD" + }} + }}, + "required": ["date"] + }} + }} + Parameter names: "date" + -> {{ + "date": {{ + "user_units_or_format":"yyyy-mm-dd", + "user_value":"2024-05-20", + "spec_units_or_format":"yyyy-mm-dd", + "transformation_summary":"" + }} + }} + (The user said "May 20" without a year; the system prompt anchors the + current year to 2024, so the grounded user_value is 2024-05-20 and no + transformation is needed.) + Context: {context} diff --git a/altk/pre_tool/sparc/sparc.py b/altk/pre_tool/sparc/sparc.py index 403c1bf7..010123ee 100644 --- a/altk/pre_tool/sparc/sparc.py +++ b/altk/pre_tool/sparc/sparc.py @@ -162,6 +162,9 @@ def _initialize_pipeline(self): general_metrics=self._config.general_metrics, function_metrics=self._config.function_metrics, parameter_metrics=self._config.parameter_metrics, + compact_tool_schema=self._config.compact_tool_schema, + compact_tool_threshold=self._config.compact_tool_threshold, + runtime_pipeline=self._config.runtime_pipeline, ) logger.info("Reflection pipeline initialized successfully") @@ -365,6 +368,90 @@ def _run_sync_pipeline( transform_enabled=self._config.transform_enabled, ) + @staticmethod + def _rubric_score(metric_result) -> Optional[float]: + """Extract the integer 1-5 rubric ``output`` from a semantic metric result. + + Returns ``None`` when the metric errored or didn't produce a numeric + output (e.g. raw_response missing, non-coercible).""" + raw = getattr(metric_result, "raw_response", None) + if not isinstance(raw, dict): + return None + val = raw.get("output") + try: + return float(val) if val is not None else None + except (TypeError, ValueError): + return None + + @staticmethod + def _rubric_confidence(metric_result) -> Optional[float]: + raw = getattr(metric_result, "raw_response", None) + if not isinstance(raw, dict): + return None + val = raw.get("confidence") + try: + return float(val) if val is not None else None + except (TypeError, ValueError): + return None + + @staticmethod + def _extract_recommendations(metric_result): + """Pull ``actionable_recommendations`` from a metric's LLM output. + + Only populated in evaluation-time mode — runtime metric JSONs do + not require the field and the model will omit it. Returns a list + of ``SPARCRecommendation`` (empty when absent or malformed). + """ + from altk.pre_tool.core.types import ( + SPARCRecommendation, + SPARCRecommendationTarget, + ) + + raw = getattr(metric_result, "raw_response", None) + if not isinstance(raw, dict): + return [] + recs = raw.get("actionable_recommendations") + if not isinstance(recs, list): + return [] + + out = [] + for item in recs: + if not isinstance(item, dict): + continue + # Normalize target enum — accept case variants defensively. 
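+                # e.g. a hypothetical item {"target": "Tool_Description", ...}
+                # normalizes to SPARCRecommendationTarget("tool_description")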
+ target_raw = item.get("target") + try: + target = SPARCRecommendationTarget(str(target_raw).lower()) + except (ValueError, AttributeError): + continue + diff = item.get("diff") or item.get("quote") or "" + rationale = item.get("rationale") or item.get("details") or "" + importance_raw = item.get("importance") + try: + importance = float(importance_raw) if importance_raw is not None else 0.5 + except (TypeError, ValueError): + importance = 0.5 + # Clamp into [0, 1]. + importance = max(0.0, min(1.0, importance)) + if not diff.strip(): + # Skip empty diffs — they are not actionable. + continue + try: + out.append( + SPARCRecommendation( + target=target, + tool_name=item.get("tool_name"), + parameter_name=item.get("parameter_name"), + diff=diff, + rationale=rationale, + importance=importance, + ) + ) + except Exception as exc: + logger.debug(f"Dropping malformed recommendation: {exc}") + continue + return out + def _process_pipeline_result( self, pipeline_result: PipelineResult ) -> SPARCReflectionResult: @@ -372,6 +459,13 @@ def _process_pipeline_result( issues = [] has_errors = False decision = SPARCReflectionDecision.APPROVE + # 1-5 rubric scores collected across every semantic metric that produced + # a numeric output, used to compute an aggregate rubric score later. + per_metric_scores: List[float] = [] + # Flat collection of recommendations across every semantic metric that + # emits them (evaluation-time mode only). Runtime metrics return an + # empty list here. + all_recs: List = [] # Check static issues if pipeline_result.static and not pipeline_result.static.final_decision: @@ -420,8 +514,19 @@ def _process_pipeline_result( "explanation", "" ), correction=metric_result.raw_response.get("correction"), + output_value=self._rubric_score(metric_result), + confidence=self._rubric_confidence(metric_result), + recommendations=self._extract_recommendations(metric_result), ) ) + # Always record the rubric score (issue or not) so the + # aggregate reflects judge confidence, not only rejections. + s = self._rubric_score(metric_result) + if s is not None: + per_metric_scores.append(s) + # Always harvest recommendations too — the LLM may flag + # prompt/spec gaps even on a pass-grade call. 
+ all_recs.extend(self._extract_recommendations(metric_result)) if not function_selection_issues: # General metrics - check for errors and issues @@ -458,8 +563,15 @@ def _process_pipeline_result( correction=metric_result.raw_response.get( "correction" ), + output_value=self._rubric_score(metric_result), + confidence=self._rubric_confidence(metric_result), + recommendations=self._extract_recommendations(metric_result), ) ) + s = self._rubric_score(metric_result) + if s is not None: + per_metric_scores.append(s) + all_recs.extend(self._extract_recommendations(metric_result)) # Parameter metrics - check for errors and issues if pipeline_result.semantic.parameter: @@ -496,8 +608,15 @@ def _process_pipeline_result( correction=metric_result.raw_response.get( "correction" ), + output_value=self._rubric_score(metric_result), + confidence=self._rubric_confidence(metric_result), + recommendations=self._extract_recommendations(metric_result), ) ) + s = self._rubric_score(metric_result) + if s is not None: + per_metric_scores.append(s) + all_recs.extend(self._extract_recommendations(metric_result)) # Transform results - check for errors and corrections if pipeline_result.semantic.transform: @@ -544,7 +663,15 @@ def _process_pipeline_result( else: decision = SPARCReflectionDecision.APPROVE + aggregate_score: Optional[float] = ( + sum(per_metric_scores) / len(per_metric_scores) + if per_metric_scores + else None + ) + return SPARCReflectionResult( decision=decision, issues=issues, + score=aggregate_score, + all_recommendations=all_recs, ) diff --git a/tests/core/test_validating_llm_client.py b/tests/core/test_validating_llm_client.py new file mode 100644 index 00000000..452e0347 --- /dev/null +++ b/tests/core/test_validating_llm_client.py @@ -0,0 +1,308 @@ +"""Tests for ``ValidatingLLMClient`` production knobs. + +Covers the behaviors migrated from CLEAR's monkey-patches: + +- ``free_form_object_as_str`` changes how free-form ``type: object`` fields are + modeled (as ``str`` in Pydantic) and loosens ``_validate`` to accept the + stringified form. Together these make SPARC/CLEAR compatible with OpenAI's + ``additionalProperties: false`` structured-output requirement. +- ``prompt_based_validation`` forces schema-into-system-prompt and skips + native ``response_format``. Targeted at providers like watsonx. +- ``default_generation_kwargs`` forwards e.g. ``max_tokens``/``temperature`` + into every ``generate`` call, with caller kwargs winning. +- The wrapped ``_parse_llm_response`` returns ``""`` on parse error and + warns when a reasoning-only response exhausted the budget. +""" + +from __future__ import annotations + +import logging +from types import SimpleNamespace +from typing import Any, Type + +import pytest + +from altk.core.llm.output_parser import ( + OutputValidationError, + ValidatingLLMClient, + json_schema_to_pydantic_model, + relax_freeform_object_schema, +) + + +# --------------------------------------------------------------------------- +# Dummy client used throughout — no actual LLM call. 
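+# (Each test scripts the responses by monkeypatching ``BaseLLMClient._generate``
+# through ``_install_scripted_generate`` below.)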
+# --------------------------------------------------------------------------- + + +class _FakeValidating(ValidatingLLMClient): + """Minimal concrete subclass — real ``_generate`` is stubbed via + ``monkeypatch`` in each test, because ``super()._generate`` inside + ``ValidatingLLMClient.generate`` bypasses subclass overrides.""" + + @classmethod + def provider_class(cls) -> Type[Any]: + return object + + def _register_methods(self) -> None: # noqa: D401 + pass + + def _setup_parameter_mapper(self) -> None: # noqa: D401 + pass + + def _parse_llm_response(self, raw: Any) -> str: + if isinstance(raw, Exception): + raise raw + return str(raw) + + def __init__(self, **kw): + super().__init__(**kw) + + +def _install_scripted_generate(monkeypatch, observed: list, scripted: list): + """Replace ``BaseLLMClient._generate`` so tests can intercept the call + that happens inside ``ValidatingLLMClient.generate``'s retry loop.""" + from altk.core.llm.base import BaseLLMClient + + def fake_generate(self, **kwargs): # noqa: ANN001 + observed.append(kwargs) + if not scripted: + return "" + raw = scripted.pop(0) + return self._parse_llm_response(raw) + + monkeypatch.setattr(BaseLLMClient, "_generate", fake_generate, raising=True) + + +# --------------------------------------------------------------------------- +# json_schema_to_pydantic_model — free_form_object_as_str +# --------------------------------------------------------------------------- + + +class TestJsonSchemaToPydantic: + def test_default_keeps_object_as_dict(self): + m = json_schema_to_pydantic_model( + {"type": "object", "properties": {"a": {"type": "object"}}} + ) + assert m.model_fields["a"].annotation is dict + + def test_freeform_object_flag_switches_to_str(self): + m = json_schema_to_pydantic_model( + {"type": "object", "properties": {"a": {"type": "object"}}}, + free_form_object_as_str=True, + ) + assert m.model_fields["a"].annotation is str + + def test_freeform_flag_keeps_nested_objects_as_dict(self): + # only free-form (no properties) converts; an object with properties + # keeps its dict shape (OpenAI can still satisfy additionalProperties + # when the sub-schema is fully specified). 
+ m = json_schema_to_pydantic_model(
+ {
+ "type": "object",
+ "properties": {
+ "flat": {"type": "object"},
+ "structured": {
+ "type": "object",
+ "properties": {"x": {"type": "string"}},
+ },
+ },
+ },
+ free_form_object_as_str=True,
+ )
+ assert m.model_fields["flat"].annotation is str
+ assert m.model_fields["structured"].annotation is dict
+
+
+# ---------------------------------------------------------------------------
+# relax_freeform_object_schema
+# ---------------------------------------------------------------------------
+
+
+class TestRelaxFreeformObjectSchema:
+ def test_relaxes_freeform_object(self):
+ out = relax_freeform_object_schema(
+ {"type": "object", "properties": {"a": {"type": "object"}}}
+ )
+ assert out["properties"]["a"]["type"] == ["object", "string"]
+
+ def test_leaves_structured_objects_alone(self):
+ schema = {
+ "type": "object",
+ "properties": {
+ "sub": {"type": "object", "properties": {"x": {"type": "string"}}}
+ },
+ }
+ out = relax_freeform_object_schema(schema)
+ assert out["properties"]["sub"]["type"] == "object"
+
+ def test_deep_copy_does_not_mutate_input(self):
+ schema = {"type": "object", "properties": {"a": {"type": "object"}}}
+ _ = relax_freeform_object_schema(schema)
+ assert schema["properties"]["a"]["type"] == "object"
+
+
+# ---------------------------------------------------------------------------
+# ValidatingLLMClient configuration surface
+# ---------------------------------------------------------------------------
+
+
+class TestValidatingLLMClientConfig:
+ def test_defaults(self):
+ c = _FakeValidating(client=object())
+ assert c.free_form_object_as_str is False
+ assert c.prompt_based_validation is False
+ assert c.default_generation_kwargs == {}
+
+ def test_init_kwargs(self):
+ c = _FakeValidating(
+ free_form_object_as_str=True,
+ prompt_based_validation=True,
+ default_generation_kwargs={"max_tokens": 42},
+ client=object(),
+ )
+ assert c.free_form_object_as_str is True
+ assert c.prompt_based_validation is True
+ assert c.default_generation_kwargs == {"max_tokens": 42}
+
+ def test_configure_validation_is_chainable(self):
+ c = _FakeValidating(client=object())
+ out = c.configure_validation(free_form_object_as_str=True)
+ assert out is c
+ assert c.free_form_object_as_str is True
+
+ def test_default_generation_kwargs_is_copied(self):
+ kw = {"max_tokens": 10}
+ c = _FakeValidating(default_generation_kwargs=kw, client=object())
+ c.default_generation_kwargs["temperature"] = 0.0
+ assert "temperature" not in kw, "caller's dict must not be mutated"
+
+
+# ---------------------------------------------------------------------------
+# _validate honors free_form_object_as_str
+# ---------------------------------------------------------------------------
+
+
+class TestValidatorRelaxation:
+ _schema = {"type": "object", "properties": {"a": {"type": "object"}}}
+
+ def test_strict_rejects_json_string_for_object_field(self):
+ c = _FakeValidating(client=object())  # default: strict
+ with pytest.raises(OutputValidationError):
+ c._validate('{"a": "{\\"k\\": 1}"}', self._schema)
+
+ def test_relaxed_accepts_json_string_for_object_field(self):
+ c = _FakeValidating(
+ free_form_object_as_str=True, client=object()
+ )
+ # The LLM returned {"a": "<string>"} — still
+ # valid with relaxed schema.
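+        # (The relaxed validator widens the field's accepted type to
+        # ["object", "string"], mirroring relax_freeform_object_schema.)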
+ got = c._validate('{"a": "arbitrary JSON-ish"}', self._schema)
+ assert got == {"a": "arbitrary JSON-ish"}
+
+ def test_relaxed_still_accepts_normal_object(self):
+ c = _FakeValidating(
+ free_form_object_as_str=True, client=object()
+ )
+ assert c._validate('{"a": {"k": 1}}', self._schema) == {"a": {"k": 1}}
+
+
+# ---------------------------------------------------------------------------
+# generate() — prompt_based_validation + default_generation_kwargs
+# ---------------------------------------------------------------------------
+
+
+class TestPromptBasedValidation:
+ def test_prompt_based_injects_schema_into_system_prompt(self, monkeypatch):
+ observed: list = []
+ _install_scripted_generate(monkeypatch, observed, ['{"a": "ok"}'])
+ c = _FakeValidating(prompt_based_validation=True, client=object())
+ out = c.generate(
+ [{"role": "user", "content": "hi"}],
+ schema={"type": "object", "properties": {"a": {"type": "string"}}},
+ )
+ assert out == {"a": "ok"}
+ observed_prompt = observed[-1]["prompt"]
+ assert observed_prompt[0]["role"] == "system"
+ assert "JSON Schema" in observed_prompt[0]["content"]
+ assert "response_format" not in observed[-1]
+
+
+class TestDefaultGenerationKwargs:
+ def test_defaults_applied_when_caller_does_not_set(self, monkeypatch):
+ observed: list = []
+ _install_scripted_generate(monkeypatch, observed, ['{"a": "ok"}'])
+ c = _FakeValidating(
+ prompt_based_validation=True,
+ default_generation_kwargs={"max_tokens": 123, "temperature": 0.0},
+ client=object(),
+ )
+ c.generate([], schema={"type": "object", "properties": {"a": {"type": "string"}}})
+ obs = observed[-1]
+ assert obs["max_tokens"] == 123
+ assert obs["temperature"] == 0.0
+
+ def test_caller_kwargs_win_over_defaults(self, monkeypatch):
+ observed: list = []
+ _install_scripted_generate(monkeypatch, observed, ['{"a": "ok"}'])
+ c = _FakeValidating(
+ prompt_based_validation=True,
+ default_generation_kwargs={"max_tokens": 123},
+ client=object(),
+ )
+ c.generate(
+ [],
+ schema={"type": "object", "properties": {"a": {"type": "string"}}},
+ max_tokens=999,
+ )
+ assert observed[-1]["max_tokens"] == 999
+
+
+# ---------------------------------------------------------------------------
+# Wrapped _parse_llm_response: empty + reasoning-budget exhaustion
+# ---------------------------------------------------------------------------
+
+
+class TestSafeParse:
+ def test_value_error_becomes_empty_string(self):
+ c = _FakeValidating(client=object())
+ # Wrapped parser returns "" (retry-worthy) on ValueError/KeyError
+ # instead of propagating.
+ assert c._parse_llm_response(ValueError("broken")) == ""
+
+ def test_key_error_becomes_empty_string(self):
+ c = _FakeValidating(client=object())
+ assert c._parse_llm_response(KeyError("missing")) == ""
+
+ def test_reasoning_budget_warning_logged(self, caplog):
+ # A litellm-shaped response: reasoning_content set, finish_reason=length,
+ # content missing — the classic reasoning-budget exhaustion pattern.
+ raw = {
+ "choices": [
+ {
+ "message": {"reasoning_content": "long thinking..."},
+ "finish_reason": "length",
+ }
+ ]
+ }
+ # _C2's original parser always raises ValueError; the safe wrapper
+ # should swallow it and emit the targeted reasoning-budget warning.
+ class _C2(_FakeValidating): + def _parse_llm_response(self, raw): # will be wrapped + raise ValueError("empty") + + with caplog.at_level(logging.WARNING, logger="altk.core.llm.output_parser"): + c2 = _C2(client=object()) + out = c2._parse_llm_response(raw) + assert out == "" + assert any("reasoning" in r.message.lower() for r in caplog.records) diff --git a/tests/pre_tool/sparc/test_adapter_compact_schema.py b/tests/pre_tool/sparc/test_adapter_compact_schema.py new file mode 100644 index 00000000..d2fb6472 --- /dev/null +++ b/tests/pre_tool/sparc/test_adapter_compact_schema.py @@ -0,0 +1,204 @@ +"""Tests for the compact tool-schema rendering mode on ``OpenAIAdapter``. + +The compact mode is intended for function-selection prompts when the inventory +is so large that full {param_name: type} summaries would blow the context +window (e.g. appworld's 457 tools). Compact emits a list of parameter names +only — no types. + +Covers: + - ``"never"`` renders the legacy {name: type} summary regardless of size. + - ``"always"`` renders the compact [name, ...] summary regardless of size. + - ``"auto"`` (default) switches based on ``compact_tool_threshold``. + - Single-tool paths (``get_tool_spec``, ``get_tools_inventory``) are + unaffected. + - Pipeline-/component-level config forwards the flags to the adapter. +""" + +from __future__ import annotations + +from typing import Any, Dict, List + +import pytest + +from altk.pre_tool.core.config import SPARCReflectionConfig +from altk.pre_tool.sparc.function_calling.pipeline.adapters import OpenAIAdapter +from altk.pre_tool.sparc.function_calling.pipeline.types import ToolCall, ToolSpec + + +def _make_spec(name: str, params: List[str]) -> ToolSpec: + return ToolSpec.model_validate( + { + "type": "function", + "function": { + "name": name, + "description": f"desc of {name}", + "parameters": { + "type": "object", + "properties": {p: {"type": "string", "description": p} for p in params}, + "required": [], + }, + }, + } + ) + + +def _call() -> ToolCall: + return ToolCall.model_validate( + {"id": "c", "type": "function", "function": {"name": "tool_0", "arguments": "{}"}} + ) + + +@pytest.fixture +def small_inventory() -> List[ToolSpec]: + return [_make_spec(f"tool_{i}", ["a", "b", "c"]) for i in range(5)] + + +@pytest.fixture +def large_inventory() -> List[ToolSpec]: + return [_make_spec(f"big_{i}", ["x", "y"]) for i in range(25)] + + +# --------------------------------------------------------------------------- +# summary rendering — compact_tool_schema modes +# --------------------------------------------------------------------------- + + +class TestCompactSchemaModes: + def test_never_keeps_full_summary_regardless_of_size(self, small_inventory, large_inventory): + for specs in (small_inventory, large_inventory): + ad = OpenAIAdapter(specs, _call(), compact_tool_schema="never") + summary = ad.get_tools_inventory_summary() + assert len(summary) == len(specs) + for entry in summary: + assert isinstance(entry["tool_parameters"], dict), ( + "never mode should emit {param_name: type} dicts" + ) + + def test_always_uses_compact_regardless_of_size(self, small_inventory, large_inventory): + for specs in (small_inventory, large_inventory): + ad = OpenAIAdapter(specs, _call(), compact_tool_schema="always") + summary = ad.get_tools_inventory_summary() + assert len(summary) == len(specs) + for entry in summary: + assert isinstance(entry["tool_parameters"], list), ( + "always mode should emit a list of parameter names" + ) + + def 
test_auto_stays_full_under_threshold(self, small_inventory): + # default threshold = 20; 5 tools should stay full + ad = OpenAIAdapter(small_inventory, _call()) + summary = ad.get_tools_inventory_summary() + assert isinstance(summary[0]["tool_parameters"], dict) + + def test_auto_switches_compact_at_or_above_threshold(self, large_inventory): + # default threshold = 20; 25 tools should flip to compact + ad = OpenAIAdapter(large_inventory, _call()) + summary = ad.get_tools_inventory_summary() + assert isinstance(summary[0]["tool_parameters"], list) + assert summary[0]["tool_parameters"] == ["x", "y"] + + def test_auto_threshold_override(self, small_inventory): + # With threshold=3, even 5 tools triggers compact + ad = OpenAIAdapter(small_inventory, _call(), compact_tool_threshold=3) + summary = ad.get_tools_inventory_summary() + assert isinstance(summary[0]["tool_parameters"], list) + + def test_auto_threshold_off_by_one(self, small_inventory): + # exactly-at-threshold should flip to compact (>= not >) + ad = OpenAIAdapter(small_inventory, _call(), compact_tool_threshold=5) + assert ad._use_compact_summary() is True + ad = OpenAIAdapter(small_inventory, _call(), compact_tool_threshold=6) + assert ad._use_compact_summary() is False + + +# --------------------------------------------------------------------------- +# single-tool surfaces — must NOT be affected by compact mode +# --------------------------------------------------------------------------- + + +class TestSingleToolSurfacesUnchanged: + def test_get_tool_spec_is_full_dump_in_all_modes(self, large_inventory): + for mode in ("auto", "always", "never"): + ad = OpenAIAdapter(large_inventory, _call(), compact_tool_schema=mode) + spec = ad.get_tool_spec("big_0") + # full single-tool dump keeps the schema + assert "parameters" in spec + assert "properties" in spec["parameters"] + assert spec["parameters"]["properties"]["x"]["type"] == "string" + + def test_get_tools_inventory_is_full_in_all_modes(self, large_inventory): + for mode in ("auto", "always", "never"): + ad = OpenAIAdapter(large_inventory, _call(), compact_tool_schema=mode) + full = ad.get_tools_inventory() + # each entry is a ToolSpec model_dump, which includes the full + # function schema under .function.parameters + assert len(full) == len(large_inventory) + for entry in full: + assert "function" in entry + assert "parameters" in entry["function"] + + +# --------------------------------------------------------------------------- +# config propagation +# --------------------------------------------------------------------------- + + +class TestConfigDefaults: + def test_defaults(self): + cfg = SPARCReflectionConfig() + assert cfg.compact_tool_schema == "auto" + assert cfg.compact_tool_threshold == 20 + + def test_config_override(self): + cfg = SPARCReflectionConfig(compact_tool_schema="always", compact_tool_threshold=5) + assert cfg.compact_tool_schema == "always" + assert cfg.compact_tool_threshold == 5 + + def test_invalid_mode_rejected(self): + with pytest.raises(Exception): # pydantic will raise ValidationError + SPARCReflectionConfig(compact_tool_schema="garbage") + + def test_invalid_threshold_rejected(self): + with pytest.raises(Exception): # ge=1 constraint + SPARCReflectionConfig(compact_tool_threshold=0) + + +class TestPipelineForwarding: + """Confirm compact_tool_schema flows Pipeline -> SemanticChecker -> Adapter.""" + + def test_semantic_checker_forwards_to_adapter(self, large_inventory): + from altk.core.llm import ValidatingLLMClient + from 
altk.pre_tool.sparc.function_calling.pipeline.semantic_checker import ( + SemanticChecker, + ) + + # minimal dummy ValidatingLLMClient — the checker only needs it + # present to construct itself; we are not calling generate here. + class _Dummy(ValidatingLLMClient): # type: ignore[misc] + @classmethod + def provider_class(cls): + return object + + def _register_methods(self) -> None: # noqa: D401 + pass + + def _parse_llm_response(self, raw): # noqa: D401 + return str(raw) + + def _setup_parameter_mapper(self) -> None: + pass + + client = _Dummy(client=object()) + checker = SemanticChecker( + metrics_client=client, + compact_tool_schema="always", + compact_tool_threshold=42, + ) + adapter = checker._make_adapter(large_inventory, _call()) + assert adapter.compact_tool_schema == "always" + assert adapter.compact_tool_threshold == 42 + + # And an empty-specs path should still propagate the config + adapter_empty = checker._make_adapter([], _call()) + assert adapter_empty.compact_tool_schema == "always" + assert adapter_empty.compact_tool_threshold == 42 diff --git a/tests/pre_tool/sparc/test_eval_metrics_schema.py b/tests/pre_tool/sparc/test_eval_metrics_schema.py new file mode 100644 index 00000000..e63b8626 --- /dev/null +++ b/tests/pre_tool/sparc/test_eval_metrics_schema.py @@ -0,0 +1,214 @@ +"""Schema + hygiene tests for the evaluation-time metric JSONs. + +Runtime metrics are faster and omit recommendations. Evaluation-time +metrics include ``actionable_recommendations`` in the LLM output schema +and every example demonstrates the expected shape. This test locks the +schema in place so a future regen can't silently drift. +""" + +from __future__ import annotations + +import json +import re +from pathlib import Path + +import pytest + + +ROOT = Path(__file__).resolve().parents[3] / "altk/pre_tool/sparc/function_calling/metrics" + +EVAL_GENERAL = ROOT / "function_call/general_metrics.json" +EVAL_FUNCSEL = ROOT / "function_selection/function_selection_metrics.json" +EVAL_PARAM = ROOT / "parameter/parameter_metrics.json" + +RUNTIME_GENERAL = ROOT / "function_call/general_metrics_runtime.json" +RUNTIME_FUNCSEL = ROOT / "function_selection/function_selection_metrics_runtime.json" +RUNTIME_PARAM = ROOT / "parameter/parameter_metrics_runtime.json" + + +ALL_EVAL = [EVAL_GENERAL, EVAL_FUNCSEL, EVAL_PARAM] +ALL_RUNTIME = [RUNTIME_GENERAL, RUNTIME_FUNCSEL, RUNTIME_PARAM] + + +ALLOWED_TARGETS = { + "system_prompt", + "tool_description", + "parameter_description", + "parameter_examples", +} + +REQUIRED_ITEM_KEYS = ("target", "diff", "rationale", "importance") + + +@pytest.fixture(scope="module", params=ALL_EVAL, ids=[p.name for p in ALL_EVAL]) +def eval_metrics(request): + return json.loads(request.param.read_text()), request.param + + +@pytest.fixture(scope="module", params=ALL_RUNTIME, ids=[p.name for p in ALL_RUNTIME]) +def runtime_metrics(request): + return json.loads(request.param.read_text()), request.param + + +class TestEvalSchema: + def test_actionable_recommendations_is_required(self, eval_metrics): + data, path = eval_metrics + for m in data: + req = m["jsonschema"].get("required", []) + assert "actionable_recommendations" in req, ( + f"{path.name}::{m['name']} missing actionable_recommendations " + f"from required" + ) + + def test_schema_is_unified_diff_shape(self, eval_metrics): + data, path = eval_metrics + for m in data: + ar = m["jsonschema"]["properties"]["actionable_recommendations"] + assert ar["type"] == "array" + item = ar["items"] + assert item["type"] == "object" + # Required 
fields + assert set(item.get("required", [])) >= set(REQUIRED_ITEM_KEYS), ( + f"{path.name}::{m['name']} missing required keys in rec item" + ) + props = item["properties"] + # Target must be the closed enum + assert set(props["target"]["enum"]) == ALLOWED_TARGETS, ( + f"{path.name}::{m['name']} target enum must be exactly {ALLOWED_TARGETS}" + ) + # Importance must be [0, 1] + assert props["importance"]["minimum"] == 0 + assert props["importance"]["maximum"] == 1 + # No legacy fields (quote / recommendation enum / details) remain + for legacy in ("quote", "recommendation", "details"): + assert legacy not in props, ( + f"{path.name}::{m['name']} legacy field {legacy!r} still in schema" + ) + + def test_no_legacy_recommendation_enum_in_schema(self, eval_metrics): + data, path = eval_metrics + for m in data: + blob = json.dumps(m["jsonschema"]) + # The old schema embedded enums with the uppercase tokens below. + for legacy_enum in ( + "SYSTEM_PROMPT_INSTRUCTION", + "TOOL_DOCUMENTATION", + "TOOL_USAGE_EXAMPLES", + "PARAMETER_DOCUMENTATION", + "PARAMETER_EXAMPLES", + "PARAMETER_FORMAT_DOCUMENTATION", + "INSTRUCTIONS_ADDITIONS", + "SYSTEM_PROMPT_ADDITIONS", + "PREREQUISITE_TRACKING", + "TOOL_DEPENDENCY_DOCUMENTATION", + ): + assert legacy_enum not in blob, ( + f"{path.name}::{m['name']} still references legacy rec enum {legacy_enum!r}" + ) + + +class TestEvalExamples: + def test_every_example_has_a_rec_list(self, eval_metrics): + data, path = eval_metrics + for m in data: + for i, ex in enumerate(m.get("examples", [])): + recs = ex["output"].get("actionable_recommendations") + assert isinstance(recs, list), ( + f"{path.name}::{m['name']} example[{i}] must have an " + f"actionable_recommendations list (possibly empty)" + ) + + def test_every_rec_has_required_fields(self, eval_metrics): + data, path = eval_metrics + for m in data: + for i, ex in enumerate(m.get("examples", [])): + for j, rec in enumerate( + ex["output"].get("actionable_recommendations", []) + ): + for key in REQUIRED_ITEM_KEYS: + assert key in rec, ( + f"{path.name}::{m['name']} ex[{i}].rec[{j}] " + f"missing required key {key!r}" + ) + + def test_importance_in_unit_interval(self, eval_metrics): + data, path = eval_metrics + for m in data: + for i, ex in enumerate(m.get("examples", [])): + for j, rec in enumerate( + ex["output"].get("actionable_recommendations", []) + ): + assert 0.0 <= float(rec["importance"]) <= 1.0, ( + f"{path.name}::{m['name']} ex[{i}].rec[{j}] " + f"importance out of range" + ) + + def test_target_is_in_allowed_enum(self, eval_metrics): + data, path = eval_metrics + for m in data: + for i, ex in enumerate(m.get("examples", [])): + for j, rec in enumerate( + ex["output"].get("actionable_recommendations", []) + ): + assert rec["target"] in ALLOWED_TARGETS, ( + f"{path.name}::{m['name']} ex[{i}].rec[{j}] target " + f"{rec['target']!r} not in {ALLOWED_TARGETS}" + ) + + def test_tool_scoped_target_has_tool_name(self, eval_metrics): + data, path = eval_metrics + for m in data: + for i, ex in enumerate(m.get("examples", [])): + for j, rec in enumerate( + ex["output"].get("actionable_recommendations", []) + ): + if rec["target"] in ( + "tool_description", + "parameter_description", + "parameter_examples", + ): + assert rec.get("tool_name"), ( + f"{path.name}::{m['name']} ex[{i}].rec[{j}] " + f"target={rec['target']} requires a tool_name" + ) + + def test_parameter_scoped_target_has_parameter_name(self, eval_metrics): + data, path = eval_metrics + for m in data: + for i, ex in enumerate(m.get("examples", [])): + for 
j, rec in enumerate( + ex["output"].get("actionable_recommendations", []) + ): + if rec["target"] in ( + "parameter_description", + "parameter_examples", + ): + assert rec.get("parameter_name"), ( + f"{path.name}::{m['name']} ex[{i}].rec[{j}] " + f"target={rec['target']} requires a parameter_name" + ) + + def test_diff_is_unified_format(self, eval_metrics): + data, path = eval_metrics + # Unified-diff header pattern: ``--- a/\n+++ b/\n@@`` anywhere. + header_re = re.compile(r"^--- a/.+\n\+\+\+ b/.+\n@@", re.MULTILINE) + for m in data: + for i, ex in enumerate(m.get("examples", [])): + for j, rec in enumerate( + ex["output"].get("actionable_recommendations", []) + ): + assert header_re.search(rec["diff"]), ( + f"{path.name}::{m['name']} ex[{i}].rec[{j}] diff is " + f"not unified-diff format:\n{rec['diff']!r}" + ) + + +class TestRuntimeDoesNotRequireRecs: + def test_runtime_does_not_require_actionable_recommendations(self, runtime_metrics): + data, path = runtime_metrics + for m in data: + req = m["jsonschema"].get("required", []) + assert "actionable_recommendations" not in req, ( + f"{path.name}::{m['name']} must NOT require " + f"actionable_recommendations in runtime mode" + ) diff --git a/tests/pre_tool/sparc/test_no_benchmark_leakage.py b/tests/pre_tool/sparc/test_no_benchmark_leakage.py new file mode 100644 index 00000000..79c8d7ac --- /dev/null +++ b/tests/pre_tool/sparc/test_no_benchmark_leakage.py @@ -0,0 +1,163 @@ +"""Guardrail: SPARC prompts must not leak benchmark-specific names. + +The SPARC metrics ship with CLEAR, τ-bench retail/airline, and AppWorld +as the closest examples of upstream use cases — but the prompts are +consumed by arbitrary tool-calling agents (Claude Code, production +assistants, user-defined agents). Any benchmark-specific token that +slipped into a shared prompt would bias future judgments toward those +use cases. + +This test checks that all prompt sources that are ALWAYS loaded into a +judge's context are free of a blocklist of benchmark-specific terms. +Few-shot examples inside user prompts (and the ``examples`` arrays of +metric JSONs, which are concrete demonstrations) are intentionally +excluded — concrete grounding examples are load-bearing in few-shot +learning and do NOT bias the judge in the same way as a rule. +""" + +from __future__ import annotations + +import json +import re +from pathlib import Path + +import pytest + + +ROOT = Path(__file__).resolve().parents[3] / "altk/pre_tool/sparc/function_calling" +RUNTIME_GENERAL = ROOT / "metrics/function_call/general_metrics_runtime.json" +RUNTIME_FUNCSEL = ROOT / "metrics/function_selection/function_selection_metrics_runtime.json" +RUNTIME_PARAM = ROOT / "metrics/parameter/parameter_metrics_runtime.json" +EVAL_GENERAL = ROOT / "metrics/function_call/general_metrics.json" +EVAL_FUNCSEL = ROOT / "metrics/function_selection/function_selection_metrics.json" +EVAL_PARAM = ROOT / "metrics/parameter/parameter_metrics.json" +COMMON = ROOT / "metrics/common_principles.py" +TRANSFORM = ROOT / "pipeline/transformation_prompts.py" + + +# Terms that must not appear in ALWAYS-loaded prompt text (task descriptions, +# output-rubric descriptions, common principles, transformation system prompt). +# Names we know are specific to the public benchmarks we've tested against. 
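+#
+# As a minimal sketch (the candidate string here is hypothetical), any new
+# always-loaded prompt surface can be pre-screened against this blocklist:
+#
+#     candidate = "Prefer information-gathering calls before acting."
+#     leaked = [t for t in BENCHMARK_TERMS if t in candidate.lower()]
+#     assert not leaked, f"benchmark-specific terms leaked: {leaked}"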
+BENCHMARK_TERMS = (
+    "tau-bench",
+    "τ-bench",
+    "taubench",
+    "appworld",
+    "tau2_retail",
+    "tau2_airline",
+    # Function-name / resource-name shapes unique to τ-bench domains:
+    "reservation_id",
+    "book_flight",
+    # The "May 20" / "2024-05-20" partial-date anchors that previously
+    # leaked in from the transformation prompt:
+    "may 20",
+    "2024-05-20",
+    # A named demo restaurant that appeared in a CLEAR test fixture:
+    "the french bistro",
+)
+
+# Function-name prefix enumerations that biased the confirmation-scope rule
+# toward τ-bench tool naming. Conceptual phrasing ("persistent state",
+# "information-gathering") is preferred.
+PREFIX_ENUMERATIONS = ("get_*", "find_*", "search_*", "list_*", "show_*")
+
+
+def _task_descriptions(path: Path) -> list[str]:
+    """Return the task_description + output rubric description of every
+    metric in a metrics JSON (runtime or eval) — i.e. the strings the judge
+    sees every request. Examples / few-shots are intentionally excluded."""
+    data = json.loads(path.read_text())
+    out: list[str] = []
+    for metric in data:
+        td = metric.get("task_description", "")
+        out.append(td)
+        props = metric.get("jsonschema", {}).get("properties", {})
+        out.append(props.get("output", {}).get("description", "") or "")
+    return out
+
+
+def _system_prompt_constants_from_transformation() -> str:
+    """Pull just the system-prompt constants from transformation_prompts.
+
+    The module also exposes ``*_USER`` constants that contain few-shot
+    examples — concrete demonstrations are expected to be specific and
+    are exempt from the domain-leak blocklist.
+    """
+    from altk.pre_tool.sparc.function_calling.pipeline import transformation_prompts as tp
+
+    parts = []
+    for name in ("MULTI_EXTRACT_UNITS_SYSTEM", "GENERATE_CODE_SYSTEM"):
+        val = getattr(tp, name, None)
+        if isinstance(val, str):
+            parts.append(val)
+    return "\n\n".join(parts)
+
+
+def _common_principles_text() -> str:
+    from altk.pre_tool.sparc.function_calling.metrics.common_principles import (
+        COMMON_PRINCIPLES,
+    )
+
+    return COMMON_PRINCIPLES
+
+
+@pytest.fixture(scope="module")
+def shared_corpus() -> list[tuple[str, str]]:
+    """(label, text) pairs of every ALWAYS-loaded prompt surface.
+
+    Explicitly excluded: few-shot example constants (``*_USER``), the
+    ``examples`` array inside metric JSONs. Concrete demonstrations in
+    those locations are expected to be specific and do NOT bias the
+    judge the way a general rule would.
+    """
+    blobs: list[tuple[str, str]] = []
+    for path in (
+        RUNTIME_GENERAL,
+        RUNTIME_FUNCSEL,
+        RUNTIME_PARAM,
+        EVAL_GENERAL,
+        EVAL_FUNCSEL,
+        EVAL_PARAM,
+    ):
+        for i, td in enumerate(_task_descriptions(path)):
+            blobs.append((f"{path.name}[{i}]", td))
+    blobs.append(("common_principles.COMMON_PRINCIPLES", _common_principles_text()))
+    blobs.append(
+        ("transformation_prompts.*_SYSTEM", _system_prompt_constants_from_transformation())
+    )
+    return blobs
+
+
+@pytest.mark.parametrize("term", BENCHMARK_TERMS)
+def test_no_benchmark_term_in_shared_prompts(shared_corpus, term):
+    lowered_corpus = [(label, text.lower()) for label, text in shared_corpus]
+    offenders = [label for label, text in lowered_corpus if term in text]
+    assert not offenders, (
+        f"benchmark-specific term {term!r} leaked into shared prompt text: "
+        f"{offenders}"
+    )
+
+
+@pytest.mark.parametrize("prefix", PREFIX_ENUMERATIONS)
+def test_no_prefix_enumeration_in_shared_rules(shared_corpus, prefix):
+    # Function-name prefix enumerations biased the confirmation-scope
+    # rule. They must not appear in shared task_descriptions or in the
+    # common_principles / transformation_prompts modules (always loaded).
+    # Few-shot examples are excluded from `shared_corpus` by design.
+    offenders = [label for label, text in shared_corpus if prefix in text]
+    assert not offenders, (
+        f"function-name prefix enumeration {prefix!r} must not appear in "
+        f"shared prompt rules: {offenders}"
+    )
+
+
+def test_mutating_as_rule_keyword_is_gone(shared_corpus):
+    # The literal uppercase "MUTATING" used to be the anchor for the
+    # confirmation-scope rule. It made the text feel benchmark-specific;
+    # lowercase "mutating" as a descriptive adjective is fine. Assert
+    # the shouty form is gone from rules.
+    pattern = re.compile(r"\bMUTATING\b")
+    offenders = [label for label, text in shared_corpus if pattern.search(text)]
+    assert not offenders, (
+        f"uppercase 'MUTATING' must not appear in shared prompt rules: {offenders}"
+    )
diff --git a/tests/pre_tool/sparc/test_reflection_score.py b/tests/pre_tool/sparc/test_reflection_score.py
new file mode 100644
index 00000000..b96fa5ba
--- /dev/null
+++ b/tests/pre_tool/sparc/test_reflection_score.py
@@ -0,0 +1,289 @@
+"""Tests for per-metric rubric score + aggregate score in SPARC output.
+
+Covers:
+- ``SPARCReflectionIssue.output_value`` / ``.confidence`` carry per-metric
+  numeric context when available.
+- ``SPARCReflectionResult.score`` aggregates across all semantic metrics
+  that produced a rating (approved or not).
+- ``.approved`` boolean and ``.normalized_score`` 0-1 helper.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+
+from altk.pre_tool.core.types import (
+    SPARCReflectionDecision,
+    SPARCReflectionIssue,
+    SPARCReflectionIssueType,
+    SPARCReflectionResult,
+)
+from altk.pre_tool.sparc.sparc import SPARCReflectionComponent
+
+
+def _metric(is_issue: bool, output: float | None, confidence: float = 0.9, error: str = ""):
+    raw = {"output": output, "confidence": confidence, "explanation": "e", "correction": None}
+    return SimpleNamespace(
+        is_issue=is_issue, raw_response=raw, error=error
+    )
+
+
+class _PipelineResult(SimpleNamespace):
+    pass
+
+
+# ---------------------------------------------------------------------------
+# Reflection result helpers
+# ---------------------------------------------------------------------------
+
+
+class TestResultHelpers:
+    def test_approved_shortcut_true(self):
+        r = SPARCReflectionResult(decision=SPARCReflectionDecision.APPROVE)
+        assert r.approved is True
+
+    def test_approved_shortcut_false(self):
+        r = SPARCReflectionResult(decision=SPARCReflectionDecision.REJECT)
+        assert r.approved is False
+
+    def test_normalized_score_none_when_unset(self):
+        r = SPARCReflectionResult(decision=SPARCReflectionDecision.APPROVE)
+        assert r.normalized_score is None
+
+    @pytest.mark.parametrize(
+        "raw,expected",
+        [
+            (1.0, 0.0),
+            (3.0, 0.5),
+            (5.0, 1.0),
+            (4.5, 0.875),
+            (0.0, 0.0),  # clamped
+            (6.0, 1.0),  # clamped
+        ],
+    )
+    def test_normalized_score_mapping(self, raw, expected):
+        r = SPARCReflectionResult(decision=SPARCReflectionDecision.APPROVE, score=raw)
+        assert r.normalized_score == pytest.approx(expected)
+
+
+# ---------------------------------------------------------------------------
+# _rubric_score extraction
+# ---------------------------------------------------------------------------
+
+
+class TestRubricExtraction:
+    def test_valid_output(self):
+        m = _metric(is_issue=False, output=4.0)
+        assert
SPARCReflectionComponent._rubric_score(m) == 4.0 + assert SPARCReflectionComponent._rubric_confidence(m) == 0.9 + + def test_missing_output(self): + m = SimpleNamespace(raw_response={"confidence": 0.5}) + assert SPARCReflectionComponent._rubric_score(m) is None + + def test_non_numeric_output(self): + m = SimpleNamespace(raw_response={"output": "not-a-number"}) + assert SPARCReflectionComponent._rubric_score(m) is None + + def test_no_raw_response(self): + m = SimpleNamespace(error="oops") + assert SPARCReflectionComponent._rubric_score(m) is None + + +# --------------------------------------------------------------------------- +# End-to-end aggregation via _process_pipeline_result +# --------------------------------------------------------------------------- + + +def _build_component(): + """Create a minimal SPARCReflectionComponent for direct _process_pipeline_result + calls, bypassing LLM-client validation.""" + from altk.core.toolkit import ComponentConfig + + class _BareComponent(SPARCReflectionComponent): + def __init__(self): # noqa: D401 + # Skip __init__ chain — we only exercise _process_pipeline_result. + pass + + return _BareComponent() + + +def _pipeline(general=None, function_selection=None, parameter=None, transform=None, static=None): + """Build a PipelineResult-shaped SimpleNamespace.""" + return _PipelineResult( + static=static, + semantic=SimpleNamespace( + general=SimpleNamespace(metrics=general or {}) if general is not None else None, + function_selection=( + SimpleNamespace(metrics=function_selection or {}) + if function_selection is not None + else None + ), + parameter=parameter or {}, + transform=transform or {}, + ), + ) + + +class TestAggregateScore: + def test_all_approved(self): + comp = _build_component() + pr = _pipeline( + general={"g1": _metric(False, 5.0), "g2": _metric(False, 4.0)}, + function_selection={"f1": _metric(False, 5.0)}, + ) + result = comp._process_pipeline_result(pr) + assert result.decision == SPARCReflectionDecision.APPROVE + assert result.approved is True + assert result.score == pytest.approx((5 + 4 + 5) / 3) + assert result.issues == [] + + def test_rejected_with_per_issue_score(self): + comp = _build_component() + bad = _metric(True, 2.0, confidence=0.8) + good = _metric(False, 5.0) + pr = _pipeline( + general={"g1": bad, "g2": good}, + function_selection={"f1": good}, + ) + result = comp._process_pipeline_result(pr) + assert result.decision == SPARCReflectionDecision.REJECT + assert result.score == pytest.approx((2 + 5 + 5) / 3) + # The issue must carry its per-metric rubric info + assert len(result.issues) == 1 + assert result.issues[0].output_value == 2.0 + assert result.issues[0].confidence == pytest.approx(0.8) + + def test_score_none_when_no_semantic_metrics(self): + comp = _build_component() + pr = _pipeline() # no semantic results + result = comp._process_pipeline_result(pr) + assert result.score is None + + def test_function_selection_issue_masks_general(self): + """When function_selection has an issue, general/parameter metrics are + skipped (existing SPARC behavior). 
Aggregate score should only include + what was actually evaluated.""" + comp = _build_component() + fs_bad = _metric(True, 1.0) + # general metrics are present but should be skipped by the masking + pr = _pipeline( + general={"g1": _metric(False, 5.0)}, + function_selection={"f1": fs_bad}, + ) + result = comp._process_pipeline_result(pr) + assert result.decision == SPARCReflectionDecision.REJECT + # Only the function_selection metric's score contributed + assert result.score == pytest.approx(1.0) + assert result.issues[0].issue_type == SPARCReflectionIssueType.SEMANTIC_FUNCTION + assert result.issues[0].output_value == 1.0 + + def test_errored_metric_records_no_score(self): + comp = _build_component() + errored = SimpleNamespace( + is_issue=False, error="llm timeout", raw_response=None + ) + pr = _pipeline(general={"g1": errored}) + result = comp._process_pipeline_result(pr) + assert result.decision == SPARCReflectionDecision.ERROR + assert result.score is None + + +# --------------------------------------------------------------------------- +# Actionable-recommendation extraction (evaluation-time mode) +# --------------------------------------------------------------------------- + + +def _metric_with_recs(is_issue: bool, output: float, recs: list[dict], error: str = ""): + raw = { + "output": output, + "confidence": 0.9, + "explanation": "e", + "correction": None, + "actionable_recommendations": recs, + } + return SimpleNamespace(is_issue=is_issue, raw_response=raw, error=error) + + +class TestRecommendationExtraction: + GOOD_REC = { + "target": "system_prompt", + "tool_name": None, + "parameter_name": None, + "diff": "--- a/system_prompt\n+++ b/system_prompt\n@@\n+Call search before book.", + "rationale": "Prior turn skipped the search step.", + "importance": 0.8, + } + PARAM_REC = { + "target": "parameter_description", + "tool_name": "book_resource", + "parameter_name": "resource_id", + "diff": "--- a/tool/book_resource#resource_id\n+++ b/tool/book_resource#resource_id\n@@\n+Must come from a prior lookup call.", + "rationale": "Agent fabricated ids repeatedly.", + "importance": 0.55, + } + + def test_single_rec_on_issue(self): + comp = _build_component() + bad = _metric_with_recs(True, 2.0, [self.GOOD_REC]) + result = comp._process_pipeline_result(_pipeline(general={"g1": bad})) + assert len(result.issues) == 1 + assert len(result.issues[0].recommendations) == 1 + rec = result.issues[0].recommendations[0] + assert rec.target.value == "system_prompt" + assert rec.importance == 0.8 + assert len(result.all_recommendations) == 1 + + def test_rec_on_non_issue_still_collected(self): + # A grade-5 (no issue) call can still surface spec improvements. + comp = _build_component() + ok = _metric_with_recs(False, 5.0, [self.PARAM_REC]) + result = comp._process_pipeline_result(_pipeline(general={"g1": ok})) + assert result.issues == [] + # all_recommendations must still carry the rec. 
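+        # For orientation, the surviving rec is the PARAM_REC fixture above:
+        # target="parameter_description", tool_name="book_resource",
+        # importance=0.55.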
+ assert len(result.all_recommendations) == 1 + assert result.all_recommendations[0].tool_name == "book_resource" + + def test_multiple_metrics_aggregate_all_recs(self): + comp = _build_component() + bad = _metric_with_recs(True, 2.0, [self.GOOD_REC]) + ok = _metric_with_recs(False, 5.0, [self.PARAM_REC]) + result = comp._process_pipeline_result( + _pipeline(general={"g1": bad, "g2": ok}) + ) + assert len(result.all_recommendations) == 2 + + def test_malformed_rec_is_dropped(self): + comp = _build_component() + bad_rec = {**self.GOOD_REC, "target": "not_a_real_target"} + empty_diff = {**self.GOOD_REC, "diff": ""} + good = _metric_with_recs(True, 2.0, [bad_rec, empty_diff, self.GOOD_REC]) + result = comp._process_pipeline_result(_pipeline(general={"g1": good})) + # Only the one valid rec survives. + assert len(result.all_recommendations) == 1 + + def test_importance_clamped_to_unit_interval(self): + comp = _build_component() + recs = [ + {**self.GOOD_REC, "importance": 1.5}, + {**self.GOOD_REC, "importance": -0.3}, + {**self.GOOD_REC, "importance": "not-numeric"}, + ] + good = _metric_with_recs(True, 2.0, recs) + result = comp._process_pipeline_result(_pipeline(general={"g1": good})) + imps = [r.importance for r in result.all_recommendations] + # 1.5 -> 1.0; -0.3 -> 0.0; "not-numeric" -> 0.5 (fallback). + assert sorted(imps) == pytest.approx([0.0, 0.5, 1.0]) + + def test_no_actionable_recommendations_field_is_safe(self): + # Runtime mode: raw_response has no actionable_recommendations key. + comp = _build_component() + m = SimpleNamespace( + is_issue=False, + raw_response={"output": 5.0, "confidence": 0.9, "explanation": "e"}, + error="", + ) + result = comp._process_pipeline_result(_pipeline(general={"g1": m})) + assert result.all_recommendations == [] diff --git a/tests/pre_tool/sparc/test_runtime_prompts.py b/tests/pre_tool/sparc/test_runtime_prompts.py new file mode 100644 index 00000000..b179e271 --- /dev/null +++ b/tests/pre_tool/sparc/test_runtime_prompts.py @@ -0,0 +1,286 @@ +"""Smoke tests for the shipped metric ``task_description`` strings. + +Goals: + - The runtime JSON files parse and include every configured metric. + - The prompts can be instantiated via the public loader without errors. + - The production-ready rules we baked into the prompts are present + (evidence hierarchy, mid-trajectory awareness, redundancy-by-args, + recovery after failure, confirmation scope, optional-parameter rule). + +This keeps us honest if someone regenerates the JSONs and accidentally +drops the guardrails. 
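+
+For orientation, each guardrail check reduces to a whitespace-normalized
+substring test (illustrative sketch, mirroring ``_contains_all`` below):
+
+    flat = " ".join(task_description.split()).lower()
+    assert "trajectory" in flat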
+""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Dict, Iterable, List + +import pytest + +from altk.pre_tool.core.consts import ( + METRIC_AGENTIC_CONSTRAINTS_SATISFACTION, + METRIC_FUNCTION_SELECTION_APPROPRIATENESS, + METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS, + METRIC_GENERAL_HALLUCINATION_CHECK, + METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT, +) +from altk.pre_tool.sparc.function_calling.metrics.loader import ( + PromptKind, + load_prompts_from_list, +) + + +ROOT = Path(__file__).resolve().parents[3] / "altk/pre_tool/sparc/function_calling/metrics" +GENERAL_JSON = ROOT / "function_call/general_metrics_runtime.json" +FUNCSEL_JSON = ROOT / "function_selection/function_selection_metrics_runtime.json" +PARAM_JSON = ROOT / "parameter/parameter_metrics_runtime.json" + + +def _load(path: Path) -> List[Dict]: + with path.open() as f: + return json.load(f) + + +def _by_name(metrics: Iterable[Dict]) -> Dict[str, Dict]: + return {m["name"]: m for m in metrics} + + +@pytest.fixture(scope="module") +def general_metrics() -> Dict[str, Dict]: + return _by_name(_load(GENERAL_JSON)) + + +@pytest.fixture(scope="module") +def funcsel_metrics() -> Dict[str, Dict]: + return _by_name(_load(FUNCSEL_JSON)) + + +# --------------------------------------------------------------------------- +# Structural checks — the JSONs still contain every canonical metric and +# the loader accepts them. +# --------------------------------------------------------------------------- + + +class TestRuntimeJsonStructure: + def test_general_json_has_expected_metrics(self, general_metrics): + assert set(general_metrics) >= { + METRIC_GENERAL_HALLUCINATION_CHECK, + METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT, + METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS, + } + + def test_funcsel_json_has_expected_metrics(self, funcsel_metrics): + assert set(funcsel_metrics) >= { + METRIC_FUNCTION_SELECTION_APPROPRIATENESS, + METRIC_AGENTIC_CONSTRAINTS_SATISFACTION, + } + + def test_every_metric_has_the_score_field(self, general_metrics, funcsel_metrics): + # All metrics use the integer 1-5 rubric stored at properties.output. + for m in (*general_metrics.values(), *funcsel_metrics.values()): + props = m["jsonschema"]["properties"] + assert "output" in props, f"metric {m['name']} missing output" + assert props["output"]["type"] == "integer" + assert props["output"]["minimum"] == 1 + assert props["output"]["maximum"] == 5 + + def test_loader_accepts_every_general_metric(self): + # Loader must instantiate prompt objects without raising; if the JSON + # becomes malformed this will fail loudly. + prompts = load_prompts_from_list(_load(GENERAL_JSON), PromptKind.GENERAL) + assert len(prompts) == len(_load(GENERAL_JSON)) + + def test_loader_accepts_every_funcsel_metric(self): + prompts = load_prompts_from_list( + _load(FUNCSEL_JSON), PromptKind.FUNCTION_SELECTION + ) + assert len(prompts) == len(_load(FUNCSEL_JSON)) + + +# --------------------------------------------------------------------------- +# Guardrails — key production-ready rules must remain in the task descriptions. +# +# We check each rule by looking for a small, stable anchor phrase. If a prompt +# is reorganized, these strings are the contract that must be preserved in +# spirit — update the anchor here AND the prompt together. +# --------------------------------------------------------------------------- + + +MID_TRAJ_ANCHORS = ( + "trajectory", # "one step in an ongoing trajectory" etc. 
+) + +REDUNDANCY_ANCHORS = ( + "SAME function name AND", # "SAME function name AND SAME arguments" — anywhere + "same arguments", +) + +RECOVERY_ANCHORS = ( + "fallback", # "fallback strategies" in recovery-after-failure text + "returned empty", +) + +CONFIRMATION_ANCHORS = ( + # Confirmation scope is now phrased conceptually: "change persistent + # state" covers the old MUTATING list; "information-gathering" + # covers the old read-only prefix list. + "persistent state", + "information-gathering", +) + +OPTIONAL_PARAM_ANCHORS = ( + "optional", # spec-optional handling for hallucination + "required", +) + +EVIDENCE_ANCHORS = ( + "evidence", # either "evidence-based" or "explicit evidence" +) + + +def _contains_all(text: str, anchors: Iterable[str]) -> bool: + # Normalize any line wrapping/indentation so anchors that span multiple + # words are still detectable. + flat = " ".join(text.split()).lower() + return all(a.lower() in flat for a in anchors) + + +class TestProductionRulesInCommonBlock: + """Shared guardrails now live in ``common_principles.COMMON_PRINCIPLES`` + and are injected into every function-calling metric system prompt via + ``{{ common_principles }}`` (see FunctionMetricsPrompt). Each prompt's + effective system message = common_principles + per-metric task_description, + so the anchors must be present in at least one of those two sources.""" + + @pytest.fixture(scope="class") + def common(self): + from altk.pre_tool.sparc.function_calling.metrics.common_principles import ( + COMMON_PRINCIPLES, + ) + + return COMMON_PRINCIPLES + + def _effective_prompt(self, common: str, td: str) -> str: + return common + "\n\n" + td + + def test_general_hallucination_rules(self, common, general_metrics): + td = general_metrics[METRIC_GENERAL_HALLUCINATION_CHECK]["task_description"] + eff = self._effective_prompt(common, td) + assert _contains_all(eff, EVIDENCE_ANCHORS) + assert _contains_all(eff, OPTIONAL_PARAM_ANCHORS) + assert _contains_all(eff, MID_TRAJ_ANCHORS) + assert _contains_all(eff, RECOVERY_ANCHORS) + + def test_general_value_format_has_optional_carveout(self, common, general_metrics): + td = general_metrics[METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT]["task_description"] + eff = self._effective_prompt(common, td) + assert "optional" in eff.lower() + assert "omitted" in eff.lower() + + def test_general_conversation_rules(self, common, general_metrics): + td = general_metrics[METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS][ + "task_description" + ] + eff = self._effective_prompt(common, td) + assert _contains_all(eff, MID_TRAJ_ANCHORS) + assert _contains_all(eff, REDUNDANCY_ANCHORS) + assert _contains_all(eff, RECOVERY_ANCHORS) + assert _contains_all(eff, CONFIRMATION_ANCHORS) + + def test_function_selection_rules(self, common, funcsel_metrics): + td = funcsel_metrics[METRIC_FUNCTION_SELECTION_APPROPRIATENESS][ + "task_description" + ] + eff = self._effective_prompt(common, td) + assert _contains_all(eff, MID_TRAJ_ANCHORS) + assert _contains_all(eff, REDUNDANCY_ANCHORS) + assert _contains_all(eff, RECOVERY_ANCHORS) + + def test_agentic_constraints_rules(self, common, funcsel_metrics): + td = funcsel_metrics[METRIC_AGENTIC_CONSTRAINTS_SATISFACTION][ + "task_description" + ] + eff = self._effective_prompt(common, td) + assert _contains_all(eff, MID_TRAJ_ANCHORS) + assert _contains_all(eff, REDUNDANCY_ANCHORS) + assert _contains_all(eff, CONFIRMATION_ANCHORS) + + +class TestCommonPrinciplesBlock: + """Independent of which metric reads them, the shared block must carry + all the 
production rules identified in our trace analysis.""" + + @pytest.fixture(scope="class") + def common(self): + from altk.pre_tool.sparc.function_calling.metrics.common_principles import ( + COMMON_PRINCIPLES, + ) + + return COMMON_PRINCIPLES + + def test_has_evidence_hierarchy(self, common): + # Order must be: system > tool outputs > user > assistant + low = common.lower() + assert "system prompt" in low + assert "tool output" in low + assert "user message" in low + assert "assistant message" in low + + def test_has_trajectory_awareness(self, common): + assert "trajectory" in common.lower() + + def test_has_redundancy_by_args(self, common): + # Text is line-wrapped; normalize whitespace before matching. + flat = " ".join(common.split()) + assert "SAME function name" in flat + assert "SAME arguments" in flat + + def test_has_recovery_after_failure(self, common): + low = common.lower() + assert "recovery" in low or "fallback" in low + assert "returned empty" in low or "errors" in low + + def test_confirmation_scope_moved_out_of_common(self, common): + # Confirmation scope now lives in the per-metric task_descriptions + # of agentic_constraints_satisfaction and + # general_conversation_grounded_correctness — it is not a + # universally-applicable rule. Common block must NOT mention it. + assert "persistent state" not in common + assert "Confirmation Scope" not in common + # Guardrail: the old domain-leaky prefix enumeration stays gone. + for banned in ("get_*", "find_*", "search_*", "MUTATING"): + assert banned not in common, f"{banned!r} must not appear in common_principles" + + def test_stringency_moved_out_of_common(self, common): + # Stringency is metric-class-specific and now lives on each + # metric's own task_description. + assert "Stringency" not in common + + def test_has_read_only_exploration_pass(self, common): + low = common.lower() + assert "exploration" in low + assert "approved" in low # "should be APPROVED" + + def test_is_bounded(self, common): + # Shared rules shouldn't balloon past a few thousand chars. + assert len(common) < 3000 + + +class TestPromptSizeIsBounded: + """A weak invariant: task_descriptions shouldn't balloon without review.""" + + MAX_CHARS = 8000 + + def test_general(self, general_metrics): + for name, m in general_metrics.items(): + assert len(m["task_description"]) < self.MAX_CHARS, ( + f"{name} task_description too long ({len(m['task_description'])} chars)" + ) + + def test_funcsel(self, funcsel_metrics): + for name, m in funcsel_metrics.items(): + assert len(m["task_description"]) < self.MAX_CHARS, ( + f"{name} task_description too long ({len(m['task_description'])} chars)" + ) diff --git a/tests/pre_tool/sparc/test_transformation_prompts.py b/tests/pre_tool/sparc/test_transformation_prompts.py new file mode 100644 index 00000000..e4ffe8e3 --- /dev/null +++ b/tests/pre_tool/sparc/test_transformation_prompts.py @@ -0,0 +1,63 @@ +"""Tests for the unit/format-extraction prompts. + +The extractor applies a strict evidence hierarchy and prefers conservative +no-ops over spurious "needs transformation" verdicts. These tests verify +the prompt contains the conceptual anchors that encode those rules — not +domain-specific phrasing. Runtime behaviour is covered by the SPARC +pipeline integration tests (marked ``llm``). 
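+
+For orientation, every check below reduces to a whitespace-normalized
+substring test over a prompt constant, e.g. (illustrative):
+
+    assert "conservative" in _flat(tp.MULTI_EXTRACT_UNITS_SYSTEM)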
+""" + +from altk.pre_tool.sparc.function_calling.pipeline import transformation_prompts as tp + + +def _flat(s: str) -> str: + return " ".join(s.split()).lower() + + +class TestMultiExtractSystem: + def test_mentions_evidence_hierarchy_sources(self): + flat = _flat(tp.MULTI_EXTRACT_UNITS_SYSTEM) + # The extractor must know about the evidence hierarchy's + # higher-priority sources generically, without hard-coding any + # specific anchor kind (date, year, region, …). + assert "system prompt" in flat + assert "tool output" in flat + + def test_has_evidence_priority_ordering(self): + flat = _flat(tp.MULTI_EXTRACT_UNITS_SYSTEM) + assert "system prompt > tool outputs > user messages > assistant messages" in flat + + def test_has_under_specified_grounding_rule(self): + flat = _flat(tp.MULTI_EXTRACT_UNITS_SYSTEM) + # The prompt must teach the extractor to COMPLETE an + # under-specified value from a higher-priority anchor BEFORE + # reporting it — without naming a specific example (year, date, + # region, etc.) in the rule itself. + assert "under-specified" in flat + assert "higher-priority source" in flat or "higher priority source" in flat + assert "before reporting" in flat + + def test_grounded_completion_is_not_a_transformation(self): + flat = _flat(tp.MULTI_EXTRACT_UNITS_SYSTEM) + # The rule should clarify that completing a partial value from + # context is NOT a code-level transformation (keeps + # transformation_summary empty). + assert "transformation_summary" in flat + assert "no code-level" in flat or "no code level" in flat + + def test_conservative_noop_rule(self): + flat = _flat(tp.MULTI_EXTRACT_UNITS_SYSTEM) + assert "conservative" in flat + assert "no-op" in flat or "empty strings" in flat + + +class TestMultiExtractUser: + def test_user_prompt_has_grounding_example(self): + # The user prompt still demonstrates grounding via a concrete + # few-shot — concrete examples in few-shots are expected and + # load-bearing. What matters is that the demonstrated + # transformation_summary is empty (contextual completion ≠ code + # transformation). 
+ body = tp.MULTI_EXTRACT_UNITS_USER + flat = " ".join(body.split()) + assert 'transformation_summary":""' in flat From f00735f696409ca4c037c2cf41d109a8da071b6a Mon Sep 17 00:00:00 2001 From: Osher-Elhadad Date: Wed, 6 May 2026 00:25:47 +0300 Subject: [PATCH 4/7] feat: Format with black Signed-off-by: Osher-Elhadad --- .../python_tool_enrichment/enrichment.py | 24 +- .../enrichment_utils/tool/docstring_utils.py | 4 +- .../enrichment_utils/tool/service.py | 14 +- .../nl_utterance_generation_utils/nlg_util.py | 4 +- .../all_testcase_post_process.py | 4 +- .../test_case_generation_utils/utils.py | 1 + altk/core/llm/output_parser.py | 17 +- .../core/llm_examples/azure_openai_example.py | 1 - .../llm_examples/ibm_watsonx_ai_example.py | 1 - .../llm_examples/litellm_ollama_example.py | 1 - .../llm_examples/litellm_watsonx_example.py | 1 - .../routing/follow_up_detection/follow_up.py | 1 - .../retrieval_augmented_thinking/__init__.py | 1 - .../topic_extractor/cli.py | 1 - altk/pre_llm/spotlight/spotlight.py | 12 +- .../pre_response/policy_guard/core/toolkit.py | 1 - .../policy_guard/detect/task_judge.py | 6 +- .../policy_guard/repair/repair_prompts.py | 12 +- .../policy_guard/repair/repairer.py | 1 - altk/pre_tool/refraction/refraction.py | 1 - altk/pre_tool/refraction/src/__init__.py | 1 - .../refraction/src/integration/__init__.py | 1 - .../sparc/examples/semantic_issues_example.py | 1 - .../sparc/examples/static_issues_example.py | 1 - .../units_conversion_error_example.py | 1 - .../function_calling/metrics/__init__.py | 1 - .../function_calling/pipeline/adapters.py | 1 - .../pipeline/transformation_prompts.py | 30 +-- altk/pre_tool/sparc/sparc.py | 24 +- .../tool_enrichment_custom_test.py | 6 +- .../test_case_generation_custom_test.py | 6 +- tests/core/test_validating_llm_client.py | 14 +- tests/post_tool/ragrepair_test.py | 1 - .../test_wx_orchestrate.py | 250 +++++++++--------- .../milvus/test_topic_retriever.py | 1 - .../milvus/test_topic_sink.py | 1 - .../test_topic_loading_milvus.py | 1 - tests/pre_response/policy_guard_test.py | 7 +- .../sparc/semantic_validation_test.py | 30 +-- .../pre_tool/sparc/static_validation_test.py | 30 +-- .../sparc/test_adapter_compact_schema.py | 34 ++- .../sparc/test_eval_metrics_schema.py | 29 +- .../sparc/test_no_benchmark_leakage.py | 20 +- tests/pre_tool/sparc/test_reflection_score.py | 27 +- tests/pre_tool/sparc/test_runtime_prompts.py | 29 +- .../sparc/test_transformation_prompts.py | 4 +- tests/pre_tool/sparc/units_conversion_test.py | 30 +-- tests/utils/refraction/langgraph/utils.py | 1 - tests/utils/refraction/mellea/prompt.py | 1 - tests/utils/refraction/utils.py | 1 - 50 files changed, 348 insertions(+), 344 deletions(-) diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment.py b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment.py index 8a51dbfd..c6d5e2f3 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment.py @@ -44,18 +44,18 @@ async def enrich_python_tool( ): options = {} options["tool_enrichment"] = {} - options["tool_enrichment"]["enable_tool_description_enrichment"] = ( - enable_tool_description_enrichment - ) - options["tool_enrichment"]["enable_tool_parameter_description_enrichment"] = ( - enable_tool_parameter_description_enrichment - ) - 
options["tool_enrichment"]["enable_tool_return_description_enrichment"] = ( - enable_tool_return_description_enrichment - ) - options["tool_enrichment"]["enable_tool_example_enrichment"] = ( - enable_tool_example_enrichment - ) + options["tool_enrichment"][ + "enable_tool_description_enrichment" + ] = enable_tool_description_enrichment + options["tool_enrichment"][ + "enable_tool_parameter_description_enrichment" + ] = enable_tool_parameter_description_enrichment + options["tool_enrichment"][ + "enable_tool_return_description_enrichment" + ] = enable_tool_return_description_enrichment + options["tool_enrichment"][ + "enable_tool_example_enrichment" + ] = enable_tool_example_enrichment return await self.do_enrichment( input_file_contents=python_tool_str, options=options ) diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/docstring_utils.py b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/docstring_utils.py index 1c9c38fe..e052dce8 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/docstring_utils.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/docstring_utils.py @@ -657,9 +657,7 @@ def extract_function_names_with_decorators( decorators.append(decorator.func.id) elif isinstance(decorator.func, ast.Attribute): # Handle decorators like @abc.decorator4() - decoratorname = ( - f"{decorator.func.value.id}.{decorator.func.attr}" # type: ignore - ) + decoratorname = f"{decorator.func.value.id}.{decorator.func.attr}" # type: ignore decorators.append(decoratorname) elif isinstance(decorator, ast.Attribute): # Handle decorators like @abc.decorator4 (without parentheses) diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/service.py b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/service.py index bf1f85c4..8e0adcef 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/service.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/service.py @@ -176,7 +176,9 @@ def enrich(enrichment_config: ToolEnrichmentConfig): else: prompts_dir = prompts_dir + "restofcode_based_prompts/" - if enrichment_config.input_details.options.enable_tool_description_enrichment: + if ( + enrichment_config.input_details.options.enable_tool_description_enrichment + ): prompt_file_template = os.path.join( script_dir, prompts_dir, "toolops_description_via_code.txt" ) @@ -185,7 +187,9 @@ def enrich(enrichment_config: ToolEnrichmentConfig): ) result[cnst.TOOL_DESCRIPTION_ENRICHMENT] = generated_description - if enrichment_config.input_details.options.enable_tool_return_description_enrichment: + if ( + enrichment_config.input_details.options.enable_tool_return_description_enrichment + ): prompt_file_template = os.path.join( script_dir, prompts_dir, "toolops_return_description_via_code.txt" ) @@ -262,9 +266,9 @@ def enrich(enrichment_config: ToolEnrichmentConfig): key not in generated_input_examples[cnst.LABEL_ENRICHED_EXAMPLES] ): - generated_input_examples[cnst.LABEL_ENRICHED_EXAMPLES][key] = ( - value - ) + generated_input_examples[cnst.LABEL_ENRICHED_EXAMPLES][ + key + ] = 
value result[cnst.TOOL_EXAMPLES_ENRICHMENT] = generated_input_examples # else: diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/nl_utterance_generation/nl_utterance_generation_utils/nlg_util.py b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/nl_utterance_generation/nl_utterance_generation_utils/nlg_util.py index 6c2b8868..89798b63 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/nl_utterance_generation/nl_utterance_generation_utils/nlg_util.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/nl_utterance_generation/nl_utterance_generation_utils/nlg_util.py @@ -9,8 +9,7 @@ ) -preamble = ( - 'Given an API specification and an input payload, along with a sample utterance, your task as an expert user is to \ +preamble = 'Given an API specification and an input payload, along with a sample utterance, your task as an expert user is to \ create multiple, distinct human-like sentences that convey the same information accurately. Each paraphrased sentence \ should:\n\n1. Maintain Fluency: Ensure the sentences are natural and conversational, avoiding robotic or overly formal \ language.\n - Good Example: "Can you book a flight from Delhi to NYC on the 15th of July?"\n - Bad Example: \ @@ -20,7 +19,6 @@ booking IDs, transaction IDs, etc.). Use them exactly as provided in the user utterance.\n\n4. No Introductory Text: Do not \ include lines like “Here are the paraphrases” or similar — output only the paraphrased sentences.\n\nPlease generate the paraphrased \ sentences based on the following\n\n API specification:' -) def is_invalid_utterance(utterance): diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/all_testcase_post_process.py b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/all_testcase_post_process.py index 902ec1cb..0e6d4f73 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/all_testcase_post_process.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/all_testcase_post_process.py @@ -234,7 +234,9 @@ def all_testcase_postprocess( "operation" ][operation_type][param_type][ specific_param - ]["type"] + ][ + "type" + ] param_found_flag = True break if param_found_flag: diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/utils.py b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/utils.py index 487b76a5..88681ffb 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/utils.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/utils.py @@ -1,5 +1,6 @@ import json import logging + # import sys # import os diff --git a/altk/core/llm/output_parser.py b/altk/core/llm/output_parser.py index 8d47f88e..d5a9fc1b 100644 --- a/altk/core/llm/output_parser.py +++ b/altk/core/llm/output_parser.py @@ -73,7 +73,11 @@ def parse_type( prop_schema: Dict[str, Any], ) -> Type[T]: def _lookup(t: str) -> Type: - return _map_object_for_prop(prop_schema) if t == "object" else type_mapping.get(t, Any) + return ( + 
_map_object_for_prop(prop_schema) + if t == "object" + else type_mapping.get(t, Any) + ) if isinstance(type_def, list): python_types = [_lookup(t) for t in type_def] @@ -222,13 +226,10 @@ def _safe_parse(raw): # noqa: ANN001, ANN202 _msg = getattr(c0, "message", None) or ( c0.get("message", {}) if isinstance(c0, dict) else {} ) - _reasoning = ( - getattr(_msg, "reasoning_content", None) - or ( - _msg.get("reasoning_content") - if isinstance(_msg, dict) - else None - ) + _reasoning = getattr(_msg, "reasoning_content", None) or ( + _msg.get("reasoning_content") + if isinstance(_msg, dict) + else None ) _finish = getattr(c0, "finish_reason", None) or ( c0.get("finish_reason") if isinstance(c0, dict) else None diff --git a/altk/core/llm_examples/azure_openai_example.py b/altk/core/llm_examples/azure_openai_example.py index c1feb6b5..55c756a5 100644 --- a/altk/core/llm_examples/azure_openai_example.py +++ b/altk/core/llm_examples/azure_openai_example.py @@ -6,7 +6,6 @@ from altk.core.llm import get_llm, GenerationMode from altk.core.llm.types import GenerationArgs - # ────────────────────────────────────────────────────────────────────────────── # 1. Define schemas for structured output # ────────────────────────────────────────────────────────────────────────────── diff --git a/altk/core/llm_examples/ibm_watsonx_ai_example.py b/altk/core/llm_examples/ibm_watsonx_ai_example.py index aa7390ad..04ea155d 100644 --- a/altk/core/llm_examples/ibm_watsonx_ai_example.py +++ b/altk/core/llm_examples/ibm_watsonx_ai_example.py @@ -5,7 +5,6 @@ from altk.core.llm import get_llm, GenerationMode from altk.core.llm.types import GenerationArgs - # ────────────────────────────────────────────────────────────────────────────── # 1. Define schemas for structured output # ────────────────────────────────────────────────────────────────────────────── diff --git a/altk/core/llm_examples/litellm_ollama_example.py b/altk/core/llm_examples/litellm_ollama_example.py index 9e35b2ce..1c4f9e51 100644 --- a/altk/core/llm_examples/litellm_ollama_example.py +++ b/altk/core/llm_examples/litellm_ollama_example.py @@ -5,7 +5,6 @@ from altk.core.llm import get_llm from altk.core.llm.types import GenerationArgs - # ────────────────────────────────────────────────────────────────────────────── # 1. Define schemas for structured output # ────────────────────────────────────────────────────────────────────────────── diff --git a/altk/core/llm_examples/litellm_watsonx_example.py b/altk/core/llm_examples/litellm_watsonx_example.py index 174d6759..96c1b92e 100644 --- a/altk/core/llm_examples/litellm_watsonx_example.py +++ b/altk/core/llm_examples/litellm_watsonx_example.py @@ -5,7 +5,6 @@ from altk.core.llm import get_llm from altk.core.llm.types import GenerationArgs - # ────────────────────────────────────────────────────────────────────────────── # 1. 
Define schemas for structured output # ────────────────────────────────────────────────────────────────────────────── diff --git a/altk/pre_llm/routing/follow_up_detection/follow_up.py b/altk/pre_llm/routing/follow_up_detection/follow_up.py index e748e554..d76b8703 100644 --- a/altk/pre_llm/routing/follow_up_detection/follow_up.py +++ b/altk/pre_llm/routing/follow_up_detection/follow_up.py @@ -18,7 +18,6 @@ ) from langchain_core.messages import BaseMessage, HumanMessage, AIMessage - logger = logging.getLogger(__name__) FOLLOW_UP_PROMPT = [ diff --git a/altk/pre_llm/routing/retrieval_augmented_thinking/__init__.py b/altk/pre_llm/routing/retrieval_augmented_thinking/__init__.py index 2d7c9241..ab4943e3 100644 --- a/altk/pre_llm/routing/retrieval_augmented_thinking/__init__.py +++ b/altk/pre_llm/routing/retrieval_augmented_thinking/__init__.py @@ -1,6 +1,5 @@ import logging - logger = logging.getLogger(__name__) # import built-in TopicRetriever implementations to force their registering try: diff --git a/altk/pre_llm/routing/retrieval_augmented_thinking/topic_extractor/cli.py b/altk/pre_llm/routing/retrieval_augmented_thinking/topic_extractor/cli.py index a21e149f..196e78da 100644 --- a/altk/pre_llm/routing/retrieval_augmented_thinking/topic_extractor/cli.py +++ b/altk/pre_llm/routing/retrieval_augmented_thinking/topic_extractor/cli.py @@ -13,7 +13,6 @@ run_topic_extractions, ) - logger = logging.getLogger(__name__) diff --git a/altk/pre_llm/spotlight/spotlight.py b/altk/pre_llm/spotlight/spotlight.py index 8a4be492..4d05d82e 100644 --- a/altk/pre_llm/spotlight/spotlight.py +++ b/altk/pre_llm/spotlight/spotlight.py @@ -182,9 +182,9 @@ def get_span_range( elif isinstance(emph_strings[0], str): emph_strings = [[s] for s in emph_strings] - assert len(prompts) == len(emph_strings), ( - "Mismatch prompts ↔ emphasised strings" - ) + assert len(prompts) == len( + emph_strings + ), "Mismatch prompts ↔ emphasised strings" span_ranges_per_sample = [] for prompt, span_list, offsets in zip(prompts, emph_strings, offset_mappings): @@ -355,9 +355,9 @@ def _run(self, data: SpotLightRunInput) -> SpotLightRunOutput: messages, tokenize=False, add_generation_prompt=True ) ] - assert len(emph_strings) == len(chat), ( - "Need 1:1 mapping between number of prompts and emph_strings per prompt" - ) + assert len(emph_strings) == len( + chat + ), "Need 1:1 mapping between number of prompts and emph_strings per prompt" inputs, offset_mappings, tokenized_inp_len = self.tokenize_inputs( chat, self.tokenizer diff --git a/altk/pre_response/policy_guard/core/toolkit.py b/altk/pre_response/policy_guard/core/toolkit.py index 4e16c18c..27f872e5 100644 --- a/altk/pre_response/policy_guard/core/toolkit.py +++ b/altk/pre_response/policy_guard/core/toolkit.py @@ -4,7 +4,6 @@ from altk.core.toolkit import ComponentInput, ComponentOutput - ######### Policy Guard Middleware Interfaces ############## diff --git a/altk/pre_response/policy_guard/detect/task_judge.py b/altk/pre_response/policy_guard/detect/task_judge.py index 6e7e45e9..19eb3763 100644 --- a/altk/pre_response/policy_guard/detect/task_judge.py +++ b/altk/pre_response/policy_guard/detect/task_judge.py @@ -28,7 +28,8 @@ def create_adherence_check_report(results: list[dict]) -> dict: class TaskJudge: def __init__(self, config: ComponentConfig): self.config = config - self.task_completion_prompt = Template("""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + self.task_completion_prompt = Template( + """<|begin_of_text|><|start_header_id|>system<|end_header_id|> You 
are a grader whose job is to determine if a response is a valid response to a query. Your score should be "Yes" or "No". If a response is ambiguous your score should be "Yes". The score should only be "No" if the response is definitely not a valid response.<|eot_id|> <|start_header_id|>user<|end_header_id|> @@ -47,7 +48,8 @@ def __init__(self, config: ComponentConfig): } Return ONLY one JSON object, nothing else. Do not include any additional text or explanations outside the JSON object.<|eot_id|> - <|start_header_id|>assistant<|end_header_id|>""") + <|start_header_id|>assistant<|end_header_id|>""" + ) def check_task_completion(self, task: str, response: str): prompt = self.task_completion_prompt.safe_substitute( diff --git a/altk/pre_response/policy_guard/repair/repair_prompts.py b/altk/pre_response/policy_guard/repair/repair_prompts.py index e96f2adf..c6abc0ed 100644 --- a/altk/pre_response/policy_guard/repair/repair_prompts.py +++ b/altk/pre_response/policy_guard/repair/repair_prompts.py @@ -375,15 +375,13 @@ """ ) -mapreduce_guideline_response_template = Template( - """ +mapreduce_guideline_response_template = Template(""" Guideline: ${guideline} Response: ${response} -""" -) +""") def simple_single_repair_prompt(model_id: str, text: str, policy: str) -> str: @@ -567,9 +565,9 @@ def priority_repair_prompt( def ordered_repair_prompt( text: str, policies: list[str], weights: list[float] = None, ranks: list[int] = None ) -> str: - assert not (weights is None and ranks is None), ( - "One of weights or ranks must be set. " - ) + assert not ( + weights is None and ranks is None + ), "One of weights or ranks must be set. " if weights is not None: # Sort from highest to lowest weight sorted_policies = [x for _, x in sorted(zip(weights, policies), reverse=True)] diff --git a/altk/pre_response/policy_guard/repair/repairer.py b/altk/pre_response/policy_guard/repair/repairer.py index 8b26eefe..e5133675 100644 --- a/altk/pre_response/policy_guard/repair/repairer.py +++ b/altk/pre_response/policy_guard/repair/repairer.py @@ -22,7 +22,6 @@ PolicyDetectorInput, ) - BATCH_REPAIR_NAME = "batch_policy_llm_repairer" ITERATIVE_REPAIR_NAME = "iterative_llm_repairer" RETRY_REPAIR_NAME = "retry_llm_repairer" diff --git a/altk/pre_tool/refraction/refraction.py b/altk/pre_tool/refraction/refraction.py index f192f7ef..7fba06eb 100644 --- a/altk/pre_tool/refraction/refraction.py +++ b/altk/pre_tool/refraction/refraction.py @@ -27,7 +27,6 @@ PromptType, ) - logger = logging.getLogger(__name__) diff --git a/altk/pre_tool/refraction/src/__init__.py b/altk/pre_tool/refraction/src/__init__.py index 4cf02f07..424e1db3 100644 --- a/altk/pre_tool/refraction/src/__init__.py +++ b/altk/pre_tool/refraction/src/__init__.py @@ -8,7 +8,6 @@ generate_prompt, ) - __all__ = [ "refract", "refract_sequence_object", diff --git a/altk/pre_tool/refraction/src/integration/__init__.py b/altk/pre_tool/refraction/src/integration/__init__.py index 82e85765..a6346d57 100644 --- a/altk/pre_tool/refraction/src/integration/__init__.py +++ b/altk/pre_tool/refraction/src/integration/__init__.py @@ -5,7 +5,6 @@ refract, ) - __all__ = [ "Refractor", "refract", diff --git a/altk/pre_tool/sparc/examples/semantic_issues_example.py b/altk/pre_tool/sparc/examples/semantic_issues_example.py index b1d08917..377ea65a 100644 --- a/altk/pre_tool/sparc/examples/semantic_issues_example.py +++ b/altk/pre_tool/sparc/examples/semantic_issues_example.py @@ -13,7 +13,6 @@ from altk.core.toolkit import AgentPhase, ComponentConfig from altk.core.llm import get_llm - 
# Example tool specifications for mixed-domain scenarios MULTI_DOMAIN_TOOL_SPECS = [ { diff --git a/altk/pre_tool/sparc/examples/static_issues_example.py b/altk/pre_tool/sparc/examples/static_issues_example.py index 929ef801..579577a7 100644 --- a/altk/pre_tool/sparc/examples/static_issues_example.py +++ b/altk/pre_tool/sparc/examples/static_issues_example.py @@ -12,7 +12,6 @@ from altk.core.toolkit import AgentPhase, ComponentConfig from altk.core.llm import get_llm - # Example tool specifications EMAIL_TOOL_SPECS = [ { diff --git a/altk/pre_tool/sparc/examples/units_conversion_error_example.py b/altk/pre_tool/sparc/examples/units_conversion_error_example.py index 3b974fb0..85db0077 100644 --- a/altk/pre_tool/sparc/examples/units_conversion_error_example.py +++ b/altk/pre_tool/sparc/examples/units_conversion_error_example.py @@ -13,7 +13,6 @@ from altk.core.toolkit import AgentPhase, ComponentConfig from altk.core.llm import get_llm - # Example tool specifications WEATHER_TOOL_SPECS = [ { diff --git a/altk/pre_tool/sparc/function_calling/metrics/__init__.py b/altk/pre_tool/sparc/function_calling/metrics/__init__.py index 7f3da8cd..624c3f2b 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/__init__.py +++ b/altk/pre_tool/sparc/function_calling/metrics/__init__.py @@ -19,7 +19,6 @@ get_parameter_metrics_prompt, ) - __all__ = [ "get_general_metrics_prompt", "GeneralMetricsPrompt", diff --git a/altk/pre_tool/sparc/function_calling/pipeline/adapters.py b/altk/pre_tool/sparc/function_calling/pipeline/adapters.py index 4b7257bb..bf7e6ebd 100644 --- a/altk/pre_tool/sparc/function_calling/pipeline/adapters.py +++ b/altk/pre_tool/sparc/function_calling/pipeline/adapters.py @@ -10,7 +10,6 @@ ToolSpec, ) - CompactMode = Literal["auto", "never", "always"] """Compact tool-inventory rendering mode. diff --git a/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py b/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py index e205f44f..44e4b819 100644 --- a/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py +++ b/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py @@ -399,8 +399,7 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: " return int(transformed_value)\n" ) -transformation_eval_example1 = ( - """ +transformation_eval_example1 = """ ### Example 1: TRANSFORMATION SUMMARY: Convert a date string with the format 'month day, year' to a unix timestamp, e.g., 'December 1st, 2024' converted to a unix timestamp. @@ -411,12 +410,7 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: TRANSFORMED TYPE: int RESPONSE: -{{""" - + '"generated_code": "' - + generated_code_example1 - + '"' - + """}}""" -) +{{""" + '"generated_code": "' + generated_code_example1 + '"' + """}}""" generated_code_example2 = ( "def transformation_code(input_value: str) -> float:\n" @@ -445,8 +439,7 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: " return float(transformed_value.strip())\n" ) -transformation_eval_example2 = ( - """ +transformation_eval_example2 = """ ### Example 2: TRANSFORMATION SUMMARY: Convert a string in milliseconds to seconds, e.g., '1000' (milliseconds) divided by 1000 to be in seconds. 
@@ -457,12 +450,7 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: TRANSFORMED TYPE: float RESPONSE: -{{""" - + '"generated_code": "' - + generated_code_example2 - + '"' - + """}}""" -) +{{""" + '"generated_code": "' + generated_code_example2 + '"' + """}}""" generated_code_example3 = ( "def transformation_code(input_value: str) -> list[float]:\n" @@ -497,8 +485,7 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: " return [float(transformed_value)]\n" ) -transformation_eval_example3 = ( - """ +transformation_eval_example3 = """ ### Example 3: TRANSFORMATION SUMMARY: Convert a temperature string in Celsius to Kelvin, e.g., '25' (Celsius) added 273.15 to be in Kelvin. @@ -509,12 +496,7 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: TRANSFORMED TYPE: list RESPONSE: -{{""" - + '"generated_code": "' - + generated_code_example3 - + '"' - + """}}""" -) +{{""" + '"generated_code": "' + generated_code_example3 + '"' + """}}""" transformation_eval_example4 = """ ### Unsupported Transformation Example: diff --git a/altk/pre_tool/sparc/sparc.py b/altk/pre_tool/sparc/sparc.py index 010123ee..605324f8 100644 --- a/altk/pre_tool/sparc/sparc.py +++ b/altk/pre_tool/sparc/sparc.py @@ -428,7 +428,9 @@ def _extract_recommendations(metric_result): rationale = item.get("rationale") or item.get("details") or "" importance_raw = item.get("importance") try: - importance = float(importance_raw) if importance_raw is not None else 0.5 + importance = ( + float(importance_raw) if importance_raw is not None else 0.5 + ) except (TypeError, ValueError): importance = 0.5 # Clamp into [0, 1]. @@ -516,7 +518,9 @@ def _process_pipeline_result( correction=metric_result.raw_response.get("correction"), output_value=self._rubric_score(metric_result), confidence=self._rubric_confidence(metric_result), - recommendations=self._extract_recommendations(metric_result), + recommendations=self._extract_recommendations( + metric_result + ), ) ) # Always record the rubric score (issue or not) so the @@ -565,7 +569,9 @@ def _process_pipeline_result( ), output_value=self._rubric_score(metric_result), confidence=self._rubric_confidence(metric_result), - recommendations=self._extract_recommendations(metric_result), + recommendations=self._extract_recommendations( + metric_result + ), ) ) s = self._rubric_score(metric_result) @@ -609,14 +615,20 @@ def _process_pipeline_result( "correction" ), output_value=self._rubric_score(metric_result), - confidence=self._rubric_confidence(metric_result), - recommendations=self._extract_recommendations(metric_result), + confidence=self._rubric_confidence( + metric_result + ), + recommendations=self._extract_recommendations( + metric_result + ), ) ) s = self._rubric_score(metric_result) if s is not None: per_metric_scores.append(s) - all_recs.extend(self._extract_recommendations(metric_result)) + all_recs.extend( + self._extract_recommendations(metric_result) + ) # Transform results - check for errors and corrections if pipeline_result.semantic.transform: diff --git a/tests/build_time/tool_enrichment_toolkit/tool_enrichment_custom_test.py b/tests/build_time/tool_enrichment_toolkit/tool_enrichment_custom_test.py index 16ecb8ba..088dcf8e 100644 --- a/tests/build_time/tool_enrichment_toolkit/tool_enrichment_custom_test.py +++ b/tests/build_time/tool_enrichment_toolkit/tool_enrichment_custom_test.py @@ -28,8 +28,7 @@ def test_tool_enrichment_with_toolkit_interface(): PythonToolEnrichConfig, ) - tool_enrich_input = 
PythonToolEnrichBuildInput( - python_tool='''import requests + tool_enrich_input = PythonToolEnrichBuildInput(python_tool='''import requests from typing import Optional, Dict, Any from langchain_core.tools import tool from model_utils import load_github_token @@ -49,8 +48,7 @@ def listIssues(owner: str, repo: str, requestBody: Optional[Dict[str, Any]] = No response = requests.get(url, headers=headers, params=requestBody or {}) response.raise_for_status() - return response.json()''' - ) + return response.json()''') config = PythonToolEnrichConfig( llm_client=get_llm_client_obj(model_name="mistralai/mistral-medium-2505"), gen_mode=GenerationMode.TEXT, diff --git a/tests/build_time/toolops_test_case_generation_toolkit/test_case_generation_custom_test.py b/tests/build_time/toolops_test_case_generation_toolkit/test_case_generation_custom_test.py index 5a85d837..07e6c7e5 100644 --- a/tests/build_time/toolops_test_case_generation_toolkit/test_case_generation_custom_test.py +++ b/tests/build_time/toolops_test_case_generation_toolkit/test_case_generation_custom_test.py @@ -22,8 +22,7 @@ def get_llm_client_obj(model_name="mistralai/mistral-medium-2505"): def test_case_generation_with_toolkit_interface(): - test_case_gen_input = TestCaseGenBuildInput( - python_tool_str='''import requests + test_case_gen_input = TestCaseGenBuildInput(python_tool_str='''import requests from typing import Optional, Dict, Any from langchain_core.tools import tool from model_utils import load_github_token @@ -43,8 +42,7 @@ def listIssues(owner: str, repo: str, requestBody: Optional[Dict[str, Any]] = No response = requests.get(url, headers=headers, params=requestBody or {}) response.raise_for_status() - return response.json()''' - ) + return response.json()''') config = TestCaseGenConfig( llm_client=get_llm_client_obj(model_name="mistralai/mistral-medium-2505"), gen_mode=GenerationMode.TEXT, diff --git a/tests/core/test_validating_llm_client.py b/tests/core/test_validating_llm_client.py index 452e0347..de8ecd93 100644 --- a/tests/core/test_validating_llm_client.py +++ b/tests/core/test_validating_llm_client.py @@ -29,7 +29,6 @@ relax_freeform_object_schema, ) - # --------------------------------------------------------------------------- # Dummy client used throughout — no actual LLM call. # --------------------------------------------------------------------------- @@ -192,18 +191,14 @@ def test_strict_rejects_json_string_for_object_field(self): c._validate('{"a": "{\\"k\\": 1}"}', self._schema) def test_relaxed_accepts_json_string_for_object_field(self): - c = _FakeValidating( - free_form_object_as_str=True, client=object() - ) + c = _FakeValidating(free_form_object_as_str=True, client=object()) # The LLM returned {"a": ""} — still # valid with relaxed schema. 
got = c._validate('{"a": "arbitrary JSON-ish"}', self._schema) assert got == {"a": "arbitrary JSON-ish"} def test_relaxed_still_accepts_normal_object(self): - c = _FakeValidating( - free_form_object_as_str=True, client=object() - ) + c = _FakeValidating(free_form_object_as_str=True, client=object()) assert c._validate('{"a": {"k": 1}}', self._schema) == {"a": {"k": 1}} @@ -237,7 +232,9 @@ def test_defaults_applied_when_caller_does_not_set(self, monkeypatch): default_generation_kwargs={"max_tokens": 123, "temperature": 0.0}, client=object(), ) - c.generate([], schema={"type": "object", "properties": {"a": {"type": "string"}}}) + c.generate( + [], schema={"type": "object", "properties": {"a": {"type": "string"}}} + ) obs = observed[-1] assert obs["max_tokens"] == 123 assert obs["temperature"] == 0.0 @@ -286,6 +283,7 @@ def test_reasoning_budget_warning_logged(self, caplog): } ] } + # Wrap raw in a class whose attribute-access fails, forcing ValueError. class _FailingParse: def __init__(self, payload): diff --git a/tests/post_tool/ragrepair_test.py b/tests/post_tool/ragrepair_test.py index eba593a4..af254a24 100644 --- a/tests/post_tool/ragrepair_test.py +++ b/tests/post_tool/ragrepair_test.py @@ -15,7 +15,6 @@ RAGRepairBuildInput, ) - load_dotenv() diff --git a/tests/pre_llm/routing/follow_up_detection/test_wx_orchestrate.py b/tests/pre_llm/routing/follow_up_detection/test_wx_orchestrate.py index 7f47ed1c..61b8e67f 100644 --- a/tests/pre_llm/routing/follow_up_detection/test_wx_orchestrate.py +++ b/tests/pre_llm/routing/follow_up_detection/test_wx_orchestrate.py @@ -23,69 +23,71 @@ def test_basic_conversation(caplog, llm_client): config=ComponentConfig(llm_client=llm_client), follow_up_detector=WxOrchestrateFollowUpDetector(), ) - follow_up_detection_result: FollowUpDetectionRunOutput = follow_up_detection.process( - data=FollowUpDetectionRunInput[WxOrchestrateMessages]( - conversation_history=[ - { - "role": "user", - "content": [ - { - "response_type": "text", - "text": "how do i make an opportunity", - } - ], - }, - { - "role": "assistant", - "content": [ - { - "response_type": "text", - "text": "\n\n### Creating an Opportunity in IBM Sales Cloud (ISC)\n\nTo create an opportunity in IBM Sales Cloud (ISC), follow these steps:\n\n1. **Create a new opportunity**: Click on 'New' in the 'My Opportunities' section.\n2. **Fill in the necessary details**: Enter the opportunity name, select the account, add the sales stage, and input the closing date. Click save.\n3. **Add a product**: Scroll down to the 'Products' section and click 'Add Products.' Search for the product in the catalog, ensuring it's part of the SW Services product family.\n4. **Complete product details**: Enter the estimated total revenue, term, classification, and type code.\n5. **Add 'Data Observability by Databand' product**: Search for 'Data Observability by Databand' and select either 'SW Subscription' or 'SaaS' depending on the customer scenario.\n6. **Complete additional features**: In the 'Additional Features' section, select 'Data Observability.'\n7. 
**Add opportunity team members**: For an Americas opportunity, add Rodney Cobb; for EMEA, add Sonia Batanero; for APAC and Japan, add Brian Mayer.\n\n\nFor more detailed instructions and variations based on specific products or scenarios, refer to the following resources:\n* [Creating a Databand Opportunity in ISC](https://ibm.seismic.com/Link/Content/DCqmJDP6HVW7pGCFp887M9W7D8mV)\n* [Input Offering Type in ISC -transcript - 2023-Aug-28](https://ibm.seismic.com/Link/Content/DCWPJQd2M2q6m89JFM2HQ6PW9C78)", - "format": {"use_padding": True}, - "streaming_metadata": { - "id": 1, - "stream_id": "72c986e2-9de7-4811-bb0f-04658374a712", + follow_up_detection_result: FollowUpDetectionRunOutput = ( + follow_up_detection.process( + data=FollowUpDetectionRunInput[WxOrchestrateMessages]( + conversation_history=[ + { + "role": "user", + "content": [ + { + "response_type": "text", + "text": "how do i make an opportunity", + } + ], + }, + { + "role": "assistant", + "content": [ + { + "response_type": "text", + "text": "\n\n### Creating an Opportunity in IBM Sales Cloud (ISC)\n\nTo create an opportunity in IBM Sales Cloud (ISC), follow these steps:\n\n1. **Create a new opportunity**: Click on 'New' in the 'My Opportunities' section.\n2. **Fill in the necessary details**: Enter the opportunity name, select the account, add the sales stage, and input the closing date. Click save.\n3. **Add a product**: Scroll down to the 'Products' section and click 'Add Products.' Search for the product in the catalog, ensuring it's part of the SW Services product family.\n4. **Complete product details**: Enter the estimated total revenue, term, classification, and type code.\n5. **Add 'Data Observability by Databand' product**: Search for 'Data Observability by Databand' and select either 'SW Subscription' or 'SaaS' depending on the customer scenario.\n6. **Complete additional features**: In the 'Additional Features' section, select 'Data Observability.'\n7. 
**Add opportunity team members**: For an Americas opportunity, add Rodney Cobb; for EMEA, add Sonia Batanero; for APAC and Japan, add Brian Mayer.\n\n\nFor more detailed instructions and variations based on specific products or scenarios, refer to the following resources:\n* [Creating a Databand Opportunity in ISC](https://ibm.seismic.com/Link/Content/DCqmJDP6HVW7pGCFp887M9W7D8mV)\n* [Input Offering Type in ISC -transcript - 2023-Aug-28](https://ibm.seismic.com/Link/Content/DCWPJQd2M2q6m89JFM2HQ6PW9C78)", + "format": {"use_padding": True}, + "streaming_metadata": { + "id": 1, + "stream_id": "72c986e2-9de7-4811-bb0f-04658374a712", + }, }, - }, - { - "response_type": "user_defined", - "user_defined": { - "user_defined_type": "sales-assets-rag", - "is_everyone_social_enabled": False, + { + "response_type": "user_defined", + "user_defined": { + "user_defined_type": "sales-assets-rag", + "is_everyone_social_enabled": False, + }, + "repeat_on_reprompt": False, + "streaming_metadata": {"id": 2}, }, - "repeat_on_reprompt": False, - "streaming_metadata": {"id": 2}, - }, - { - "response_type": "user_defined", - "user_defined": { - "no_label": "👎🏼", - "yes_label": "👍🏼", - "issue_options": [ - "Partial Answer", - "Wrong No Source", - "Format Style Issues", - "Incorrect Response", - "Expected Tabular Response", - "Other Issues", - ], - "place_holder_text": "", - "user_defined_type": "custom_feedback", - "custom_user_defined_type": "custom_feedback", - "text_on_negative_feedback": "Thank you for your feedback!", - "text_on_positive_feedback": "Thank you for your feedback!", - "slack_notification_channel": "ibmSales", - "slack_negative_greeting_message": "Dear SMEs \n\n Please review the following interaction to identify potential areas for improvement", - "slack_positive_greeting_message": "Hooray :party_1:, user liked the assistant response :tada:", + { + "response_type": "user_defined", + "user_defined": { + "no_label": "👎🏼", + "yes_label": "👍🏼", + "issue_options": [ + "Partial Answer", + "Wrong No Source", + "Format Style Issues", + "Incorrect Response", + "Expected Tabular Response", + "Other Issues", + ], + "place_holder_text": "", + "user_defined_type": "custom_feedback", + "custom_user_defined_type": "custom_feedback", + "text_on_negative_feedback": "Thank you for your feedback!", + "text_on_positive_feedback": "Thank you for your feedback!", + "slack_notification_channel": "ibmSales", + "slack_negative_greeting_message": "Dear SMEs \n\n Please review the following interaction to identify potential areas for improvement", + "slack_positive_greeting_message": "Hooray :party_1:, user liked the assistant response :tada:", + }, + "streaming_metadata": {"id": 3}, }, - "streaming_metadata": {"id": 3}, - }, - ], - }, - ], - user_query="Partial Answer", - ), - phase=AgentPhase.RUNTIME, + ], + }, + ], + user_query="Partial Answer", + ), + phase=AgentPhase.RUNTIME, + ) ) assert follow_up_detection_result.is_follow_up assert follow_up_detection_result.metadata.get("detection_type") == "llm" @@ -98,68 +100,76 @@ def test_conversation_with_mention_to_assistant(caplog, llm_client): config=ComponentConfig(llm_client=llm_client), follow_up_detector=WxOrchestrateFollowUpDetector(), ) - follow_up_detection_result: FollowUpDetectionRunOutput = follow_up_detection.process( - data=FollowUpDetectionRunInput[WxOrchestrateMessages]( - conversation_history=[ - { - "role": "user", - "content": [ - { - "response_type": "text", - "text": "what is ibm water policy", - } - ], - }, - { - "role": "assistant", - "content": [ - { 
- "response_type": "text", - "text": "Thanks!\xa0Before I answer, it would be helpful for me if you could share some of the below information about the deal.", - }, - { - "response_type": "text", - "text": "Which of the below can you share?", - "repeat_on_reprompt": False, - }, - { - "response_type": "option", - "options": [ - { - "label": "ISC Opportunity ID", - "value": {"input": {"text": "ISC Opportunity ID"}}, - }, - { - "label": "ISC Account ID", - "value": {"input": {"text": "ISC Account ID"}}, - }, - { - "label": "Domestic Buying Group ID", - "value": { - "input": {"text": "Domestic Buying Group ID"} + follow_up_detection_result: FollowUpDetectionRunOutput = ( + follow_up_detection.process( + data=FollowUpDetectionRunInput[WxOrchestrateMessages]( + conversation_history=[ + { + "role": "user", + "content": [ + { + "response_type": "text", + "text": "what is ibm water policy", + } + ], + }, + { + "role": "assistant", + "content": [ + { + "response_type": "text", + "text": "Thanks!\xa0Before I answer, it would be helpful for me if you could share some of the below information about the deal.", + }, + { + "response_type": "text", + "text": "Which of the below can you share?", + "repeat_on_reprompt": False, + }, + { + "response_type": "option", + "options": [ + { + "label": "ISC Opportunity ID", + "value": { + "input": {"text": "ISC Opportunity ID"} + }, }, - }, - { - "label": "Domestic Client ID", - "value": {"input": {"text": "Domestic Client ID"}}, - }, - { - "label": "Not related to an opportunity", - "value": { - "input": { - "text": "Not related to an opportunity" - } + { + "label": "ISC Account ID", + "value": {"input": {"text": "ISC Account ID"}}, }, - }, - ], - "repeat_on_reprompt": True, - }, - ], - }, - ], - user_query="Domestic Client ID", - ), - phase=AgentPhase.RUNTIME, + { + "label": "Domestic Buying Group ID", + "value": { + "input": { + "text": "Domestic Buying Group ID" + } + }, + }, + { + "label": "Domestic Client ID", + "value": { + "input": {"text": "Domestic Client ID"} + }, + }, + { + "label": "Not related to an opportunity", + "value": { + "input": { + "text": "Not related to an opportunity" + } + }, + }, + ], + "repeat_on_reprompt": True, + }, + ], + }, + ], + user_query="Domestic Client ID", + ), + phase=AgentPhase.RUNTIME, + ) ) assert follow_up_detection_result.is_follow_up assert follow_up_detection_result.metadata.get("detection_type") == "choosen_option" diff --git a/tests/pre_llm/routing/retrieval_augmented_thinking/milvus/test_topic_retriever.py b/tests/pre_llm/routing/retrieval_augmented_thinking/milvus/test_topic_retriever.py index 9280939f..0cbff6a1 100644 --- a/tests/pre_llm/routing/retrieval_augmented_thinking/milvus/test_topic_retriever.py +++ b/tests/pre_llm/routing/retrieval_augmented_thinking/milvus/test_topic_retriever.py @@ -20,7 +20,6 @@ MilvusProvider, ) - pytest.skip( "current tests hang, perhaps due to current milvus-lite", allow_module_level=True ) diff --git a/tests/pre_llm/routing/retrieval_augmented_thinking/milvus/test_topic_sink.py b/tests/pre_llm/routing/retrieval_augmented_thinking/milvus/test_topic_sink.py index e13c2c7c..55e6bf13 100644 --- a/tests/pre_llm/routing/retrieval_augmented_thinking/milvus/test_topic_sink.py +++ b/tests/pre_llm/routing/retrieval_augmented_thinking/milvus/test_topic_sink.py @@ -14,7 +14,6 @@ MilvusProvider, ) - pytest.skip( "current tests hang, perhaps due to current milvus-lite", allow_module_level=True ) diff --git 
a/tests/pre_llm/routing/retrieval_augmented_thinking/topic_loading/test_topic_loading_milvus.py b/tests/pre_llm/routing/retrieval_augmented_thinking/topic_loading/test_topic_loading_milvus.py
index 48090337..f0e94da4 100644
--- a/tests/pre_llm/routing/retrieval_augmented_thinking/topic_loading/test_topic_loading_milvus.py
+++ b/tests/pre_llm/routing/retrieval_augmented_thinking/topic_loading/test_topic_loading_milvus.py
@@ -27,7 +27,6 @@
 )
 from altk.core.toolkit import AgentPhase
 
-
 pytest.skip(
     "current milvus tests hang, perhaps due to current milvus-lite",
     allow_module_level=True,
diff --git a/tests/pre_response/policy_guard_test.py b/tests/pre_response/policy_guard_test.py
index 09d7a315..7e3db00d 100644
--- a/tests/pre_response/policy_guard_test.py
+++ b/tests/pre_response/policy_guard_test.py
@@ -20,7 +20,6 @@
     RetryRepairer,
 )
 
-
 load_dotenv()
 
 
@@ -66,9 +65,9 @@ def test_detect(self, llm_client):
         for policy, expected, output in zip(
             policies, expected_results, result.policy_outputs
         ):
-            assert output.compliance == expected, (
-                f"Poicy: {policy} -> expected compliance {expected} didn't match result"
-            )
+            assert (
+                output.compliance == expected
+            ), f"Policy: {policy} -> expected compliance {expected} didn't match result"
 
     # def test_messages(self, tmp_path, llm_client):
     #     config = ComponentConfig(llm_client=llm_client)
diff --git a/tests/pre_tool/sparc/semantic_validation_test.py b/tests/pre_tool/sparc/semantic_validation_test.py
index 973afd66..2bb58207 100644
--- a/tests/pre_tool/sparc/semantic_validation_test.py
+++ b/tests/pre_tool/sparc/semantic_validation_test.py
@@ -133,9 +133,9 @@ def multi_domain_tool_specs(self):
 
     def test_function_selection_misalignment(self, middleware, multi_domain_tool_specs):
         """Test detection of function selection that doesn't align with user intent."""
-        assert not middleware._initialization_error, (
-            f"Initialization failed: {middleware._initialization_error}"
-        )
+        assert (
+            not middleware._initialization_error
+        ), f"Initialization failed: {middleware._initialization_error}"
 
         conversation_context = [
             {"role": "user", "content": "What's the weather like in New York today?"},
@@ -197,9 +197,9 @@ def test_parameter_value_grounding_issues(
     ):
         """Test detection of parameter values not grounded in conversation context."""
 
-        assert not middleware._initialization_error, (
-            f"Initialization failed: {middleware._initialization_error}"
-        )
+        assert (
+            not middleware._initialization_error
+        ), f"Initialization failed: {middleware._initialization_error}"
 
         conversation_context = [
             {
@@ -247,9 +247,9 @@ def test_parameter_value_grounding_issues(
     def test_valid_semantic_alignment(self, middleware, multi_domain_tool_specs):
         """Test that semantically valid and well-grounded function calls pass validation."""
 
-        assert not middleware._initialization_error, (
-            f"Initialization failed: {middleware._initialization_error}"
-        )
+        assert (
+            not middleware._initialization_error
+        ), f"Initialization failed: {middleware._initialization_error}"
 
         conversation_context = [
             {
@@ -294,9 +294,9 @@ def test_valid_semantic_alignment(self, middleware, multi_domain_tool_specs):
 
     def test_complex_conversation_context(self, middleware, multi_domain_tool_specs):
         """Test semantic validation with complex multi-turn conversation."""
         # Setup
-        assert not middleware._initialization_error, (
-            f"Initialization failed: {middleware._initialization_error}"
-        )
+        assert (
+            not middleware._initialization_error
+        ), f"Initialization failed: {middleware._initialization_error}"
 
         # Complex conversation
with multiple topics and clarifications complex_conversation = [ @@ -343,9 +343,9 @@ def test_complex_conversation_context(self, middleware, multi_domain_tool_specs) def test_edge_case_empty_conversation(self, middleware, multi_domain_tool_specs): """Test handling of edge case with minimal conversation context.""" # Setup - assert not middleware._initialization_error, ( - f"Initialization failed: {middleware._initialization_error}" - ) + assert ( + not middleware._initialization_error + ), f"Initialization failed: {middleware._initialization_error}" # Minimal conversation minimal_conversation = [{"role": "user", "content": "Weather?"}] diff --git a/tests/pre_tool/sparc/static_validation_test.py b/tests/pre_tool/sparc/static_validation_test.py index 7826eb14..bc9ecbd8 100644 --- a/tests/pre_tool/sparc/static_validation_test.py +++ b/tests/pre_tool/sparc/static_validation_test.py @@ -148,9 +148,9 @@ def test_missing_required_parameters( ): """Test detection of missing required parameters.""" - assert not middleware._initialization_error, ( - f"Initialization failed: {middleware._initialization_error}" - ) + assert ( + not middleware._initialization_error + ), f"Initialization failed: {middleware._initialization_error}" # Test missing required parameters missing_params_call = { @@ -198,9 +198,9 @@ def test_missing_required_parameters( def test_invalid_parameter_types(self, middleware, meeting_tool_specs): """Test detection of invalid parameter types.""" - assert not middleware._initialization_error, ( - f"Initialization failed: {middleware._initialization_error}" - ) + assert ( + not middleware._initialization_error + ), f"Initialization failed: {middleware._initialization_error}" conversation = [ { @@ -255,9 +255,9 @@ def test_invalid_parameter_types(self, middleware, meeting_tool_specs): def test_schema_constraint_violations(self, middleware, email_tool_specs): """Test detection of schema constraint violations.""" - assert not middleware._initialization_error, ( - f"Initialization failed: {middleware._initialization_error}" - ) + assert ( + not middleware._initialization_error + ), f"Initialization failed: {middleware._initialization_error}" conversation = [ {"role": "user", "content": "Send a high priority email to invalid-email"}, @@ -306,9 +306,9 @@ def test_schema_constraint_violations(self, middleware, email_tool_specs): def test_valid_static_structure(self, middleware, email_tool_specs): """Test that valid structure passes static validation.""" - assert not middleware._initialization_error, ( - f"Initialization failed: {middleware._initialization_error}" - ) + assert ( + not middleware._initialization_error + ), f"Initialization failed: {middleware._initialization_error}" conversation = [ { @@ -360,9 +360,9 @@ def test_malformed_json_arguments( ): """Test handling of malformed JSON in tool call arguments.""" - assert not middleware._initialization_error, ( - f"Initialization failed: {middleware._initialization_error}" - ) + assert ( + not middleware._initialization_error + ), f"Initialization failed: {middleware._initialization_error}" # Malformed JSON in arguments malformed_call = { diff --git a/tests/pre_tool/sparc/test_adapter_compact_schema.py b/tests/pre_tool/sparc/test_adapter_compact_schema.py index d2fb6472..85be96f4 100644 --- a/tests/pre_tool/sparc/test_adapter_compact_schema.py +++ b/tests/pre_tool/sparc/test_adapter_compact_schema.py @@ -34,7 +34,9 @@ def _make_spec(name: str, params: List[str]) -> ToolSpec: "description": f"desc of {name}", "parameters": { "type": 
"object", - "properties": {p: {"type": "string", "description": p} for p in params}, + "properties": { + p: {"type": "string", "description": p} for p in params + }, "required": [], }, }, @@ -44,7 +46,11 @@ def _make_spec(name: str, params: List[str]) -> ToolSpec: def _call() -> ToolCall: return ToolCall.model_validate( - {"id": "c", "type": "function", "function": {"name": "tool_0", "arguments": "{}"}} + { + "id": "c", + "type": "function", + "function": {"name": "tool_0", "arguments": "{}"}, + } ) @@ -64,25 +70,29 @@ def large_inventory() -> List[ToolSpec]: class TestCompactSchemaModes: - def test_never_keeps_full_summary_regardless_of_size(self, small_inventory, large_inventory): + def test_never_keeps_full_summary_regardless_of_size( + self, small_inventory, large_inventory + ): for specs in (small_inventory, large_inventory): ad = OpenAIAdapter(specs, _call(), compact_tool_schema="never") summary = ad.get_tools_inventory_summary() assert len(summary) == len(specs) for entry in summary: - assert isinstance(entry["tool_parameters"], dict), ( - "never mode should emit {param_name: type} dicts" - ) + assert isinstance( + entry["tool_parameters"], dict + ), "never mode should emit {param_name: type} dicts" - def test_always_uses_compact_regardless_of_size(self, small_inventory, large_inventory): + def test_always_uses_compact_regardless_of_size( + self, small_inventory, large_inventory + ): for specs in (small_inventory, large_inventory): ad = OpenAIAdapter(specs, _call(), compact_tool_schema="always") summary = ad.get_tools_inventory_summary() assert len(summary) == len(specs) for entry in summary: - assert isinstance(entry["tool_parameters"], list), ( - "always mode should emit a list of parameter names" - ) + assert isinstance( + entry["tool_parameters"], list + ), "always mode should emit a list of parameter names" def test_auto_stays_full_under_threshold(self, small_inventory): # default threshold = 20; 5 tools should stay full @@ -150,7 +160,9 @@ def test_defaults(self): assert cfg.compact_tool_threshold == 20 def test_config_override(self): - cfg = SPARCReflectionConfig(compact_tool_schema="always", compact_tool_threshold=5) + cfg = SPARCReflectionConfig( + compact_tool_schema="always", compact_tool_threshold=5 + ) assert cfg.compact_tool_schema == "always" assert cfg.compact_tool_threshold == 5 diff --git a/tests/pre_tool/sparc/test_eval_metrics_schema.py b/tests/pre_tool/sparc/test_eval_metrics_schema.py index e63b8626..0670d6d2 100644 --- a/tests/pre_tool/sparc/test_eval_metrics_schema.py +++ b/tests/pre_tool/sparc/test_eval_metrics_schema.py @@ -14,8 +14,9 @@ import pytest - -ROOT = Path(__file__).resolve().parents[3] / "altk/pre_tool/sparc/function_calling/metrics" +ROOT = ( + Path(__file__).resolve().parents[3] / "altk/pre_tool/sparc/function_calling/metrics" +) EVAL_GENERAL = ROOT / "function_call/general_metrics.json" EVAL_FUNCSEL = ROOT / "function_selection/function_selection_metrics.json" @@ -68,22 +69,22 @@ def test_schema_is_unified_diff_shape(self, eval_metrics): item = ar["items"] assert item["type"] == "object" # Required fields - assert set(item.get("required", [])) >= set(REQUIRED_ITEM_KEYS), ( - f"{path.name}::{m['name']} missing required keys in rec item" - ) + assert set(item.get("required", [])) >= set( + REQUIRED_ITEM_KEYS + ), f"{path.name}::{m['name']} missing required keys in rec item" props = item["properties"] # Target must be the closed enum - assert set(props["target"]["enum"]) == ALLOWED_TARGETS, ( - f"{path.name}::{m['name']} target enum must be 
exactly {ALLOWED_TARGETS}" - ) + assert ( + set(props["target"]["enum"]) == ALLOWED_TARGETS + ), f"{path.name}::{m['name']} target enum must be exactly {ALLOWED_TARGETS}" # Importance must be [0, 1] assert props["importance"]["minimum"] == 0 assert props["importance"]["maximum"] == 1 # No legacy fields (quote / recommendation enum / details) remain for legacy in ("quote", "recommendation", "details"): - assert legacy not in props, ( - f"{path.name}::{m['name']} legacy field {legacy!r} still in schema" - ) + assert ( + legacy not in props + ), f"{path.name}::{m['name']} legacy field {legacy!r} still in schema" def test_no_legacy_recommendation_enum_in_schema(self, eval_metrics): data, path = eval_metrics @@ -102,9 +103,9 @@ def test_no_legacy_recommendation_enum_in_schema(self, eval_metrics): "PREREQUISITE_TRACKING", "TOOL_DEPENDENCY_DOCUMENTATION", ): - assert legacy_enum not in blob, ( - f"{path.name}::{m['name']} still references legacy rec enum {legacy_enum!r}" - ) + assert ( + legacy_enum not in blob + ), f"{path.name}::{m['name']} still references legacy rec enum {legacy_enum!r}" class TestEvalExamples: diff --git a/tests/pre_tool/sparc/test_no_benchmark_leakage.py b/tests/pre_tool/sparc/test_no_benchmark_leakage.py index 79c8d7ac..0c7cb6d4 100644 --- a/tests/pre_tool/sparc/test_no_benchmark_leakage.py +++ b/tests/pre_tool/sparc/test_no_benchmark_leakage.py @@ -23,10 +23,11 @@ import pytest - ROOT = Path(__file__).resolve().parents[3] / "altk/pre_tool/sparc/function_calling" RUNTIME_GENERAL = ROOT / "metrics/function_call/general_metrics_runtime.json" -RUNTIME_FUNCSEL = ROOT / "metrics/function_selection/function_selection_metrics_runtime.json" +RUNTIME_FUNCSEL = ( + ROOT / "metrics/function_selection/function_selection_metrics_runtime.json" +) RUNTIME_PARAM = ROOT / "metrics/parameter/parameter_metrics_runtime.json" EVAL_GENERAL = ROOT / "metrics/function_call/general_metrics.json" EVAL_FUNCSEL = ROOT / "metrics/function_selection/function_selection_metrics.json" @@ -83,7 +84,9 @@ def _system_prompt_constants_from_transformation() -> str: examples — concrete demonstrations are expected to be specific and are exempt from the domain-leak blocklist. """ - from altk.pre_tool.sparc.function_calling.pipeline import transformation_prompts as tp + from altk.pre_tool.sparc.function_calling.pipeline import ( + transformation_prompts as tp, + ) parts = [] for name in ("MULTI_EXTRACT_UNITS_SYSTEM", "GENERATE_CODE_SYSTEM"): @@ -123,7 +126,10 @@ def shared_corpus() -> list[tuple[str, str]]: blobs.append((f"{path.name}[{i}]", td)) blobs.append(("common_principles.COMMON_PRINCIPLES", _common_principles_text())) blobs.append( - ("transformation_prompts.*_SYSTEM", _system_prompt_constants_from_transformation()) + ( + "transformation_prompts.*_SYSTEM", + _system_prompt_constants_from_transformation(), + ) ) return blobs @@ -158,6 +164,6 @@ def test_mutating_as_rule_keyword_is_gone(shared_corpus): # Assert the shouty form is gone from rules. 
pattern = re.compile(r"\bMUTATING\b") offenders = [label for label, text in shared_corpus if pattern.search(text)] - assert not offenders, ( - f"uppercase 'MUTATING' must not appear in shared prompt rules: {offenders}" - ) + assert ( + not offenders + ), f"uppercase 'MUTATING' must not appear in shared prompt rules: {offenders}" diff --git a/tests/pre_tool/sparc/test_reflection_score.py b/tests/pre_tool/sparc/test_reflection_score.py index b96fa5ba..3efa193e 100644 --- a/tests/pre_tool/sparc/test_reflection_score.py +++ b/tests/pre_tool/sparc/test_reflection_score.py @@ -23,11 +23,16 @@ from altk.pre_tool.sparc.sparc import SPARCReflectionComponent -def _metric(is_issue: bool, output: float | None, confidence: float = 0.9, error: str = ""): - raw = {"output": output, "confidence": confidence, "explanation": "e", "correction": None} - return SimpleNamespace( - is_issue=is_issue, raw_response=raw, error=error - ) +def _metric( + is_issue: bool, output: float | None, confidence: float = 0.9, error: str = "" +): + raw = { + "output": output, + "confidence": confidence, + "explanation": "e", + "correction": None, + } + return SimpleNamespace(is_issue=is_issue, raw_response=raw, error=error) class _PipelineResult(SimpleNamespace): @@ -110,12 +115,16 @@ def __init__(self): # noqa: D401 return _BareComponent() -def _pipeline(general=None, function_selection=None, parameter=None, transform=None, static=None): +def _pipeline( + general=None, function_selection=None, parameter=None, transform=None, static=None +): """Build a PipelineResult-shaped SimpleNamespace.""" return _PipelineResult( static=static, semantic=SimpleNamespace( - general=SimpleNamespace(metrics=general or {}) if general is not None else None, + general=( + SimpleNamespace(metrics=general or {}) if general is not None else None + ), function_selection=( SimpleNamespace(metrics=function_selection or {}) if function_selection is not None @@ -250,9 +259,7 @@ def test_multiple_metrics_aggregate_all_recs(self): comp = _build_component() bad = _metric_with_recs(True, 2.0, [self.GOOD_REC]) ok = _metric_with_recs(False, 5.0, [self.PARAM_REC]) - result = comp._process_pipeline_result( - _pipeline(general={"g1": bad, "g2": ok}) - ) + result = comp._process_pipeline_result(_pipeline(general={"g1": bad, "g2": ok})) assert len(result.all_recommendations) == 2 def test_malformed_rec_is_dropped(self): diff --git a/tests/pre_tool/sparc/test_runtime_prompts.py b/tests/pre_tool/sparc/test_runtime_prompts.py index b179e271..bd358756 100644 --- a/tests/pre_tool/sparc/test_runtime_prompts.py +++ b/tests/pre_tool/sparc/test_runtime_prompts.py @@ -31,8 +31,9 @@ load_prompts_from_list, ) - -ROOT = Path(__file__).resolve().parents[3] / "altk/pre_tool/sparc/function_calling/metrics" +ROOT = ( + Path(__file__).resolve().parents[3] / "altk/pre_tool/sparc/function_calling/metrics" +) GENERAL_JSON = ROOT / "function_call/general_metrics_runtime.json" FUNCSEL_JSON = ROOT / "function_selection/function_selection_metrics_runtime.json" PARAM_JSON = ROOT / "parameter/parameter_metrics_runtime.json" @@ -108,9 +109,7 @@ def test_loader_accepts_every_funcsel_metric(self): # --------------------------------------------------------------------------- -MID_TRAJ_ANCHORS = ( - "trajectory", # "one step in an ongoing trajectory" etc. -) +MID_TRAJ_ANCHORS = ("trajectory",) # "one step in an ongoing trajectory" etc. 
REDUNDANCY_ANCHORS = ( "SAME function name AND", # "SAME function name AND SAME arguments" — anywhere @@ -135,9 +134,7 @@ def test_loader_accepts_every_funcsel_metric(self): "required", ) -EVIDENCE_ANCHORS = ( - "evidence", # either "evidence-based" or "explicit evidence" -) +EVIDENCE_ANCHORS = ("evidence",) # either "evidence-based" or "explicit evidence" def _contains_all(text: str, anchors: Iterable[str]) -> bool: @@ -251,7 +248,9 @@ def test_confirmation_scope_moved_out_of_common(self, common): assert "Confirmation Scope" not in common # Guardrail: the old domain-leaky prefix enumeration stays gone. for banned in ("get_*", "find_*", "search_*", "MUTATING"): - assert banned not in common, f"{banned!r} must not appear in common_principles" + assert ( + banned not in common + ), f"{banned!r} must not appear in common_principles" def test_stringency_moved_out_of_common(self, common): # Stringency is metric-class-specific and now lives on each @@ -275,12 +274,12 @@ class TestPromptSizeIsBounded: def test_general(self, general_metrics): for name, m in general_metrics.items(): - assert len(m["task_description"]) < self.MAX_CHARS, ( - f"{name} task_description too long ({len(m['task_description'])} chars)" - ) + assert ( + len(m["task_description"]) < self.MAX_CHARS + ), f"{name} task_description too long ({len(m['task_description'])} chars)" def test_funcsel(self, funcsel_metrics): for name, m in funcsel_metrics.items(): - assert len(m["task_description"]) < self.MAX_CHARS, ( - f"{name} task_description too long ({len(m['task_description'])} chars)" - ) + assert ( + len(m["task_description"]) < self.MAX_CHARS + ), f"{name} task_description too long ({len(m['task_description'])} chars)" diff --git a/tests/pre_tool/sparc/test_transformation_prompts.py b/tests/pre_tool/sparc/test_transformation_prompts.py index e4ffe8e3..1d932477 100644 --- a/tests/pre_tool/sparc/test_transformation_prompts.py +++ b/tests/pre_tool/sparc/test_transformation_prompts.py @@ -25,7 +25,9 @@ def test_mentions_evidence_hierarchy_sources(self): def test_has_evidence_priority_ordering(self): flat = _flat(tp.MULTI_EXTRACT_UNITS_SYSTEM) - assert "system prompt > tool outputs > user messages > assistant messages" in flat + assert ( + "system prompt > tool outputs > user messages > assistant messages" in flat + ) def test_has_under_specified_grounding_rule(self): flat = _flat(tp.MULTI_EXTRACT_UNITS_SYSTEM) diff --git a/tests/pre_tool/sparc/units_conversion_test.py b/tests/pre_tool/sparc/units_conversion_test.py index bcf26b4c..e6225a33 100644 --- a/tests/pre_tool/sparc/units_conversion_test.py +++ b/tests/pre_tool/sparc/units_conversion_test.py @@ -140,9 +140,9 @@ def distance_tool_specs(self): def test_correct_conversion_validation(self, middleware, weather_tool_specs): """Test that correct unit conversions pass validation.""" - assert not middleware._initialization_error, ( - f"Could not initialize transformation pipeline: {middleware._initialization_error}" - ) + assert ( + not middleware._initialization_error + ), f"Could not initialize transformation pipeline: {middleware._initialization_error}" conversation_context = [ { @@ -194,9 +194,9 @@ def test_correct_conversion_validation(self, middleware, weather_tool_specs): def test_no_transformation_needed(self, middleware, weather_tool_specs): """Test that tool calls not requiring transformation pass validation.""" - assert not middleware._initialization_error, ( - f"Could not initialize transformation pipeline: {middleware._initialization_error}" - ) + assert ( + not 
middleware._initialization_error + ), f"Could not initialize transformation pipeline: {middleware._initialization_error}" conversation_context = [ { @@ -252,9 +252,9 @@ def test_temperature_conversion_accuracy( ): """Test accuracy of temperature conversion detection with various values.""" - assert not middleware._initialization_error, ( - f"Could not initialize transformation pipeline: {middleware._initialization_error}" - ) + assert ( + not middleware._initialization_error + ), f"Could not initialize transformation pipeline: {middleware._initialization_error}" conversation_context = [ { @@ -311,9 +311,9 @@ def test_transformation_disabled_config( ): """Test that transformation validation can be disabled.""" - assert not no_transform_middleware._initialization_error, ( - f"Could not initialize transformation pipeline: {no_transform_middleware._initialization_error}" - ) + assert ( + not no_transform_middleware._initialization_error + ), f"Could not initialize transformation pipeline: {no_transform_middleware._initialization_error}" conversation_context = [ {"role": "user", "content": "Set thermostat to 75 Fahrenheit"}, @@ -355,9 +355,9 @@ def test_transformation_disabled_config( def test_complex_transformation_scenario(self, middleware): """Test complex transformation scenario with multiple unit types.""" - assert not middleware._initialization_error, ( - f"Could not initialize transformation pipeline: {middleware._initialization_error}" - ) + assert ( + not middleware._initialization_error + ), f"Could not initialize transformation pipeline: {middleware._initialization_error}" # Tool with multiple unit-sensitive parameters complex_tool_specs = [ diff --git a/tests/utils/refraction/langgraph/utils.py b/tests/utils/refraction/langgraph/utils.py index 63d8ff75..369c0c19 100644 --- a/tests/utils/refraction/langgraph/utils.py +++ b/tests/utils/refraction/langgraph/utils.py @@ -5,7 +5,6 @@ from nestful.utils import get_token from langchain_core.tools import BaseTool - PROMPT_TEMPLATE = Template(""" You are an expert in correcting tool calls. You are given a set of available tools, a query and an incorrect tool call that was meant to satisfy the query. diff --git a/tests/utils/refraction/mellea/prompt.py b/tests/utils/refraction/mellea/prompt.py index 4a2832f1..3a087f56 100644 --- a/tests/utils/refraction/mellea/prompt.py +++ b/tests/utils/refraction/mellea/prompt.py @@ -1,7 +1,6 @@ from typing import Dict, List, Any from jinja2 import Template - PROMPT_TEMPLATE = """ You are an expert in tool calling. You are given a user query, a set of available tools, and the current working memory. 
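For reference, a minimal sketch of how a jinja2-based prompt template like the mellea PROMPT_TEMPLATE above can be rendered. The placeholder names (query, tools, memory) are illustrative assumptions only, not the fields of the actual template in this patch:

from jinja2 import Template

# Stand-in template in the spirit of the mellea PROMPT_TEMPLATE above;
# the {{ ... }} placeholders are assumptions for illustration only.
SKETCH_TEMPLATE = Template(
    "You are an expert in tool calling. You are given a user query, "
    "a set of available tools, and the current working memory.\n"
    "Query: {{ query }}\n"
    "Tools: {{ tools | join(', ') }}\n"
    "Memory: {{ memory }}\n"
)

# Render the template into a concrete prompt string.
prompt = SKETCH_TEMPLATE.render(
    query="Convert 25 Celsius to Kelvin",
    tools=["unit_convert", "get_weather"],
    memory={},
)
print(prompt)

The string.Template-based prompts elsewhere in these tests (such as the langgraph utils PROMPT_TEMPLATE) would instead be filled with substitute() or safe_substitute() rather than render().
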
diff --git a/tests/utils/refraction/utils.py b/tests/utils/refraction/utils.py index c4edf3e0..cbaea6c0 100644 --- a/tests/utils/refraction/utils.py +++ b/tests/utils/refraction/utils.py @@ -7,7 +7,6 @@ import json - F = TypeVar("F", bound=Callable[..., Any]) From 3d435f00f88400b9bbc2b0bd0fcf0cafaad61895 Mon Sep 17 00:00:00 2001 From: Osher-Elhadad Date: Wed, 6 May 2026 00:32:36 +0300 Subject: [PATCH 5/7] feat: ruff format Signed-off-by: Osher-Elhadad --- .../python_tool_enrichment/enrichment.py | 24 +- .../enrichment_utils/tool/docstring_utils.py | 4 +- .../enrichment_utils/tool/service.py | 14 +- .../nl_utterance_generation_utils/nlg_util.py | 4 +- .../all_testcase_post_process.py | 4 +- altk/pre_llm/spotlight/spotlight.py | 12 +- .../policy_guard/repair/repair_prompts.py | 6 +- .../pipeline/transformation_prompts.py | 30 ++- .../tool_enrichment_custom_test.py | 6 +- .../test_case_generation_custom_test.py | 6 +- .../test_wx_orchestrate.py | 250 +++++++++--------- tests/pre_response/policy_guard_test.py | 6 +- .../sparc/semantic_validation_test.py | 30 +-- .../pre_tool/sparc/static_validation_test.py | 30 +-- .../sparc/test_adapter_compact_schema.py | 12 +- .../sparc/test_eval_metrics_schema.py | 24 +- .../sparc/test_no_benchmark_leakage.py | 9 +- tests/pre_tool/sparc/test_runtime_prompts.py | 18 +- tests/pre_tool/sparc/units_conversion_test.py | 30 +-- 19 files changed, 264 insertions(+), 255 deletions(-) diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment.py b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment.py index c6d5e2f3..8a51dbfd 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment.py @@ -44,18 +44,18 @@ async def enrich_python_tool( ): options = {} options["tool_enrichment"] = {} - options["tool_enrichment"][ - "enable_tool_description_enrichment" - ] = enable_tool_description_enrichment - options["tool_enrichment"][ - "enable_tool_parameter_description_enrichment" - ] = enable_tool_parameter_description_enrichment - options["tool_enrichment"][ - "enable_tool_return_description_enrichment" - ] = enable_tool_return_description_enrichment - options["tool_enrichment"][ - "enable_tool_example_enrichment" - ] = enable_tool_example_enrichment + options["tool_enrichment"]["enable_tool_description_enrichment"] = ( + enable_tool_description_enrichment + ) + options["tool_enrichment"]["enable_tool_parameter_description_enrichment"] = ( + enable_tool_parameter_description_enrichment + ) + options["tool_enrichment"]["enable_tool_return_description_enrichment"] = ( + enable_tool_return_description_enrichment + ) + options["tool_enrichment"]["enable_tool_example_enrichment"] = ( + enable_tool_example_enrichment + ) return await self.do_enrichment( input_file_contents=python_tool_str, options=options ) diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/docstring_utils.py b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/docstring_utils.py index e052dce8..1c9c38fe 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/docstring_utils.py +++ 
b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/docstring_utils.py @@ -657,7 +657,9 @@ def extract_function_names_with_decorators( decorators.append(decorator.func.id) elif isinstance(decorator.func, ast.Attribute): # Handle decorators like @abc.decorator4() - decoratorname = f"{decorator.func.value.id}.{decorator.func.attr}" # type: ignore + decoratorname = ( + f"{decorator.func.value.id}.{decorator.func.attr}" # type: ignore + ) decorators.append(decoratorname) elif isinstance(decorator, ast.Attribute): # Handle decorators like @abc.decorator4 (without parentheses) diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/service.py b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/service.py index 8e0adcef..bf1f85c4 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/service.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/enrichment/python_tool_enrichment/enrichment_utils/tool/service.py @@ -176,9 +176,7 @@ def enrich(enrichment_config: ToolEnrichmentConfig): else: prompts_dir = prompts_dir + "restofcode_based_prompts/" - if ( - enrichment_config.input_details.options.enable_tool_description_enrichment - ): + if enrichment_config.input_details.options.enable_tool_description_enrichment: prompt_file_template = os.path.join( script_dir, prompts_dir, "toolops_description_via_code.txt" ) @@ -187,9 +185,7 @@ def enrich(enrichment_config: ToolEnrichmentConfig): ) result[cnst.TOOL_DESCRIPTION_ENRICHMENT] = generated_description - if ( - enrichment_config.input_details.options.enable_tool_return_description_enrichment - ): + if enrichment_config.input_details.options.enable_tool_return_description_enrichment: prompt_file_template = os.path.join( script_dir, prompts_dir, "toolops_return_description_via_code.txt" ) @@ -266,9 +262,9 @@ def enrich(enrichment_config: ToolEnrichmentConfig): key not in generated_input_examples[cnst.LABEL_ENRICHED_EXAMPLES] ): - generated_input_examples[cnst.LABEL_ENRICHED_EXAMPLES][ - key - ] = value + generated_input_examples[cnst.LABEL_ENRICHED_EXAMPLES][key] = ( + value + ) result[cnst.TOOL_EXAMPLES_ENRICHMENT] = generated_input_examples # else: diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/nl_utterance_generation/nl_utterance_generation_utils/nlg_util.py b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/nl_utterance_generation/nl_utterance_generation_utils/nlg_util.py index 89798b63..6c2b8868 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/nl_utterance_generation/nl_utterance_generation_utils/nlg_util.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/nl_utterance_generation/nl_utterance_generation_utils/nlg_util.py @@ -9,7 +9,8 @@ ) -preamble = 'Given an API specification and an input payload, along with a sample utterance, your task as an expert user is to \ +preamble = ( + 'Given an API specification and an input payload, along with a sample utterance, your task as an expert user is to \ create multiple, distinct human-like sentences that convey the same information accurately. Each paraphrased sentence \ should:\n\n1. 
Maintain Fluency: Ensure the sentences are natural and conversational, avoiding robotic or overly formal \ language.\n - Good Example: "Can you book a flight from Delhi to NYC on the 15th of July?"\n - Bad Example: \ @@ -19,6 +20,7 @@ booking IDs, transaction IDs, etc.). Use them exactly as provided in the user utterance.\n\n4. No Introductory Text: Do not \ include lines like “Here are the paraphrases” or similar — output only the paraphrased sentences.\n\nPlease generate the paraphrased \ sentences based on the following\n\n API specification:' +) def is_invalid_utterance(utterance): diff --git a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/all_testcase_post_process.py b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/all_testcase_post_process.py index 0e6d4f73..902ec1cb 100644 --- a/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/all_testcase_post_process.py +++ b/altk/build_time/test_case_generation_toolkit/src/toolops/generation/test_case_generation/test_case_generation_utils/all_testcase_post_process.py @@ -234,9 +234,7 @@ def all_testcase_postprocess( "operation" ][operation_type][param_type][ specific_param - ][ - "type" - ] + ]["type"] param_found_flag = True break if param_found_flag: diff --git a/altk/pre_llm/spotlight/spotlight.py b/altk/pre_llm/spotlight/spotlight.py index 4d05d82e..8a4be492 100644 --- a/altk/pre_llm/spotlight/spotlight.py +++ b/altk/pre_llm/spotlight/spotlight.py @@ -182,9 +182,9 @@ def get_span_range( elif isinstance(emph_strings[0], str): emph_strings = [[s] for s in emph_strings] - assert len(prompts) == len( - emph_strings - ), "Mismatch prompts ↔ emphasised strings" + assert len(prompts) == len(emph_strings), ( + "Mismatch prompts ↔ emphasised strings" + ) span_ranges_per_sample = [] for prompt, span_list, offsets in zip(prompts, emph_strings, offset_mappings): @@ -355,9 +355,9 @@ def _run(self, data: SpotLightRunInput) -> SpotLightRunOutput: messages, tokenize=False, add_generation_prompt=True ) ] - assert len(emph_strings) == len( - chat - ), "Need 1:1 mapping between number of prompts and emph_strings per prompt" + assert len(emph_strings) == len(chat), ( + "Need 1:1 mapping between number of prompts and emph_strings per prompt" + ) inputs, offset_mappings, tokenized_inp_len = self.tokenize_inputs( chat, self.tokenizer diff --git a/altk/pre_response/policy_guard/repair/repair_prompts.py b/altk/pre_response/policy_guard/repair/repair_prompts.py index c6abc0ed..8251f1ed 100644 --- a/altk/pre_response/policy_guard/repair/repair_prompts.py +++ b/altk/pre_response/policy_guard/repair/repair_prompts.py @@ -565,9 +565,9 @@ def priority_repair_prompt( def ordered_repair_prompt( text: str, policies: list[str], weights: list[float] = None, ranks: list[int] = None ) -> str: - assert not ( - weights is None and ranks is None - ), "One of weights or ranks must be set. " + assert not (weights is None and ranks is None), ( + "One of weights or ranks must be set. 
" + ) if weights is not None: # Sort from highest to lowest weight sorted_policies = [x for _, x in sorted(zip(weights, policies), reverse=True)] diff --git a/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py b/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py index 44e4b819..e205f44f 100644 --- a/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py +++ b/altk/pre_tool/sparc/function_calling/pipeline/transformation_prompts.py @@ -399,7 +399,8 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: " return int(transformed_value)\n" ) -transformation_eval_example1 = """ +transformation_eval_example1 = ( + """ ### Example 1: TRANSFORMATION SUMMARY: Convert a date string with the format 'month day, year' to a unix timestamp, e.g., 'December 1st, 2024' converted to a unix timestamp. @@ -410,7 +411,12 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: TRANSFORMED TYPE: int RESPONSE: -{{""" + '"generated_code": "' + generated_code_example1 + '"' + """}}""" +{{""" + + '"generated_code": "' + + generated_code_example1 + + '"' + + """}}""" +) generated_code_example2 = ( "def transformation_code(input_value: str) -> float:\n" @@ -439,7 +445,8 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: " return float(transformed_value.strip())\n" ) -transformation_eval_example2 = """ +transformation_eval_example2 = ( + """ ### Example 2: TRANSFORMATION SUMMARY: Convert a string in milliseconds to seconds, e.g., '1000' (milliseconds) divided by 1000 to be in seconds. @@ -450,7 +457,12 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: TRANSFORMED TYPE: float RESPONSE: -{{""" + '"generated_code": "' + generated_code_example2 + '"' + """}}""" +{{""" + + '"generated_code": "' + + generated_code_example2 + + '"' + + """}}""" +) generated_code_example3 = ( "def transformation_code(input_value: str) -> list[float]:\n" @@ -485,7 +497,8 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: " return [float(transformed_value)]\n" ) -transformation_eval_example3 = """ +transformation_eval_example3 = ( + """ ### Example 3: TRANSFORMATION SUMMARY: Convert a temperature string in Celsius to Kelvin, e.g., '25' (Celsius) added 273.15 to be in Kelvin. 
@@ -496,7 +509,12 @@ def build_multi_extract_units_schema(params: List[str]) -> Dict[str, Any]: TRANSFORMED TYPE: list RESPONSE: -{{""" + '"generated_code": "' + generated_code_example3 + '"' + """}}""" +{{""" + + '"generated_code": "' + + generated_code_example3 + + '"' + + """}}""" +) transformation_eval_example4 = """ ### Unsupported Transformation Example: diff --git a/tests/build_time/tool_enrichment_toolkit/tool_enrichment_custom_test.py b/tests/build_time/tool_enrichment_toolkit/tool_enrichment_custom_test.py index 088dcf8e..16ecb8ba 100644 --- a/tests/build_time/tool_enrichment_toolkit/tool_enrichment_custom_test.py +++ b/tests/build_time/tool_enrichment_toolkit/tool_enrichment_custom_test.py @@ -28,7 +28,8 @@ def test_tool_enrichment_with_toolkit_interface(): PythonToolEnrichConfig, ) - tool_enrich_input = PythonToolEnrichBuildInput(python_tool='''import requests + tool_enrich_input = PythonToolEnrichBuildInput( + python_tool='''import requests from typing import Optional, Dict, Any from langchain_core.tools import tool from model_utils import load_github_token @@ -48,7 +49,8 @@ def listIssues(owner: str, repo: str, requestBody: Optional[Dict[str, Any]] = No response = requests.get(url, headers=headers, params=requestBody or {}) response.raise_for_status() - return response.json()''') + return response.json()''' + ) config = PythonToolEnrichConfig( llm_client=get_llm_client_obj(model_name="mistralai/mistral-medium-2505"), gen_mode=GenerationMode.TEXT, diff --git a/tests/build_time/toolops_test_case_generation_toolkit/test_case_generation_custom_test.py b/tests/build_time/toolops_test_case_generation_toolkit/test_case_generation_custom_test.py index 07e6c7e5..5a85d837 100644 --- a/tests/build_time/toolops_test_case_generation_toolkit/test_case_generation_custom_test.py +++ b/tests/build_time/toolops_test_case_generation_toolkit/test_case_generation_custom_test.py @@ -22,7 +22,8 @@ def get_llm_client_obj(model_name="mistralai/mistral-medium-2505"): def test_case_generation_with_toolkit_interface(): - test_case_gen_input = TestCaseGenBuildInput(python_tool_str='''import requests + test_case_gen_input = TestCaseGenBuildInput( + python_tool_str='''import requests from typing import Optional, Dict, Any from langchain_core.tools import tool from model_utils import load_github_token @@ -42,7 +43,8 @@ def listIssues(owner: str, repo: str, requestBody: Optional[Dict[str, Any]] = No response = requests.get(url, headers=headers, params=requestBody or {}) response.raise_for_status() - return response.json()''') + return response.json()''' + ) config = TestCaseGenConfig( llm_client=get_llm_client_obj(model_name="mistralai/mistral-medium-2505"), gen_mode=GenerationMode.TEXT, diff --git a/tests/pre_llm/routing/follow_up_detection/test_wx_orchestrate.py b/tests/pre_llm/routing/follow_up_detection/test_wx_orchestrate.py index 61b8e67f..7f47ed1c 100644 --- a/tests/pre_llm/routing/follow_up_detection/test_wx_orchestrate.py +++ b/tests/pre_llm/routing/follow_up_detection/test_wx_orchestrate.py @@ -23,71 +23,69 @@ def test_basic_conversation(caplog, llm_client): config=ComponentConfig(llm_client=llm_client), follow_up_detector=WxOrchestrateFollowUpDetector(), ) - follow_up_detection_result: FollowUpDetectionRunOutput = ( - follow_up_detection.process( - data=FollowUpDetectionRunInput[WxOrchestrateMessages]( - conversation_history=[ - { - "role": "user", - "content": [ - { - "response_type": "text", - "text": "how do i make an opportunity", - } - ], - }, - { - "role": "assistant", - "content": [ - { - 
"response_type": "text", - "text": "\n\n### Creating an Opportunity in IBM Sales Cloud (ISC)\n\nTo create an opportunity in IBM Sales Cloud (ISC), follow these steps:\n\n1. **Create a new opportunity**: Click on 'New' in the 'My Opportunities' section.\n2. **Fill in the necessary details**: Enter the opportunity name, select the account, add the sales stage, and input the closing date. Click save.\n3. **Add a product**: Scroll down to the 'Products' section and click 'Add Products.' Search for the product in the catalog, ensuring it's part of the SW Services product family.\n4. **Complete product details**: Enter the estimated total revenue, term, classification, and type code.\n5. **Add 'Data Observability by Databand' product**: Search for 'Data Observability by Databand' and select either 'SW Subscription' or 'SaaS' depending on the customer scenario.\n6. **Complete additional features**: In the 'Additional Features' section, select 'Data Observability.'\n7. **Add opportunity team members**: For an Americas opportunity, add Rodney Cobb; for EMEA, add Sonia Batanero; for APAC and Japan, add Brian Mayer.\n\n\nFor more detailed instructions and variations based on specific products or scenarios, refer to the following resources:\n* [Creating a Databand Opportunity in ISC](https://ibm.seismic.com/Link/Content/DCqmJDP6HVW7pGCFp887M9W7D8mV)\n* [Input Offering Type in ISC -transcript - 2023-Aug-28](https://ibm.seismic.com/Link/Content/DCWPJQd2M2q6m89JFM2HQ6PW9C78)", - "format": {"use_padding": True}, - "streaming_metadata": { - "id": 1, - "stream_id": "72c986e2-9de7-4811-bb0f-04658374a712", - }, + follow_up_detection_result: FollowUpDetectionRunOutput = follow_up_detection.process( + data=FollowUpDetectionRunInput[WxOrchestrateMessages]( + conversation_history=[ + { + "role": "user", + "content": [ + { + "response_type": "text", + "text": "how do i make an opportunity", + } + ], + }, + { + "role": "assistant", + "content": [ + { + "response_type": "text", + "text": "\n\n### Creating an Opportunity in IBM Sales Cloud (ISC)\n\nTo create an opportunity in IBM Sales Cloud (ISC), follow these steps:\n\n1. **Create a new opportunity**: Click on 'New' in the 'My Opportunities' section.\n2. **Fill in the necessary details**: Enter the opportunity name, select the account, add the sales stage, and input the closing date. Click save.\n3. **Add a product**: Scroll down to the 'Products' section and click 'Add Products.' Search for the product in the catalog, ensuring it's part of the SW Services product family.\n4. **Complete product details**: Enter the estimated total revenue, term, classification, and type code.\n5. **Add 'Data Observability by Databand' product**: Search for 'Data Observability by Databand' and select either 'SW Subscription' or 'SaaS' depending on the customer scenario.\n6. **Complete additional features**: In the 'Additional Features' section, select 'Data Observability.'\n7. 
**Add opportunity team members**: For an Americas opportunity, add Rodney Cobb; for EMEA, add Sonia Batanero; for APAC and Japan, add Brian Mayer.\n\n\nFor more detailed instructions and variations based on specific products or scenarios, refer to the following resources:\n* [Creating a Databand Opportunity in ISC](https://ibm.seismic.com/Link/Content/DCqmJDP6HVW7pGCFp887M9W7D8mV)\n* [Input Offering Type in ISC -transcript - 2023-Aug-28](https://ibm.seismic.com/Link/Content/DCWPJQd2M2q6m89JFM2HQ6PW9C78)", + "format": {"use_padding": True}, + "streaming_metadata": { + "id": 1, + "stream_id": "72c986e2-9de7-4811-bb0f-04658374a712", }, - { - "response_type": "user_defined", - "user_defined": { - "user_defined_type": "sales-assets-rag", - "is_everyone_social_enabled": False, - }, - "repeat_on_reprompt": False, - "streaming_metadata": {"id": 2}, + }, + { + "response_type": "user_defined", + "user_defined": { + "user_defined_type": "sales-assets-rag", + "is_everyone_social_enabled": False, }, - { - "response_type": "user_defined", - "user_defined": { - "no_label": "👎🏼", - "yes_label": "👍🏼", - "issue_options": [ - "Partial Answer", - "Wrong No Source", - "Format Style Issues", - "Incorrect Response", - "Expected Tabular Response", - "Other Issues", - ], - "place_holder_text": "", - "user_defined_type": "custom_feedback", - "custom_user_defined_type": "custom_feedback", - "text_on_negative_feedback": "Thank you for your feedback!", - "text_on_positive_feedback": "Thank you for your feedback!", - "slack_notification_channel": "ibmSales", - "slack_negative_greeting_message": "Dear SMEs \n\n Please review the following interaction to identify potential areas for improvement", - "slack_positive_greeting_message": "Hooray :party_1:, user liked the assistant response :tada:", - }, - "streaming_metadata": {"id": 3}, + "repeat_on_reprompt": False, + "streaming_metadata": {"id": 2}, + }, + { + "response_type": "user_defined", + "user_defined": { + "no_label": "👎🏼", + "yes_label": "👍🏼", + "issue_options": [ + "Partial Answer", + "Wrong No Source", + "Format Style Issues", + "Incorrect Response", + "Expected Tabular Response", + "Other Issues", + ], + "place_holder_text": "", + "user_defined_type": "custom_feedback", + "custom_user_defined_type": "custom_feedback", + "text_on_negative_feedback": "Thank you for your feedback!", + "text_on_positive_feedback": "Thank you for your feedback!", + "slack_notification_channel": "ibmSales", + "slack_negative_greeting_message": "Dear SMEs \n\n Please review the following interaction to identify potential areas for improvement", + "slack_positive_greeting_message": "Hooray :party_1:, user liked the assistant response :tada:", }, - ], - }, - ], - user_query="Partial Answer", - ), - phase=AgentPhase.RUNTIME, - ) + "streaming_metadata": {"id": 3}, + }, + ], + }, + ], + user_query="Partial Answer", + ), + phase=AgentPhase.RUNTIME, ) assert follow_up_detection_result.is_follow_up assert follow_up_detection_result.metadata.get("detection_type") == "llm" @@ -100,76 +98,68 @@ def test_conversation_with_mention_to_assistant(caplog, llm_client): config=ComponentConfig(llm_client=llm_client), follow_up_detector=WxOrchestrateFollowUpDetector(), ) - follow_up_detection_result: FollowUpDetectionRunOutput = ( - follow_up_detection.process( - data=FollowUpDetectionRunInput[WxOrchestrateMessages]( - conversation_history=[ - { - "role": "user", - "content": [ - { - "response_type": "text", - "text": "what is ibm water policy", - } - ], - }, - { - "role": "assistant", - "content": [ - { 
-                                "response_type": "text",
-                                "text": "Thanks!\xa0Before I answer, it would be helpful for me if you could share some of the below information about the deal.",
-                            },
-                            {
-                                "response_type": "text",
-                                "text": "Which of the below can you share?",
-                                "repeat_on_reprompt": False,
-                            },
-                            {
-                                "response_type": "option",
-                                "options": [
-                                    {
-                                        "label": "ISC Opportunity ID",
-                                        "value": {
-                                            "input": {"text": "ISC Opportunity ID"}
-                                        },
-                                    },
-                                    {
-                                        "label": "ISC Account ID",
-                                        "value": {"input": {"text": "ISC Account ID"}},
-                                    },
-                                    {
-                                        "label": "Domestic Buying Group ID",
-                                        "value": {
-                                            "input": {
-                                                "text": "Domestic Buying Group ID"
-                                            }
-                                        },
-                                    },
-                                    {
-                                        "label": "Domestic Client ID",
-                                        "value": {
-                                            "input": {"text": "Domestic Client ID"}
-                                        },
+    follow_up_detection_result: FollowUpDetectionRunOutput = follow_up_detection.process(
+        data=FollowUpDetectionRunInput[WxOrchestrateMessages](
+            conversation_history=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "response_type": "text",
+                            "text": "what is ibm water policy",
+                        }
+                    ],
+                },
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "response_type": "text",
+                            "text": "Thanks!\xa0Before I answer, it would be helpful for me if you could share some of the below information about the deal.",
+                        },
+                        {
+                            "response_type": "text",
+                            "text": "Which of the below can you share?",
+                            "repeat_on_reprompt": False,
+                        },
+                        {
+                            "response_type": "option",
+                            "options": [
+                                {
+                                    "label": "ISC Opportunity ID",
+                                    "value": {"input": {"text": "ISC Opportunity ID"}},
+                                },
+                                {
+                                    "label": "ISC Account ID",
+                                    "value": {"input": {"text": "ISC Account ID"}},
+                                },
+                                {
+                                    "label": "Domestic Buying Group ID",
+                                    "value": {
+                                        "input": {"text": "Domestic Buying Group ID"}
                                     },
-                                    {
-                                        "label": "Not related to an opportunity",
-                                        "value": {
-                                            "input": {
-                                                "text": "Not related to an opportunity"
-                                            }
-                                        },
+                                },
+                                {
+                                    "label": "Domestic Client ID",
+                                    "value": {"input": {"text": "Domestic Client ID"}},
+                                },
+                                {
+                                    "label": "Not related to an opportunity",
+                                    "value": {
+                                        "input": {
+                                            "text": "Not related to an opportunity"
+                                        }
                                     },
-                                ],
-                                "repeat_on_reprompt": True,
-                            },
-                        ],
-                    },
-                ],
-                user_query="Domestic Client ID",
-            ),
-            phase=AgentPhase.RUNTIME,
-        )
+                                },
+                            ],
+                            "repeat_on_reprompt": True,
+                        },
+                    ],
+                },
+            ],
+            user_query="Domestic Client ID",
+        ),
+        phase=AgentPhase.RUNTIME,
     )
     assert follow_up_detection_result.is_follow_up
     assert follow_up_detection_result.metadata.get("detection_type") == "choosen_option"
diff --git a/tests/pre_response/policy_guard_test.py b/tests/pre_response/policy_guard_test.py
index 7e3db00d..64821855 100644
--- a/tests/pre_response/policy_guard_test.py
+++ b/tests/pre_response/policy_guard_test.py
@@ -65,9 +65,9 @@ def test_detect(self, llm_client):
         for policy, expected, output in zip(
             policies, expected_results, result.policy_outputs
         ):
-            assert (
-                output.compliance == expected
-            ), f"Poicy: {policy} -> expected compliance {expected} didn't match result"
+            assert output.compliance == expected, (
+                f"Policy: {policy} -> expected compliance {expected} didn't match result"
+            )

     # def test_messages(self, tmp_path, llm_client):
     #     config = ComponentConfig(llm_client=llm_client)
diff --git a/tests/pre_tool/sparc/semantic_validation_test.py b/tests/pre_tool/sparc/semantic_validation_test.py
index 2bb58207..973afd66 100644
--- a/tests/pre_tool/sparc/semantic_validation_test.py
+++ b/tests/pre_tool/sparc/semantic_validation_test.py
@@ -133,9 +133,9 @@ def multi_domain_tool_specs(self):

     def test_function_selection_misalignment(self, middleware, multi_domain_tool_specs):
         """Test detection of function 
selection that doesn't align with user intent.""" - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) conversation_context = [ {"role": "user", "content": "What's the weather like in New York today?"}, @@ -197,9 +197,9 @@ def test_parameter_value_grounding_issues( ): """Test detection of parameter values not grounded in conversation context.""" - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) conversation_context = [ { @@ -247,9 +247,9 @@ def test_parameter_value_grounding_issues( def test_valid_semantic_alignment(self, middleware, multi_domain_tool_specs): """Test that semantically valid and well-grounded function calls pass validation.""" - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) conversation_context = [ { @@ -294,9 +294,9 @@ def test_valid_semantic_alignment(self, middleware, multi_domain_tool_specs): def test_complex_conversation_context(self, middleware, multi_domain_tool_specs): """Test semantic validation with complex multi-turn conversation.""" # Setup - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) # Complex conversation with multiple topics and clarifications complex_conversation = [ @@ -343,9 +343,9 @@ def test_complex_conversation_context(self, middleware, multi_domain_tool_specs) def test_edge_case_empty_conversation(self, middleware, multi_domain_tool_specs): """Test handling of edge case with minimal conversation context.""" # Setup - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) # Minimal conversation minimal_conversation = [{"role": "user", "content": "Weather?"}] diff --git a/tests/pre_tool/sparc/static_validation_test.py b/tests/pre_tool/sparc/static_validation_test.py index bc9ecbd8..7826eb14 100644 --- a/tests/pre_tool/sparc/static_validation_test.py +++ b/tests/pre_tool/sparc/static_validation_test.py @@ -148,9 +148,9 @@ def test_missing_required_parameters( ): """Test detection of missing required parameters.""" - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) # Test missing required parameters missing_params_call = { @@ -198,9 +198,9 @@ def test_missing_required_parameters( def test_invalid_parameter_types(self, middleware, meeting_tool_specs): """Test detection of invalid parameter types.""" - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) conversation = [ { @@ -255,9 +255,9 @@ def 
test_invalid_parameter_types(self, middleware, meeting_tool_specs): def test_schema_constraint_violations(self, middleware, email_tool_specs): """Test detection of schema constraint violations.""" - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) conversation = [ {"role": "user", "content": "Send a high priority email to invalid-email"}, @@ -306,9 +306,9 @@ def test_schema_constraint_violations(self, middleware, email_tool_specs): def test_valid_static_structure(self, middleware, email_tool_specs): """Test that valid structure passes static validation.""" - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) conversation = [ { @@ -360,9 +360,9 @@ def test_malformed_json_arguments( ): """Test handling of malformed JSON in tool call arguments.""" - assert ( - not middleware._initialization_error - ), f"Initialization failed: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Initialization failed: {middleware._initialization_error}" + ) # Malformed JSON in arguments malformed_call = { diff --git a/tests/pre_tool/sparc/test_adapter_compact_schema.py b/tests/pre_tool/sparc/test_adapter_compact_schema.py index 85be96f4..47e9312e 100644 --- a/tests/pre_tool/sparc/test_adapter_compact_schema.py +++ b/tests/pre_tool/sparc/test_adapter_compact_schema.py @@ -78,9 +78,9 @@ def test_never_keeps_full_summary_regardless_of_size( summary = ad.get_tools_inventory_summary() assert len(summary) == len(specs) for entry in summary: - assert isinstance( - entry["tool_parameters"], dict - ), "never mode should emit {param_name: type} dicts" + assert isinstance(entry["tool_parameters"], dict), ( + "never mode should emit {param_name: type} dicts" + ) def test_always_uses_compact_regardless_of_size( self, small_inventory, large_inventory @@ -90,9 +90,9 @@ def test_always_uses_compact_regardless_of_size( summary = ad.get_tools_inventory_summary() assert len(summary) == len(specs) for entry in summary: - assert isinstance( - entry["tool_parameters"], list - ), "always mode should emit a list of parameter names" + assert isinstance(entry["tool_parameters"], list), ( + "always mode should emit a list of parameter names" + ) def test_auto_stays_full_under_threshold(self, small_inventory): # default threshold = 20; 5 tools should stay full diff --git a/tests/pre_tool/sparc/test_eval_metrics_schema.py b/tests/pre_tool/sparc/test_eval_metrics_schema.py index 0670d6d2..84e9c210 100644 --- a/tests/pre_tool/sparc/test_eval_metrics_schema.py +++ b/tests/pre_tool/sparc/test_eval_metrics_schema.py @@ -69,22 +69,22 @@ def test_schema_is_unified_diff_shape(self, eval_metrics): item = ar["items"] assert item["type"] == "object" # Required fields - assert set(item.get("required", [])) >= set( - REQUIRED_ITEM_KEYS - ), f"{path.name}::{m['name']} missing required keys in rec item" + assert set(item.get("required", [])) >= set(REQUIRED_ITEM_KEYS), ( + f"{path.name}::{m['name']} missing required keys in rec item" + ) props = item["properties"] # Target must be the closed enum - assert ( - set(props["target"]["enum"]) == ALLOWED_TARGETS - ), f"{path.name}::{m['name']} target enum must be exactly {ALLOWED_TARGETS}" + assert 
set(props["target"]["enum"]) == ALLOWED_TARGETS, ( + f"{path.name}::{m['name']} target enum must be exactly {ALLOWED_TARGETS}" + ) # Importance must be [0, 1] assert props["importance"]["minimum"] == 0 assert props["importance"]["maximum"] == 1 # No legacy fields (quote / recommendation enum / details) remain for legacy in ("quote", "recommendation", "details"): - assert ( - legacy not in props - ), f"{path.name}::{m['name']} legacy field {legacy!r} still in schema" + assert legacy not in props, ( + f"{path.name}::{m['name']} legacy field {legacy!r} still in schema" + ) def test_no_legacy_recommendation_enum_in_schema(self, eval_metrics): data, path = eval_metrics @@ -103,9 +103,9 @@ def test_no_legacy_recommendation_enum_in_schema(self, eval_metrics): "PREREQUISITE_TRACKING", "TOOL_DEPENDENCY_DOCUMENTATION", ): - assert ( - legacy_enum not in blob - ), f"{path.name}::{m['name']} still references legacy rec enum {legacy_enum!r}" + assert legacy_enum not in blob, ( + f"{path.name}::{m['name']} still references legacy rec enum {legacy_enum!r}" + ) class TestEvalExamples: diff --git a/tests/pre_tool/sparc/test_no_benchmark_leakage.py b/tests/pre_tool/sparc/test_no_benchmark_leakage.py index 0c7cb6d4..5b344741 100644 --- a/tests/pre_tool/sparc/test_no_benchmark_leakage.py +++ b/tests/pre_tool/sparc/test_no_benchmark_leakage.py @@ -139,8 +139,7 @@ def test_no_benchmark_term_in_shared_prompts(shared_corpus, term): lowered_corpus = [(label, text.lower()) for label, text in shared_corpus] offenders = [label for label, text in lowered_corpus if term in text] assert not offenders, ( - f"benchmark-specific term {term!r} leaked into shared prompt text: " - f"{offenders}" + f"benchmark-specific term {term!r} leaked into shared prompt text: {offenders}" ) @@ -164,6 +163,6 @@ def test_mutating_as_rule_keyword_is_gone(shared_corpus): # Assert the shouty form is gone from rules. pattern = re.compile(r"\bMUTATING\b") offenders = [label for label, text in shared_corpus if pattern.search(text)] - assert ( - not offenders - ), f"uppercase 'MUTATING' must not appear in shared prompt rules: {offenders}" + assert not offenders, ( + f"uppercase 'MUTATING' must not appear in shared prompt rules: {offenders}" + ) diff --git a/tests/pre_tool/sparc/test_runtime_prompts.py b/tests/pre_tool/sparc/test_runtime_prompts.py index bd358756..c663e80c 100644 --- a/tests/pre_tool/sparc/test_runtime_prompts.py +++ b/tests/pre_tool/sparc/test_runtime_prompts.py @@ -248,9 +248,9 @@ def test_confirmation_scope_moved_out_of_common(self, common): assert "Confirmation Scope" not in common # Guardrail: the old domain-leaky prefix enumeration stays gone. 
for banned in ("get_*", "find_*", "search_*", "MUTATING"): - assert ( - banned not in common - ), f"{banned!r} must not appear in common_principles" + assert banned not in common, ( + f"{banned!r} must not appear in common_principles" + ) def test_stringency_moved_out_of_common(self, common): # Stringency is metric-class-specific and now lives on each @@ -274,12 +274,12 @@ class TestPromptSizeIsBounded: def test_general(self, general_metrics): for name, m in general_metrics.items(): - assert ( - len(m["task_description"]) < self.MAX_CHARS - ), f"{name} task_description too long ({len(m['task_description'])} chars)" + assert len(m["task_description"]) < self.MAX_CHARS, ( + f"{name} task_description too long ({len(m['task_description'])} chars)" + ) def test_funcsel(self, funcsel_metrics): for name, m in funcsel_metrics.items(): - assert ( - len(m["task_description"]) < self.MAX_CHARS - ), f"{name} task_description too long ({len(m['task_description'])} chars)" + assert len(m["task_description"]) < self.MAX_CHARS, ( + f"{name} task_description too long ({len(m['task_description'])} chars)" + ) diff --git a/tests/pre_tool/sparc/units_conversion_test.py b/tests/pre_tool/sparc/units_conversion_test.py index e6225a33..bcf26b4c 100644 --- a/tests/pre_tool/sparc/units_conversion_test.py +++ b/tests/pre_tool/sparc/units_conversion_test.py @@ -140,9 +140,9 @@ def distance_tool_specs(self): def test_correct_conversion_validation(self, middleware, weather_tool_specs): """Test that correct unit conversions pass validation.""" - assert ( - not middleware._initialization_error - ), f"Could not initialize transformation pipeline: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Could not initialize transformation pipeline: {middleware._initialization_error}" + ) conversation_context = [ { @@ -194,9 +194,9 @@ def test_correct_conversion_validation(self, middleware, weather_tool_specs): def test_no_transformation_needed(self, middleware, weather_tool_specs): """Test that tool calls not requiring transformation pass validation.""" - assert ( - not middleware._initialization_error - ), f"Could not initialize transformation pipeline: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Could not initialize transformation pipeline: {middleware._initialization_error}" + ) conversation_context = [ { @@ -252,9 +252,9 @@ def test_temperature_conversion_accuracy( ): """Test accuracy of temperature conversion detection with various values.""" - assert ( - not middleware._initialization_error - ), f"Could not initialize transformation pipeline: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Could not initialize transformation pipeline: {middleware._initialization_error}" + ) conversation_context = [ { @@ -311,9 +311,9 @@ def test_transformation_disabled_config( ): """Test that transformation validation can be disabled.""" - assert ( - not no_transform_middleware._initialization_error - ), f"Could not initialize transformation pipeline: {no_transform_middleware._initialization_error}" + assert not no_transform_middleware._initialization_error, ( + f"Could not initialize transformation pipeline: {no_transform_middleware._initialization_error}" + ) conversation_context = [ {"role": "user", "content": "Set thermostat to 75 Fahrenheit"}, @@ -355,9 +355,9 @@ def test_transformation_disabled_config( def test_complex_transformation_scenario(self, middleware): """Test complex transformation scenario with 
multiple unit types.""" - assert ( - not middleware._initialization_error - ), f"Could not initialize transformation pipeline: {middleware._initialization_error}" + assert not middleware._initialization_error, ( + f"Could not initialize transformation pipeline: {middleware._initialization_error}" + ) # Tool with multiple unit-sensitive parameters complex_tool_specs = [ From d037c36af3be08530975403d4728e6075f6335d3 Mon Sep 17 00:00:00 2001 From: Osher-Elhadad Date: Wed, 6 May 2026 00:36:31 +0300 Subject: [PATCH 6/7] fix: specify ValidationError in tests for invalid config modes and thresholds Signed-off-by: Osher-Elhadad --- tests/pre_tool/sparc/test_adapter_compact_schema.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/pre_tool/sparc/test_adapter_compact_schema.py b/tests/pre_tool/sparc/test_adapter_compact_schema.py index 47e9312e..7ac38cbd 100644 --- a/tests/pre_tool/sparc/test_adapter_compact_schema.py +++ b/tests/pre_tool/sparc/test_adapter_compact_schema.py @@ -17,6 +17,7 @@ from __future__ import annotations from typing import Any, Dict, List +from pydantic import ValidationError import pytest @@ -167,11 +168,11 @@ def test_config_override(self): assert cfg.compact_tool_threshold == 5 def test_invalid_mode_rejected(self): - with pytest.raises(Exception): # pydantic will raise ValidationError + with pytest.raises(ValidationError): # pydantic will raise ValidationError SPARCReflectionConfig(compact_tool_schema="garbage") def test_invalid_threshold_rejected(self): - with pytest.raises(Exception): # ge=1 constraint + with pytest.raises(ValidationError): # ge=1 constraint SPARCReflectionConfig(compact_tool_threshold=0) From d3b7075e7d1a3b4ea85b078bd0f4c1b03029a184 Mon Sep 17 00:00:00 2001 From: Osher-Elhadad Date: Wed, 6 May 2026 00:42:43 +0300 Subject: [PATCH 7/7] fix: remove unnecessary variable assignment in TestSafeParse Signed-off-by: Osher-Elhadad --- tests/core/test_validating_llm_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/test_validating_llm_client.py b/tests/core/test_validating_llm_client.py index de8ecd93..a0c7f1dd 100644 --- a/tests/core/test_validating_llm_client.py +++ b/tests/core/test_validating_llm_client.py @@ -272,7 +272,7 @@ def test_key_error_becomes_empty_string(self): assert c._parse_llm_response(KeyError("missing")) == "" def test_reasoning_budget_warning_logged(self, caplog): - c = _FakeValidating(client=object()) + _FakeValidating(client=object()) # A litellm-shaped response: reasoning_content set, finish_reason=length, # content missing — the classic reasoning-budget exhaustion pattern. raw = {