From dda367e65ff9bffacd6181b15228a81789c60b1d Mon Sep 17 00:00:00 2001 From: Jae <641403035@qq.com> Date: Mon, 8 Jun 2026 16:13:42 +0800 Subject: [PATCH] fix(eval): allow invocation-level rubrics --- src/google/adk/cli/cli_eval.py | 4 +- .../adk/evaluation/rubric_based_evaluator.py | 6 +- .../rubric_based_final_response_quality_v1.py | 2 +- .../rubric_based_tool_use_quality_v1.py | 2 +- .../cli/utils/test_cli_eval_pretty_print.py | 72 +++++++++++++++++++ .../evaluation/test_rubric_based_evaluator.py | 20 ++++++ .../test_rubric_based_tool_use_quality_v1.py | 61 ++++++++++++++++ 7 files changed, 160 insertions(+), 7 deletions(-) create mode 100644 tests/unittests/cli/utils/test_cli_eval_pretty_print.py diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py index 33c1693208..5a023864e8 100644 --- a/src/google/adk/cli/cli_eval.py +++ b/src/google/adk/cli/cli_eval.py @@ -214,7 +214,7 @@ def pretty_print_eval_result(eval_result: EvalCaseResult): click.echo("Rubric Scores:") rubrics_by_id = { r["rubric_id"]: r["rubric_content"]["text_property"] - for r in metric_result.criterion.rubrics + for r in getattr(metric_result.criterion, "rubrics", None) or [] } for rubric_score in metric_result.details.rubric_scores: rubric_text = rubrics_by_id.get(rubric_score.rubric_id) @@ -257,7 +257,7 @@ def pretty_print_eval_result(eval_result: EvalCaseResult): if metric_result.details and metric_result.details.rubric_scores: rubrics_by_id = { r["rubric_id"]: r["rubric_content"]["text_property"] - for r in metric_result.criterion.rubrics + for r in getattr(metric_result.criterion, "rubrics", None) or [] } for rubric_score in metric_result.details.rubric_scores: rubric = rubrics_by_id.get(rubric_score.rubric_id) diff --git a/src/google/adk/evaluation/rubric_based_evaluator.py b/src/google/adk/evaluation/rubric_based_evaluator.py index 451a14f1a5..533540400f 100644 --- a/src/google/adk/evaluation/rubric_based_evaluator.py +++ b/src/google/adk/evaluation/rubric_based_evaluator.py @@ -329,9 +329,7 @@ def __init__( self._per_invocation_results_aggregator = per_invocation_results_aggregator self._invocation_results_summarizer = invocation_results_summarizer - assert self._criterion.rubrics, "Rubrics are required." - - self._rubrics: list[Rubric] = self._criterion.rubrics + self._rubrics: list[Rubric] = self._criterion.rubrics or [] self._effective_rubrics_list: Optional[list[Rubric]] = None self._normalized_rubric_to_id_map = { @@ -373,6 +371,8 @@ def get_effective_rubrics_list(self) -> list[Rubric]: "Effective rubrics list not initialized. Call" " create_effective_rubrics_list() first." ) + if not self._effective_rubrics_list: + raise ValueError("Rubrics are required.") return self._effective_rubrics_list @override diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py index 135b2b9593..2a780503df 100644 --- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py +++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py @@ -289,7 +289,7 @@ def format_auto_rater_prompt( rubrics_text = "\n".join([ f"* {r.rubric_content.text_property}" - for r in self._effective_rubrics_list + for r in self.get_effective_rubrics_list() ]) developer_instructions = "" diff --git a/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py b/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py index d8d1da94c5..a21be7636f 100644 --- a/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py +++ b/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py @@ -179,7 +179,7 @@ def format_auto_rater_prompt( rubrics_text = "\n".join([ f"* {r.rubric_content.text_property}" - for r in self._effective_rubrics_list + for r in self.get_effective_rubrics_list() ]) app_details = actual_invocation.app_details diff --git a/tests/unittests/cli/utils/test_cli_eval_pretty_print.py b/tests/unittests/cli/utils/test_cli_eval_pretty_print.py new file mode 100644 index 0000000000..8ca199f13f --- /dev/null +++ b/tests/unittests/cli/utils/test_cli_eval_pretty_print.py @@ -0,0 +1,72 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from google.adk.cli.cli_eval import pretty_print_eval_result +from google.adk.evaluation.eval_case import Invocation +from google.adk.evaluation.eval_metrics import EvalMetricResult +from google.adk.evaluation.eval_metrics import EvalMetricResultDetails +from google.adk.evaluation.eval_metrics import EvalMetricResultPerInvocation +from google.adk.evaluation.eval_metrics import PrebuiltMetrics +from google.adk.evaluation.eval_metrics import RubricsBasedCriterion +from google.adk.evaluation.eval_result import EvalCaseResult +from google.adk.evaluation.eval_rubrics import RubricScore +from google.adk.evaluation.evaluator import EvalStatus +from google.genai import types as genai_types + + +def test_pretty_print_eval_result_with_empty_criterion_rubrics(capsys): + """Tests pretty printing falls back to rubric id when criterion rubrics are empty.""" + criterion = RubricsBasedCriterion(threshold=0.5) + metric_result = EvalMetricResult( + metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value, + threshold=0.5, + criterion=criterion, + score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails( + rubric_scores=[ + RubricScore( + rubric_id="invocation-rubric", + score=1.0, + rationale="The correct tool was used.", + ) + ] + ), + ) + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="User input here.")] + ) + ) + eval_result = EvalCaseResult( + eval_set_id="eval-set", + eval_id="eval-id", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[metric_result], + eval_metric_result_per_invocation=[ + EvalMetricResultPerInvocation( + actual_invocation=invocation, + eval_metric_results=[metric_result], + ) + ], + session_id="session-id", + ) + + pretty_print_eval_result(eval_result) + + captured = capsys.readouterr() + assert "Rubric: invocation-rubric" in captured.out + assert "The correct tool was used." in captured.out diff --git a/tests/unittests/evaluation/test_rubric_based_evaluator.py b/tests/unittests/evaluation/test_rubric_based_evaluator.py index 87a10cbc82..5ebaffcc19 100644 --- a/tests/unittests/evaluation/test_rubric_based_evaluator.py +++ b/tests/unittests/evaluation/test_rubric_based_evaluator.py @@ -599,6 +599,26 @@ def test_create_effective_rubrics_list_with_no_invocation_rubrics( assert len(effective_rubrics) == 2 assert {r.rubric_id for r in effective_rubrics} == {"1", "2"} + def test_get_effective_rubrics_list_with_no_rubrics_raises_error(self): + judge_model_options = JudgeModelOptions( + judge_model_config=None, + num_samples=3, + ) + criterion = RubricsBasedCriterion( + threshold=0.5, judge_model_options=judge_model_options + ) + metric = EvalMetric( + metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value, + threshold=0.5, + criterion=criterion, + ) + evaluator = FakeRubricBasedEvaluator(metric) + + evaluator.create_effective_rubrics_list(None) + + with pytest.raises(ValueError, match="Rubrics are required."): + evaluator.get_effective_rubrics_list() + def test_get_effective_rubrics_list_before_creation_raises_error( self, evaluator: RubricBasedEvaluator ): diff --git a/tests/unittests/evaluation/test_rubric_based_tool_use_quality_v1.py b/tests/unittests/evaluation/test_rubric_based_tool_use_quality_v1.py index 3aaa897d58..64d5ff7a31 100644 --- a/tests/unittests/evaluation/test_rubric_based_tool_use_quality_v1.py +++ b/tests/unittests/evaluation/test_rubric_based_tool_use_quality_v1.py @@ -79,6 +79,67 @@ def test_format_auto_rater_prompt_with_basic_invocation( assert "\nNo intermediate steps were taken.\n" in prompt +def test_format_auto_rater_prompt_with_invocation_rubrics_only(): + """Tests prompt formatting when rubrics are defined on the invocation.""" + judge_model_options = JudgeModelOptions( + judge_model_config=None, + num_samples=3, + ) + criterion = RubricsBasedCriterion( + threshold=0.5, judge_model_options=judge_model_options + ) + metric = EvalMetric( + metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value, + threshold=0.5, + criterion=criterion, + ) + evaluator = RubricBasedToolUseV1Evaluator(metric) + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="User input here.")] + ), + rubrics=[ + Rubric( + rubric_id="invocation-rubric", + rubric_content=RubricContent( + text_property="Did the agent use the lookup tool?" + ), + type=RubricBasedToolUseV1Evaluator.RUBRIC_TYPE, + ) + ], + ) + + prompt = evaluator.format_auto_rater_prompt(invocation, None) + + assert "User input here." in prompt + assert "Did the agent use the lookup tool?" in prompt + + +def test_format_auto_rater_prompt_without_effective_rubrics_raises_error(): + """Tests prompt formatting fails when no criterion or invocation rubrics exist.""" + judge_model_options = JudgeModelOptions( + judge_model_config=None, + num_samples=3, + ) + criterion = RubricsBasedCriterion( + threshold=0.5, judge_model_options=judge_model_options + ) + metric = EvalMetric( + metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value, + threshold=0.5, + criterion=criterion, + ) + evaluator = RubricBasedToolUseV1Evaluator(metric) + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="User input here.")] + ), + ) + + with pytest.raises(ValueError, match="Rubrics are required."): + evaluator.format_auto_rater_prompt(invocation, None) + + def test_format_auto_rater_prompt_with_app_details( evaluator: RubricBasedToolUseV1Evaluator, ):