From dda367e65ff9bffacd6181b15228a81789c60b1d Mon Sep 17 00:00:00 2001
From: Jae <641403035@qq.com>
Date: Mon, 8 Jun 2026 16:13:42 +0800
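Subject: [PATCH] fix(eval): allow invocation-level rubrics

Rubric-based evaluators asserted in __init__ that the criterion carried
rubrics, which rejected eval cases whose rubrics are defined only on the
invocation. Drop that assertion and instead raise
ValueError("Rubrics are required.") from get_effective_rubrics_list()
when the effective rubrics list is empty.

The final-response and tool-use prompt formatters now read rubrics via
get_effective_rubrics_list(), and pretty_print_eval_result() tolerates a
criterion that has no rubrics.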

---
 src/google/adk/cli/cli_eval.py                |  4 +-
 .../adk/evaluation/rubric_based_evaluator.py  |  6 +-
 .../rubric_based_final_response_quality_v1.py |  2 +-
 .../rubric_based_tool_use_quality_v1.py       |  2 +-
 .../cli/utils/test_cli_eval_pretty_print.py   | 72 +++++++++++++++++++
 .../evaluation/test_rubric_based_evaluator.py | 20 ++++++
 .../test_rubric_based_tool_use_quality_v1.py  | 61 ++++++++++++++++
 7 files changed, 160 insertions(+), 7 deletions(-)
 create mode 100644 tests/unittests/cli/utils/test_cli_eval_pretty_print.py

diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
index 33c1693208..5a023864e8 100644
--- a/src/google/adk/cli/cli_eval.py
+++ b/src/google/adk/cli/cli_eval.py
@@ -214,7 +214,7 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
       click.echo("Rubric Scores:")
       rubrics_by_id = {
           r["rubric_id"]: r["rubric_content"]["text_property"]
-          for r in metric_result.criterion.rubrics
+          for r in getattr(metric_result.criterion, "rubrics", None) or []
       }
       for rubric_score in metric_result.details.rubric_scores:
         rubric_text = rubrics_by_id.get(rubric_score.rubric_id)
@@ -257,7 +257,7 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
       if metric_result.details and metric_result.details.rubric_scores:
         rubrics_by_id = {
             r["rubric_id"]: r["rubric_content"]["text_property"]
-            for r in metric_result.criterion.rubrics
+            for r in getattr(metric_result.criterion, "rubrics", None) or []
         }
         for rubric_score in metric_result.details.rubric_scores:
           rubric = rubrics_by_id.get(rubric_score.rubric_id)
diff --git a/src/google/adk/evaluation/rubric_based_evaluator.py b/src/google/adk/evaluation/rubric_based_evaluator.py
index 451a14f1a5..533540400f 100644
--- a/src/google/adk/evaluation/rubric_based_evaluator.py
+++ b/src/google/adk/evaluation/rubric_based_evaluator.py
@@ -329,9 +329,7 @@ def __init__(
     self._per_invocation_results_aggregator = per_invocation_results_aggregator
     self._invocation_results_summarizer = invocation_results_summarizer
 
-    assert self._criterion.rubrics, "Rubrics are required."
-
-    self._rubrics: list[Rubric] = self._criterion.rubrics
+    self._rubrics: list[Rubric] = self._criterion.rubrics or []
     self._effective_rubrics_list: Optional[list[Rubric]] = None
 
     self._normalized_rubric_to_id_map = {
@@ -373,6 +371,8 @@ def get_effective_rubrics_list(self) -> list[Rubric]:
           "Effective rubrics list not initialized. Call"
           " create_effective_rubrics_list() first."
       )
+    if not self._effective_rubrics_list:
+      raise ValueError("Rubrics are required.")
     return self._effective_rubrics_list
 
   @override
diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index 135b2b9593..2a780503df 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -289,7 +289,7 @@ def format_auto_rater_prompt(
 
     rubrics_text = "\n".join([
         f"*  {r.rubric_content.text_property}"
-        for r in self._effective_rubrics_list
+        for r in self.get_effective_rubrics_list()
     ])
 
     developer_instructions = ""
diff --git a/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py b/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py
index d8d1da94c5..a21be7636f 100644
--- a/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py
@@ -179,7 +179,7 @@ def format_auto_rater_prompt(
 
     rubrics_text = "\n".join([
         f"*  {r.rubric_content.text_property}"
-        for r in self._effective_rubrics_list
+        for r in self.get_effective_rubrics_list()
     ])
 
     app_details = actual_invocation.app_details
diff --git a/tests/unittests/cli/utils/test_cli_eval_pretty_print.py b/tests/unittests/cli/utils/test_cli_eval_pretty_print.py
new file mode 100644
index 0000000000..8ca199f13f
--- /dev/null
+++ b/tests/unittests/cli/utils/test_cli_eval_pretty_print.py
@@ -0,0 +1,72 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from google.adk.cli.cli_eval import pretty_print_eval_result
+from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_metrics import EvalMetricResult
+from google.adk.evaluation.eval_metrics import EvalMetricResultDetails
+from google.adk.evaluation.eval_metrics import EvalMetricResultPerInvocation
+from google.adk.evaluation.eval_metrics import PrebuiltMetrics
+from google.adk.evaluation.eval_metrics import RubricsBasedCriterion
+from google.adk.evaluation.eval_result import EvalCaseResult
+from google.adk.evaluation.eval_rubrics import RubricScore
+from google.adk.evaluation.evaluator import EvalStatus
+from google.genai import types as genai_types
+
+
+def test_pretty_print_eval_result_with_empty_criterion_rubrics(capsys):
+  """Tests pretty printing falls back to rubric id when criterion rubrics are empty."""
+  criterion = RubricsBasedCriterion(threshold=0.5)  # built without rubrics
+  metric_result = EvalMetricResult(
+      metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value,
+      threshold=0.5,
+      criterion=criterion,
+      score=1.0,
+      eval_status=EvalStatus.PASSED,
+      details=EvalMetricResultDetails(
+          rubric_scores=[
+              RubricScore(
+                  rubric_id="invocation-rubric",  # not passed to criterion
+                  score=1.0,
+                  rationale="The correct tool was used.",
+              )
+          ]
+      ),
+  )
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="User input here.")]
+      )
+  )
+  eval_result = EvalCaseResult(
+      eval_set_id="eval-set",
+      eval_id="eval-id",
+      final_eval_status=EvalStatus.PASSED,
+      overall_eval_metric_results=[metric_result],
+      eval_metric_result_per_invocation=[
+          EvalMetricResultPerInvocation(
+              actual_invocation=invocation,
+              eval_metric_results=[metric_result],  # same result reused
+          )
+      ],
+      session_id="session-id",
+  )
+
+  pretty_print_eval_result(eval_result)  # output is read back via capsys
+
+  captured = capsys.readouterr()
+  assert "Rubric: invocation-rubric" in captured.out
+  assert "The correct tool was used." in captured.out
diff --git a/tests/unittests/evaluation/test_rubric_based_evaluator.py b/tests/unittests/evaluation/test_rubric_based_evaluator.py
index 87a10cbc82..5ebaffcc19 100644
--- a/tests/unittests/evaluation/test_rubric_based_evaluator.py
+++ b/tests/unittests/evaluation/test_rubric_based_evaluator.py
@@ -599,6 +599,26 @@ def test_create_effective_rubrics_list_with_no_invocation_rubrics(
     assert len(effective_rubrics) == 2
     assert {r.rubric_id for r in effective_rubrics} == {"1", "2"}
 
+  def test_get_effective_rubrics_list_with_no_rubrics_raises_error(self):
+    judge_model_options = JudgeModelOptions(
+        judge_model_config=None,
+        num_samples=3,
+    )
+    criterion = RubricsBasedCriterion(  # deliberately built without rubrics
+        threshold=0.5, judge_model_options=judge_model_options
+    )
+    metric = EvalMetric(
+        metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
+        threshold=0.5,
+        criterion=criterion,
+    )
+    evaluator = FakeRubricBasedEvaluator(metric)
+
+    evaluator.create_effective_rubrics_list(None)  # no invocation rubrics
+
+    with pytest.raises(ValueError, match=r"Rubrics are required\."):
+      evaluator.get_effective_rubrics_list()  # match= is a regex; "." escaped
+
   def test_get_effective_rubrics_list_before_creation_raises_error(
       self, evaluator: RubricBasedEvaluator
   ):
diff --git a/tests/unittests/evaluation/test_rubric_based_tool_use_quality_v1.py b/tests/unittests/evaluation/test_rubric_based_tool_use_quality_v1.py
index 3aaa897d58..64d5ff7a31 100644
--- a/tests/unittests/evaluation/test_rubric_based_tool_use_quality_v1.py
+++ b/tests/unittests/evaluation/test_rubric_based_tool_use_quality_v1.py
@@ -79,6 +79,67 @@ def test_format_auto_rater_prompt_with_basic_invocation(
   assert "<response>\nNo intermediate steps were taken.\n</response>" in prompt
 
 
+def test_format_auto_rater_prompt_with_invocation_rubrics_only():
+  """Tests prompt formatting when rubrics are defined on the invocation."""
+  judge_model_options = JudgeModelOptions(
+      judge_model_config=None,
+      num_samples=3,
+  )
+  criterion = RubricsBasedCriterion(  # no rubrics passed to the criterion
+      threshold=0.5, judge_model_options=judge_model_options
+  )
+  metric = EvalMetric(
+      metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value,
+      threshold=0.5,
+      criterion=criterion,
+  )
+  evaluator = RubricBasedToolUseV1Evaluator(metric)
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="User input here.")]
+      ),
+      rubrics=[  # the only rubric in this test is set on the invocation
+          Rubric(
+              rubric_id="invocation-rubric",
+              rubric_content=RubricContent(
+                  text_property="Did the agent use the lookup tool?"
+              ),
+              type=RubricBasedToolUseV1Evaluator.RUBRIC_TYPE,
+          )
+      ],
+  )
+
+  prompt = evaluator.format_auto_rater_prompt(invocation, None)
+
+  assert "User input here." in prompt  # user text appears in the prompt
+  assert "Did the agent use the lookup tool?" in prompt  # rubric text too
+
+
+def test_format_auto_rater_prompt_without_effective_rubrics_raises_error():
+  """Tests prompt formatting fails when no criterion or invocation rubrics exist."""
+  judge_model_options = JudgeModelOptions(
+      judge_model_config=None,
+      num_samples=3,
+  )
+  criterion = RubricsBasedCriterion(  # no rubrics passed to the criterion
+      threshold=0.5, judge_model_options=judge_model_options
+  )
+  metric = EvalMetric(
+      metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value,
+      threshold=0.5,
+      criterion=criterion,
+  )
+  evaluator = RubricBasedToolUseV1Evaluator(metric)
+  invocation = Invocation(  # no rubrics passed to the invocation either
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="User input here.")]
+      ),
+  )
+
+  with pytest.raises(ValueError, match=r"Rubrics are required\."):
+    evaluator.format_auto_rater_prompt(invocation, None)  # regex "." escaped
+
+
 def test_format_auto_rater_prompt_with_app_details(
     evaluator: RubricBasedToolUseV1Evaluator,
 ):