Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/google/adk/cli/cli_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
click.echo("Rubric Scores:")
rubrics_by_id = {
r["rubric_id"]: r["rubric_content"]["text_property"]
for r in metric_result.criterion.rubrics
for r in getattr(metric_result.criterion, "rubrics", None) or []
}
for rubric_score in metric_result.details.rubric_scores:
rubric_text = rubrics_by_id.get(rubric_score.rubric_id)
Expand Down Expand Up @@ -257,7 +257,7 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
if metric_result.details and metric_result.details.rubric_scores:
rubrics_by_id = {
r["rubric_id"]: r["rubric_content"]["text_property"]
for r in metric_result.criterion.rubrics
for r in getattr(metric_result.criterion, "rubrics", None) or []
}
for rubric_score in metric_result.details.rubric_scores:
rubric = rubrics_by_id.get(rubric_score.rubric_id)
Expand Down
6 changes: 3 additions & 3 deletions src/google/adk/evaluation/rubric_based_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,9 +329,7 @@ def __init__(
self._per_invocation_results_aggregator = per_invocation_results_aggregator
self._invocation_results_summarizer = invocation_results_summarizer

assert self._criterion.rubrics, "Rubrics are required."

self._rubrics: list[Rubric] = self._criterion.rubrics
self._rubrics: list[Rubric] = self._criterion.rubrics or []
self._effective_rubrics_list: Optional[list[Rubric]] = None

self._normalized_rubric_to_id_map = {
Expand Down Expand Up @@ -373,6 +371,8 @@ def get_effective_rubrics_list(self) -> list[Rubric]:
"Effective rubrics list not initialized. Call"
" create_effective_rubrics_list() first."
)
if not self._effective_rubrics_list:
raise ValueError("Rubrics are required.")
return self._effective_rubrics_list

@override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def format_auto_rater_prompt(

rubrics_text = "\n".join([
f"* {r.rubric_content.text_property}"
for r in self._effective_rubrics_list
for r in self.get_effective_rubrics_list()
])

developer_instructions = ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def format_auto_rater_prompt(

rubrics_text = "\n".join([
f"* {r.rubric_content.text_property}"
for r in self._effective_rubrics_list
for r in self.get_effective_rubrics_list()
])

app_details = actual_invocation.app_details
Expand Down
72 changes: 72 additions & 0 deletions tests/unittests/cli/utils/test_cli_eval_pretty_print.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from google.adk.cli.cli_eval import pretty_print_eval_result
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetricResult
from google.adk.evaluation.eval_metrics import EvalMetricResultDetails
from google.adk.evaluation.eval_metrics import EvalMetricResultPerInvocation
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.eval_metrics import RubricsBasedCriterion
from google.adk.evaluation.eval_result import EvalCaseResult
from google.adk.evaluation.eval_rubrics import RubricScore
from google.adk.evaluation.evaluator import EvalStatus
from google.genai import types as genai_types


def test_pretty_print_eval_result_with_empty_criterion_rubrics(capsys):
  """Verifies rubric scores still print when the criterion has no rubrics.

  The criterion is built with only a threshold, so the printed output is
  expected to identify the score by its rubric id and include the rationale.
  """
  rubric_score = RubricScore(
      rubric_id="invocation-rubric",
      score=1.0,
      rationale="The correct tool was used.",
  )
  metric_result = EvalMetricResult(
      metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value,
      threshold=0.5,
      criterion=RubricsBasedCriterion(threshold=0.5),
      score=1.0,
      eval_status=EvalStatus.PASSED,
      details=EvalMetricResultDetails(rubric_scores=[rubric_score]),
  )
  user_content = genai_types.Content(
      parts=[genai_types.Part(text="User input here.")]
  )
  # The same metric result is reused for the overall and per-invocation views.
  per_invocation_result = EvalMetricResultPerInvocation(
      actual_invocation=Invocation(user_content=user_content),
      eval_metric_results=[metric_result],
  )
  eval_result = EvalCaseResult(
      eval_set_id="eval-set",
      eval_id="eval-id",
      final_eval_status=EvalStatus.PASSED,
      overall_eval_metric_results=[metric_result],
      eval_metric_result_per_invocation=[per_invocation_result],
      session_id="session-id",
  )

  pretty_print_eval_result(eval_result)

  printed = capsys.readouterr().out
  for expected in ("Rubric: invocation-rubric", "The correct tool was used."):
    assert expected in printed
20 changes: 20 additions & 0 deletions tests/unittests/evaluation/test_rubric_based_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,26 @@ def test_create_effective_rubrics_list_with_no_invocation_rubrics(
assert len(effective_rubrics) == 2
assert {r.rubric_id for r in effective_rubrics} == {"1", "2"}

def test_get_effective_rubrics_list_with_no_rubrics_raises_error(self):
  """Checks that fetching an empty effective rubrics list raises ValueError.

  The criterion is built without rubrics and the effective list is created
  with `None` for the invocation, so no rubrics are supplied from either
  side before `get_effective_rubrics_list()` is called.
  """
  criterion = RubricsBasedCriterion(
      threshold=0.5,
      judge_model_options=JudgeModelOptions(
          judge_model_config=None,
          num_samples=3,
      ),
  )
  metric = EvalMetric(
      metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
      threshold=0.5,
      criterion=criterion,
  )
  evaluator = FakeRubricBasedEvaluator(metric)

  evaluator.create_effective_rubrics_list(None)

  # `match` is a regular expression; escape the period so that it only
  # matches a literal "." instead of any character.
  with pytest.raises(ValueError, match=r"Rubrics are required\."):
    evaluator.get_effective_rubrics_list()

def test_get_effective_rubrics_list_before_creation_raises_error(
self, evaluator: RubricBasedEvaluator
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,67 @@ def test_format_auto_rater_prompt_with_basic_invocation(
assert "<response>\nNo intermediate steps were taken.\n</response>" in prompt


def test_format_auto_rater_prompt_with_invocation_rubrics_only():
  """Checks the prompt includes rubrics that exist only on the invocation.

  The criterion is built without rubrics; the single rubric is attached to
  the invocation, and its text is expected to show up in the prompt.
  """
  criterion = RubricsBasedCriterion(
      threshold=0.5,
      judge_model_options=JudgeModelOptions(
          judge_model_config=None,
          num_samples=3,
      ),
  )
  evaluator = RubricBasedToolUseV1Evaluator(
      EvalMetric(
          metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value,
          threshold=0.5,
          criterion=criterion,
      )
  )
  invocation_rubric = Rubric(
      rubric_id="invocation-rubric",
      rubric_content=RubricContent(
          text_property="Did the agent use the lookup tool?"
      ),
      type=RubricBasedToolUseV1Evaluator.RUBRIC_TYPE,
  )
  user_content = genai_types.Content(
      parts=[genai_types.Part(text="User input here.")]
  )
  invocation = Invocation(
      user_content=user_content,
      rubrics=[invocation_rubric],
  )

  prompt = evaluator.format_auto_rater_prompt(invocation, None)

  for expected in ("User input here.", "Did the agent use the lookup tool?"):
    assert expected in prompt


def test_format_auto_rater_prompt_without_effective_rubrics_raises_error():
  """Checks prompt formatting raises ValueError when no rubrics are given.

  Neither the criterion nor the invocation is constructed with rubrics, so
  `format_auto_rater_prompt()` is expected to fail.
  """
  criterion = RubricsBasedCriterion(
      threshold=0.5,
      judge_model_options=JudgeModelOptions(
          judge_model_config=None,
          num_samples=3,
      ),
  )
  metric = EvalMetric(
      metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value,
      threshold=0.5,
      criterion=criterion,
  )
  evaluator = RubricBasedToolUseV1Evaluator(metric)
  invocation = Invocation(
      user_content=genai_types.Content(
          parts=[genai_types.Part(text="User input here.")]
      ),
  )

  # `match` is a regular expression; escape the period so that it only
  # matches a literal "." instead of any character.
  with pytest.raises(ValueError, match=r"Rubrics are required\."):
    evaluator.format_auto_rater_prompt(invocation, None)


def test_format_auto_rater_prompt_with_app_details(
evaluator: RubricBasedToolUseV1Evaluator,
):
Expand Down