|
15 | 15 | from __future__ import annotations |
16 | 16 |
|
17 | 17 | import asyncio |
18 | | -import sys |
19 | 18 | from typing import Optional |
20 | 19 |
|
21 | 20 | from google.adk.agents.llm_agent import LlmAgent |
@@ -465,6 +464,39 @@ async def test_evaluate_single_inference_result( |
465 | 464 | assert metric_result.eval_status == EvalStatus.PASSED |
466 | 465 |
|
467 | 466 |
|
| 467 | +@pytest.mark.asyncio |
| 468 | +async def test_evaluate_single_inference_result_failed_without_inferences( |
| 469 | + eval_service, mock_eval_sets_manager, mocker |
| 470 | +): |
| 471 | + inference_result = InferenceResult( |
| 472 | + app_name="test_app", |
| 473 | + eval_set_id="test_eval_set", |
| 474 | + eval_case_id="case1", |
| 475 | + inferences=None, |
| 476 | + session_id="session1", |
| 477 | + status=InferenceStatus.FAILURE, |
| 478 | + error_message="auth failed", |
| 479 | + ) |
| 480 | + eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5) |
| 481 | + evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1) |
| 482 | + |
| 483 | + mock_eval_case = mocker.MagicMock(spec=EvalCase) |
| 484 | + mock_eval_case.conversation = [] |
| 485 | + mock_eval_case.conversation_scenario = None |
| 486 | + mock_eval_case.session_input = None |
| 487 | + mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case |
| 488 | + |
| 489 | + _, result = await eval_service._evaluate_single_inference_result( |
| 490 | + inference_result=inference_result, evaluate_config=evaluate_config |
| 491 | + ) |
| 492 | + |
| 493 | + assert result.eval_id == "case1" |
| 494 | + assert result.session_id == "session1" |
| 495 | + assert result.final_eval_status == EvalStatus.FAILED |
| 496 | + assert result.overall_eval_metric_results == [] |
| 497 | + assert result.eval_metric_result_per_invocation == [] |
| 498 | + |
| 499 | + |
468 | 500 | @pytest.mark.asyncio |
469 | 501 | async def test_evaluate_single_inference_result_for_conversation_scenario( |
470 | 502 | eval_service, mock_eval_sets_manager, mocker |
@@ -520,7 +552,7 @@ async def test_evaluate_single_inference_result_for_conversation_scenario( |
520 | 552 | for i in range(3): |
521 | 553 | invocation_result = result.eval_metric_result_per_invocation[i] |
522 | 554 | assert invocation_result.actual_invocation == inference_result.inferences[i] |
523 | | - assert invocation_result.expected_invocation == None |
| 555 | + assert invocation_result.expected_invocation is None |
524 | 556 | assert len(invocation_result.eval_metric_results) == 1 |
525 | 557 | metric_result = invocation_result.eval_metric_results[0] |
526 | 558 | assert metric_result.metric_name == "fake_single_sided_metric" |
|
0 commit comments