From 07ab964d96dc58f74859abd1e5a6add97a1ceb26 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Tue, 27 Jan 2026 22:23:20 +0530 Subject: [PATCH 1/5] Evaluation: add export_format query param with grouped traces support --- .../app/api/docs/evaluation/get_evaluation.md | 30 +++- .../app/api/routes/evaluations/evaluation.py | 21 +++ backend/app/crud/evaluations/core.py | 55 +++++++ .../app/tests/api/routes/test_evaluation.py | 141 ++++++++++++++++++ 4 files changed, 246 insertions(+), 1 deletion(-) diff --git a/backend/app/api/docs/evaluation/get_evaluation.md b/backend/app/api/docs/evaluation/get_evaluation.md index 1e3186d2c..4aea35a9d 100644 --- a/backend/app/api/docs/evaluation/get_evaluation.md +++ b/backend/app/api/docs/evaluation/get_evaluation.md @@ -5,8 +5,9 @@ Returns comprehensive evaluation information including processing status, config **Query Parameters:** * `get_trace_info` (optional, default: false) - Include Langfuse trace scores with Q&A context. Data is fetched from Langfuse on first request and cached for subsequent calls. Only available for completed evaluations. * `resync_score` (optional, default: false) - Clear cached scores and re-fetch from Langfuse. Useful when evaluators have been updated. Requires `get_trace_info=true`. +* `export_format` (optional, default: row) - Controls the structure of traces in the response. Requires `get_trace_info=true` when set to "grouped". enum=['row','grouped'] -**Score Format** (`get_trace_info=true`): +**Score Format** (`get_trace_info=true`,`export_format=row`): ```json { @@ -49,6 +50,33 @@ Returns comprehensive evaluation information including processing status, config } ``` +**Score Format** (`get_trace_info=true`,`export_format=grouped`): +```json +{ + "summary_scores": [...], + "traces": [...], + "grouped_traces": [ + { + "question_id": 1, + "question": "What is Python?", + "ground_truth_answer": "Python is a high-level programming language.", + "llm_answers": [ + "Answer from evaluation run 1...", + "Answer from evaluation run 2..." + ], + "trace_ids": [ + "uuid-123", + "uuid-456" + ], + "scores": [ + [{"name": "cosine_similarity", "value": 0.82, "data_type": "NUMERIC"}], + [{"name": "cosine_similarity", "value": 0.75, "data_type": "NUMERIC"}] + ] + } + ] +} +``` + **Score Details:** * NUMERIC scores include average (`avg`) and standard deviation (`std`) in summary * CATEGORICAL scores include distribution counts in summary diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index d40a88a1a..a14b7479c 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -123,6 +123,14 @@ def get_evaluation_run_status( "Requires get_trace_info=true." ), ), + export_format: str = Query( + "row", + description=( + "Controls the Traces structure." + "'grouped' collates repeated questions horizontally using Parent Question ID." 
+ ), + enum=["row", "grouped"] + ) ) -> APIResponse[EvaluationRunPublic]: """Get evaluation run status with optional trace info.""" if resync_score and not get_trace_info: @@ -130,6 +138,11 @@ def get_evaluation_run_status( status_code=400, detail="resync_score=true requires get_trace_info=true", ) + if export_format == "grouped" and not get_trace_info: + raise HTTPException( + status_code = 400, + detail="export_format=grouped requires get_trace_info=true" + ) eval_run, error = get_evaluation_with_scores( session=_session, @@ -148,6 +161,14 @@ def get_evaluation_run_status( "to this organization" ), ) + # Formatter = grouped + if export_format == "grouped" and eval_run.score and "traces" in eval_run.score: + from app.crud.evaluations.core import group_traces_by_question_id + try: + grouped_traces = group_traces_by_question_id(eval_run.score["traces"]) + eval_run.score["traces"] = grouped_traces + except ValueError as e: + return APIResponse.failure_response(error=str(e), data=eval_run) if error: return APIResponse.failure_response(error=error, data=eval_run) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 33b6777f3..53fdab920 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -1,4 +1,5 @@ import logging +from typing import Any from langfuse import Langfuse from sqlmodel import Session, select @@ -311,3 +312,57 @@ def save_score( f"traces={len(score.get('traces', []))}" ) return eval_run + +def group_traces_by_question_id( + traces: list[dict[str, Any]], +) -> list[dict[str, Any]]: + """ + Docstring for group_traces_by_question_id + + :param traces: Description + :type traces: list[dict[str, Any]] + :return: Description + :rtype: list[dict[str, Any]] + + Returns: + List of grouped traces sorted by question_id: + [ + { + "question_id": 1, + "question": "What is Python?", + "ground_truth_answer": "...", + "llm_answers": ["Answer 1", "Answer 2"], + "trace_ids": ["trace-1", "trace-2"], + "scores": [[...], [...]] + } + ] + """ + + # weather question_id exists in the traces + if traces and (traces[0].get("question_id") is None or traces[0].get("question_id") == ""): + raise ValueError( + "Grouped export format is not available for this evaluation.") + + groups: dict[int, list[dict[str, Any]]] = {} + + for trace in traces: + question_id = trace.get("question_id") + if question_id not in groups: + groups[question_id] = [] + groups[question_id].append(trace) + + result: list[dict[str, Any]] = [] + for question_id in sorted(groups.keys()): + group_traces = groups[question_id] + first = group_traces[0] + result.append({ + "question_id": question_id, + "question": first.get("question", ""), + "ground_truth_answer": first.get("ground_truth_answer", ""), + "llm_answers": [t.get("llm_answer", "") for t in group_traces], + "trace_ids": [t.get("trace_id", "") for t in group_traces], + "scores": [t.get("scores", []) for t in group_traces], + }) + + logger.info(f"[group_traces_by_question_id] Created {len(result)} groups") + return result \ No newline at end of file diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 813fd483f..26db5a144 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -918,6 +918,147 @@ def test_get_evaluation_run_resync_without_trace_info_fails( and "get_trace_info" in error_str.lower() ) + def test_get_evaluation_run_grouped_format_without_trace_info_fails( + self, + 
client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + create_test_dataset: EvaluationDataset, + ) -> None: + eval_run = EvaluationRun( + run_name="test_run", + dataset_name=create_test_dataset.name, + dataset_id=create_test_dataset.id, + config={"model": "gpt-4o"}, + status="completed", + total_items=3, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + response = client.get( + f"/api/v1/evaluations/{eval_run.id}", + params={"export_format": "grouped"}, # Missing get_trace_info=true + headers=user_api_key_header, + ) + + assert response.status_code == 400 + response_data = response.json() + error_str = response_data.get( + "detail", response_data.get("error", str(response_data)) + ) + assert ( + "export_format" in error_str.lower() + and "get_trace_info" in error_str.lower() + ) + + def test_get_evaluation_run_grouped_format_success( + self, + client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + create_test_dataset: EvaluationDataset, + ) -> None: + eval_run = EvaluationRun( + run_name="test_run", + dataset_name=create_test_dataset.name, + dataset_id=create_test_dataset.id, + config={"model": "gpt-4o"}, + status="completed", + total_items=4, + score={ + "traces": [ + { + "trace_id": "trace-1a", + "question_id": 1, + "question": "What is Python?", + "ground_truth_answer": "A programming language", + "llm_answer": "Python is a high-level programming language", + "scores": [ + { + "name": "cosine_similarity", + "value": 0.82, + "data_type": "NUMERIC", + } + ], + }, + { + "trace_id": "trace-1b", + "question_id": 1, + "question": "What is Python?", + "ground_truth_answer": "A programming language", + "llm_answer": "Python is an interpreted language", + "scores": [ + { + "name": "cosine_similarity", + "value": 0.75, + "data_type": "NUMERIC", + } + ], + }, + # Row format - 1 trace for question_id=2 + { + "trace_id": "trace-2a", + "question_id": 2, + "question": "What is Java?", + "ground_truth_answer": "An OOP language", + "llm_answer": "Java is a statically typed language", + "scores": [ + { + "name": "cosine_similarity", + "value": 0.80, + "data_type": "NUMERIC", + } + ], + }, + ], + "summary_scores": [ + { + "avg": 0.79, + "std": 0.03, + "name": "cosine_similarity", + "data_type": "NUMERIC", + "total_pairs": 3, + } + ], + }, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + response = client.get( + f"/api/v1/evaluations/{eval_run.id}", + params={ + "export_format": "grouped", + "get_trace_info": True, + }, # Missing get_trace_info=true + headers=user_api_key_header, + ) + + assert response.status_code == 200 + response_data = response.json() + assert response_data["success"] is True + data = response_data["data"] + assert data["id"] == eval_run.id + assert data["status"] == "completed" + + traces = data["score"]["traces"] + assert ( + isinstance(traces, list) + and len(traces) > 0 + and "llm_answers" in traces[0] + and isinstance(traces[0]["llm_answers"], list) + and "trace_ids" in traces[0] + and isinstance(traces[0]["trace_ids"], list) + ) class TestGetDataset: """Test GET /evaluations/datasets/{dataset_id} endpoint.""" From eca2181ba8c2fb50d5f10d14793d98b7ac6e1437 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Wed, 28 Jan 2026 
13:57:22 +0530 Subject: [PATCH 2/5] Fix formatting and update docstring for group_traces_by_question_id --- .../app/api/routes/evaluations/evaluation.py | 8 ++-- backend/app/crud/evaluations/core.py | 45 +++++++++++-------- .../app/tests/api/routes/test_evaluation.py | 1 + 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index c2b6eedbf..55db7514c 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -127,8 +127,8 @@ def get_evaluation_run_status( "Controls the Traces structure." "'grouped' collates repeated questions horizontally using Parent Question ID." ), - enum=["row", "grouped"] - ) + enum=["row", "grouped"], + ), ) -> APIResponse[EvaluationRunPublic]: """Get evaluation run status with optional trace info.""" if resync_score and not get_trace_info: @@ -138,8 +138,7 @@ def get_evaluation_run_status( ) if export_format == "grouped" and not get_trace_info: raise HTTPException( - status_code = 400, - detail="export_format=grouped requires get_trace_info=true" + status_code=400, detail="export_format=grouped requires get_trace_info=true" ) eval_run, error = get_evaluation_with_scores( @@ -162,6 +161,7 @@ def get_evaluation_run_status( # Formatter = grouped if export_format == "grouped" and eval_run.score and "traces" in eval_run.score: from app.crud.evaluations.core import group_traces_by_question_id + try: grouped_traces = group_traces_by_question_id(eval_run.score["traces"]) eval_run.score["traces"] = grouped_traces diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index afd747597..94858d1a8 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -352,16 +352,16 @@ def save_score( ) return eval_run + def group_traces_by_question_id( traces: list[dict[str, Any]], ) -> list[dict[str, Any]]: """ - Docstring for group_traces_by_question_id - - :param traces: Description - :type traces: list[dict[str, Any]] - :return: Description - :rtype: list[dict[str, Any]] + Group evaluation traces by question_id for horizontal comparison + + Args: + traces: List of trace dicts, each containing question_id, question, + ground_truth_answer, llm_answer, trace_id, and scores. Returns: List of grouped traces sorted by question_id: @@ -375,13 +375,17 @@ def group_traces_by_question_id( "scores": [[...], [...]] } ] + + Raises: + ValueError: If traces lack question_id (required for grouping). 
""" # weather question_id exists in the traces - if traces and (traces[0].get("question_id") is None or traces[0].get("question_id") == ""): - raise ValueError( - "Grouped export format is not available for this evaluation.") - + if traces and ( + traces[0].get("question_id") is None or traces[0].get("question_id") == "" + ): + raise ValueError("Grouped export format is not available for this evaluation.") + groups: dict[int, list[dict[str, Any]]] = {} for trace in traces: @@ -389,23 +393,26 @@ def group_traces_by_question_id( if question_id not in groups: groups[question_id] = [] groups[question_id].append(trace) - + result: list[dict[str, Any]] = [] for question_id in sorted(groups.keys()): group_traces = groups[question_id] first = group_traces[0] - result.append({ - "question_id": question_id, - "question": first.get("question", ""), - "ground_truth_answer": first.get("ground_truth_answer", ""), - "llm_answers": [t.get("llm_answer", "") for t in group_traces], - "trace_ids": [t.get("trace_id", "") for t in group_traces], - "scores": [t.get("scores", []) for t in group_traces], - }) + result.append( + { + "question_id": question_id, + "question": first.get("question", ""), + "ground_truth_answer": first.get("ground_truth_answer", ""), + "llm_answers": [t.get("llm_answer", "") for t in group_traces], + "trace_ids": [t.get("trace_id", "") for t in group_traces], + "scores": [t.get("scores", []) for t in group_traces], + } + ) logger.info(f"[group_traces_by_question_id] Created {len(result)} groups") return result + def resolve_model_from_config( session: Session, eval_run: EvaluationRun, diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 9d997175f..bf21fc3f0 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -1070,6 +1070,7 @@ def test_get_evaluation_run_grouped_format_success( and isinstance(traces[0]["trace_ids"], list) ) + class TestGetDataset: """Test GET /evaluations/datasets/{dataset_id} endpoint.""" From cae0263d30d7ad7d82209145ac5e10546d5abb49 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Wed, 28 Jan 2026 14:01:52 +0530 Subject: [PATCH 3/5] Fix trailing whitespace --- backend/app/crud/evaluations/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 94858d1a8..1c3b658d1 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -375,7 +375,7 @@ def group_traces_by_question_id( "scores": [[...], [...]] } ] - + Raises: ValueError: If traces lack question_id (required for grouping). 
""" From e6de3d023217a4cf5f31f15393da39d09fe8eb27 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 29 Jan 2026 11:28:52 +0530 Subject: [PATCH 4/5] Refactor: Move import of group_traces_by_question_id to the top and update docstring for clarity --- backend/app/api/routes/evaluations/evaluation.py | 3 +-- backend/app/crud/evaluations/core.py | 11 ++--------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index 55db7514c..6ae68b461 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -13,6 +13,7 @@ from app.api.deps import AuthContextDep, SessionDep from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud +from app.crud.evaluations.core import group_traces_by_question_id from app.models.evaluation import EvaluationRunPublic from app.api.permissions import Permission, require_permission from app.services.evaluations import ( @@ -160,8 +161,6 @@ def get_evaluation_run_status( ) # Formatter = grouped if export_format == "grouped" and eval_run.score and "traces" in eval_run.score: - from app.crud.evaluations.core import group_traces_by_question_id - try: grouped_traces = group_traces_by_question_id(eval_run.score["traces"]) eval_run.score["traces"] = grouped_traces diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 1c3b658d1..6d17afe60 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -357,11 +357,7 @@ def group_traces_by_question_id( traces: list[dict[str, Any]], ) -> list[dict[str, Any]]: """ - Group evaluation traces by question_id for horizontal comparison - - Args: - traces: List of trace dicts, each containing question_id, question, - ground_truth_answer, llm_answer, trace_id, and scores. + Group evaluation traces by question_id for horizontal comparison. Returns: List of grouped traces sorted by question_id: @@ -375,12 +371,9 @@ def group_traces_by_question_id( "scores": [[...], [...]] } ] - - Raises: - ValueError: If traces lack question_id (required for grouping). """ - # weather question_id exists in the traces + # whether question_id exists in the traces if traces and ( traces[0].get("question_id") is None or traces[0].get("question_id") == "" ): From fdc4582b01f1747e99ab9d8d68f7261eea036b29 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 29 Jan 2026 14:55:24 +0530 Subject: [PATCH 5/5] Docs: Improve export_format parameter description for clarity --- backend/app/api/docs/evaluation/get_evaluation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/api/docs/evaluation/get_evaluation.md b/backend/app/api/docs/evaluation/get_evaluation.md index 4aea35a9d..2c8e96351 100644 --- a/backend/app/api/docs/evaluation/get_evaluation.md +++ b/backend/app/api/docs/evaluation/get_evaluation.md @@ -5,7 +5,7 @@ Returns comprehensive evaluation information including processing status, config **Query Parameters:** * `get_trace_info` (optional, default: false) - Include Langfuse trace scores with Q&A context. Data is fetched from Langfuse on first request and cached for subsequent calls. Only available for completed evaluations. * `resync_score` (optional, default: false) - Clear cached scores and re-fetch from Langfuse. Useful when evaluators have been updated. 
Requires `get_trace_info=true`. -* `export_format` (optional, default: row) - Controls the structure of traces in the response. Requires `get_trace_info=true` when set to "grouped". enum=['row','grouped'] +* `export_format` (optional, default: row) - Controls the structure of traces in the response. Requires `get_trace_info=true` when set to "grouped". Allowed values: `row`, `grouped`. **Score Format** (`get_trace_info=true`,`export_format=row`):
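
---

For reviewers who want to try the new parameter end to end, below is a minimal client-side sketch that fetches a completed evaluation run in grouped form and walks the collated answers. It is a sketch under stated assumptions, not part of this change: the base URL (`http://localhost:8000`) and the `X-API-Key` header name do not appear in this diff and should be adjusted to your deployment. The endpoint path, query parameters, and response shape follow `test_get_evaluation_run_grouped_format_success` above; note that, per the route handler and that test, the grouped entries come back under `score["traces"]`.

```python
"""Minimal sketch: fetch an evaluation run with export_format=grouped.

Assumptions (not taken from this diff): the backend is reachable at
http://localhost:8000 and authenticates via an `X-API-Key` header.
"""

import httpx

BASE_URL = "http://localhost:8000"            # assumed deployment URL
HEADERS = {"X-API-Key": "<your-api-key>"}     # assumed auth header name
eval_run_id = 42                              # hypothetical evaluation run id

response = httpx.get(
    f"{BASE_URL}/api/v1/evaluations/{eval_run_id}",
    params={"get_trace_info": "true", "export_format": "grouped"},
    headers=HEADERS,
    timeout=30,
)
response.raise_for_status()
payload = response.json()

if payload["success"]:
    # With export_format=grouped, score["traces"] holds one entry per
    # question_id, with the per-run answers collated side by side.
    for group in payload["data"]["score"]["traces"]:
        print(f"Q{group['question_id']}: {group['question']}")
        print(f"  ground truth: {group['ground_truth_answer']}")
        for trace_id, answer in zip(group["trace_ids"], group["llm_answers"]):
            print(f"  [{trace_id}] {answer}")
else:
    print("Request failed:", payload.get("error"))
```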
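
The grouping helper can also be exercised in isolation, which may help when reviewing the edge cases (multiple runs per question, single-run questions, traces without a `question_id`). A small sketch follows, assuming it runs inside the backend environment where `app.crud.evaluations.core` and its dependencies import cleanly; the sample traces are the ones used in the new test.

```python
# Sketch: exercising group_traces_by_question_id directly with in-memory
# trace dicts shaped like the ones stored in EvaluationRun.score["traces"].
# Assumes the backend package (and its langfuse/sqlmodel imports) is available.
from app.crud.evaluations.core import group_traces_by_question_id

traces = [
    {
        "trace_id": "trace-1a",
        "question_id": 1,
        "question": "What is Python?",
        "ground_truth_answer": "A programming language",
        "llm_answer": "Python is a high-level programming language",
        "scores": [{"name": "cosine_similarity", "value": 0.82, "data_type": "NUMERIC"}],
    },
    {
        "trace_id": "trace-1b",
        "question_id": 1,
        "question": "What is Python?",
        "ground_truth_answer": "A programming language",
        "llm_answer": "Python is an interpreted language",
        "scores": [{"name": "cosine_similarity", "value": 0.75, "data_type": "NUMERIC"}],
    },
    {
        "trace_id": "trace-2a",
        "question_id": 2,
        "question": "What is Java?",
        "ground_truth_answer": "An OOP language",
        "llm_answer": "Java is a statically typed language",
        "scores": [{"name": "cosine_similarity", "value": 0.80, "data_type": "NUMERIC"}],
    },
]

grouped = group_traces_by_question_id(traces)
assert len(grouped) == 2                       # one group per question_id
assert grouped[0]["llm_answers"] == [
    "Python is a high-level programming language",
    "Python is an interpreted language",
]
assert grouped[0]["trace_ids"] == ["trace-1a", "trace-1b"]
assert grouped[1]["question_id"] == 2          # groups are sorted by question_id

# Traces without a question_id cannot be grouped; the route catches this
# ValueError and returns a failure_response instead of a 500.
try:
    group_traces_by_question_id([{"trace_id": "t", "question_id": None}])
except ValueError as exc:
    print(exc)  # "Grouped export format is not available for this evaluation."
```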