From 07ab964d96dc58f74859abd1e5a6add97a1ceb26 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Tue, 27 Jan 2026 22:23:20 +0530 Subject: [PATCH 1/5] Evaluation: add export_format query param with grouped traces support --- .../app/api/docs/evaluation/get_evaluation.md | 30 +++- .../app/api/routes/evaluations/evaluation.py | 21 +++ backend/app/crud/evaluations/core.py | 55 +++++++ .../app/tests/api/routes/test_evaluation.py | 141 ++++++++++++++++++ 4 files changed, 246 insertions(+), 1 deletion(-) diff --git a/backend/app/api/docs/evaluation/get_evaluation.md b/backend/app/api/docs/evaluation/get_evaluation.md index 1e3186d2c..4aea35a9d 100644 --- a/backend/app/api/docs/evaluation/get_evaluation.md +++ b/backend/app/api/docs/evaluation/get_evaluation.md @@ -5,8 +5,9 @@ Returns comprehensive evaluation information including processing status, config **Query Parameters:** * `get_trace_info` (optional, default: false) - Include Langfuse trace scores with Q&A context. Data is fetched from Langfuse on first request and cached for subsequent calls. Only available for completed evaluations. * `resync_score` (optional, default: false) - Clear cached scores and re-fetch from Langfuse. Useful when evaluators have been updated. Requires `get_trace_info=true`. +* `export_format` (optional, default: row) - Controls the structure of traces in the response. Requires `get_trace_info=true` when set to "grouped". enum=['row','grouped'] -**Score Format** (`get_trace_info=true`): +**Score Format** (`get_trace_info=true`,`export_format=row`): ```json { @@ -49,6 +50,33 @@ Returns comprehensive evaluation information including processing status, config } ``` +**Score Format** (`get_trace_info=true`,`export_format=grouped`): +```json +{ + "summary_scores": [...], + "traces": [...], + "grouped_traces": [ + { + "question_id": 1, + "question": "What is Python?", + "ground_truth_answer": "Python is a high-level programming language.", + "llm_answers": [ + "Answer from evaluation run 1...", + "Answer from evaluation run 2..." + ], + "trace_ids": [ + "uuid-123", + "uuid-456" + ], + "scores": [ + [{"name": "cosine_similarity", "value": 0.82, "data_type": "NUMERIC"}], + [{"name": "cosine_similarity", "value": 0.75, "data_type": "NUMERIC"}] + ] + } + ] +} +``` + **Score Details:** * NUMERIC scores include average (`avg`) and standard deviation (`std`) in summary * CATEGORICAL scores include distribution counts in summary diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index d40a88a1a..a14b7479c 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -123,6 +123,14 @@ def get_evaluation_run_status( "Requires get_trace_info=true." ), ), + export_format: str = Query( + "row", + description=( + "Controls the Traces structure." + "'grouped' collates repeated questions horizontally using Parent Question ID." 
+ ), + enum=["row", "grouped"] + ) ) -> APIResponse[EvaluationRunPublic]: """Get evaluation run status with optional trace info.""" if resync_score and not get_trace_info: @@ -130,6 +138,11 @@ def get_evaluation_run_status( status_code=400, detail="resync_score=true requires get_trace_info=true", ) + if export_format == "grouped" and not get_trace_info: + raise HTTPException( + status_code = 400, + detail="export_format=grouped requires get_trace_info=true" + ) eval_run, error = get_evaluation_with_scores( session=_session, @@ -148,6 +161,14 @@ def get_evaluation_run_status( "to this organization" ), ) + # Formatter = grouped + if export_format == "grouped" and eval_run.score and "traces" in eval_run.score: + from app.crud.evaluations.core import group_traces_by_question_id + try: + grouped_traces = group_traces_by_question_id(eval_run.score["traces"]) + eval_run.score["traces"] = grouped_traces + except ValueError as e: + return APIResponse.failure_response(error=str(e), data=eval_run) if error: return APIResponse.failure_response(error=error, data=eval_run) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 33b6777f3..53fdab920 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -1,4 +1,5 @@ import logging +from typing import Any from langfuse import Langfuse from sqlmodel import Session, select @@ -311,3 +312,57 @@ def save_score( f"traces={len(score.get('traces', []))}" ) return eval_run + +def group_traces_by_question_id( + traces: list[dict[str, Any]], +) -> list[dict[str, Any]]: + """ + Docstring for group_traces_by_question_id + + :param traces: Description + :type traces: list[dict[str, Any]] + :return: Description + :rtype: list[dict[str, Any]] + + Returns: + List of grouped traces sorted by question_id: + [ + { + "question_id": 1, + "question": "What is Python?", + "ground_truth_answer": "...", + "llm_answers": ["Answer 1", "Answer 2"], + "trace_ids": ["trace-1", "trace-2"], + "scores": [[...], [...]] + } + ] + """ + + # weather question_id exists in the traces + if traces and (traces[0].get("question_id") is None or traces[0].get("question_id") == ""): + raise ValueError( + "Grouped export format is not available for this evaluation.") + + groups: dict[int, list[dict[str, Any]]] = {} + + for trace in traces: + question_id = trace.get("question_id") + if question_id not in groups: + groups[question_id] = [] + groups[question_id].append(trace) + + result: list[dict[str, Any]] = [] + for question_id in sorted(groups.keys()): + group_traces = groups[question_id] + first = group_traces[0] + result.append({ + "question_id": question_id, + "question": first.get("question", ""), + "ground_truth_answer": first.get("ground_truth_answer", ""), + "llm_answers": [t.get("llm_answer", "") for t in group_traces], + "trace_ids": [t.get("trace_id", "") for t in group_traces], + "scores": [t.get("scores", []) for t in group_traces], + }) + + logger.info(f"[group_traces_by_question_id] Created {len(result)} groups") + return result \ No newline at end of file diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 813fd483f..26db5a144 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -918,6 +918,147 @@ def test_get_evaluation_run_resync_without_trace_info_fails( and "get_trace_info" in error_str.lower() ) + def test_get_evaluation_run_grouped_format_without_trace_info_fails( + self, + 
client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + create_test_dataset: EvaluationDataset, + ) -> None: + eval_run = EvaluationRun( + run_name="test_run", + dataset_name=create_test_dataset.name, + dataset_id=create_test_dataset.id, + config={"model": "gpt-4o"}, + status="completed", + total_items=3, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + response = client.get( + f"/api/v1/evaluations/{eval_run.id}", + params={"export_format": "grouped"}, # Missing get_trace_info=true + headers=user_api_key_header, + ) + + assert response.status_code == 400 + response_data = response.json() + error_str = response_data.get( + "detail", response_data.get("error", str(response_data)) + ) + assert ( + "export_format" in error_str.lower() + and "get_trace_info" in error_str.lower() + ) + + def test_get_evaluation_run_grouped_format_success( + self, + client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + create_test_dataset: EvaluationDataset, + ) -> None: + eval_run = EvaluationRun( + run_name="test_run", + dataset_name=create_test_dataset.name, + dataset_id=create_test_dataset.id, + config={"model": "gpt-4o"}, + status="completed", + total_items=4, + score={ + "traces": [ + { + "trace_id": "trace-1a", + "question_id": 1, + "question": "What is Python?", + "ground_truth_answer": "A programming language", + "llm_answer": "Python is a high-level programming language", + "scores": [ + { + "name": "cosine_similarity", + "value": 0.82, + "data_type": "NUMERIC", + } + ], + }, + { + "trace_id": "trace-1b", + "question_id": 1, + "question": "What is Python?", + "ground_truth_answer": "A programming language", + "llm_answer": "Python is an interpreted language", + "scores": [ + { + "name": "cosine_similarity", + "value": 0.75, + "data_type": "NUMERIC", + } + ], + }, + # Row format - 1 trace for question_id=2 + { + "trace_id": "trace-2a", + "question_id": 2, + "question": "What is Java?", + "ground_truth_answer": "An OOP language", + "llm_answer": "Java is a statically typed language", + "scores": [ + { + "name": "cosine_similarity", + "value": 0.80, + "data_type": "NUMERIC", + } + ], + }, + ], + "summary_scores": [ + { + "avg": 0.79, + "std": 0.03, + "name": "cosine_similarity", + "data_type": "NUMERIC", + "total_pairs": 3, + } + ], + }, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + response = client.get( + f"/api/v1/evaluations/{eval_run.id}", + params={ + "export_format": "grouped", + "get_trace_info": True, + }, # Missing get_trace_info=true + headers=user_api_key_header, + ) + + assert response.status_code == 200 + response_data = response.json() + assert response_data["success"] is True + data = response_data["data"] + assert data["id"] == eval_run.id + assert data["status"] == "completed" + + traces = data["score"]["traces"] + assert ( + isinstance(traces, list) + and len(traces) > 0 + and "llm_answers" in traces[0] + and isinstance(traces[0]["llm_answers"], list) + and "trace_ids" in traces[0] + and isinstance(traces[0]["trace_ids"], list) + ) class TestGetDataset: """Test GET /evaluations/datasets/{dataset_id} endpoint.""" From eca2181ba8c2fb50d5f10d14793d98b7ac6e1437 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Wed, 28 Jan 2026 
13:57:22 +0530 Subject: [PATCH 2/5] Fix formatting and update docstring for group_traces_by_question_id --- .../app/api/routes/evaluations/evaluation.py | 8 ++-- backend/app/crud/evaluations/core.py | 45 +++++++++++-------- .../app/tests/api/routes/test_evaluation.py | 1 + 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index c2b6eedbf..55db7514c 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -127,8 +127,8 @@ def get_evaluation_run_status( "Controls the Traces structure." "'grouped' collates repeated questions horizontally using Parent Question ID." ), - enum=["row", "grouped"] - ) + enum=["row", "grouped"], + ), ) -> APIResponse[EvaluationRunPublic]: """Get evaluation run status with optional trace info.""" if resync_score and not get_trace_info: @@ -138,8 +138,7 @@ def get_evaluation_run_status( ) if export_format == "grouped" and not get_trace_info: raise HTTPException( - status_code = 400, - detail="export_format=grouped requires get_trace_info=true" + status_code=400, detail="export_format=grouped requires get_trace_info=true" ) eval_run, error = get_evaluation_with_scores( @@ -162,6 +161,7 @@ def get_evaluation_run_status( # Formatter = grouped if export_format == "grouped" and eval_run.score and "traces" in eval_run.score: from app.crud.evaluations.core import group_traces_by_question_id + try: grouped_traces = group_traces_by_question_id(eval_run.score["traces"]) eval_run.score["traces"] = grouped_traces diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index afd747597..94858d1a8 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -352,16 +352,16 @@ def save_score( ) return eval_run + def group_traces_by_question_id( traces: list[dict[str, Any]], ) -> list[dict[str, Any]]: """ - Docstring for group_traces_by_question_id - - :param traces: Description - :type traces: list[dict[str, Any]] - :return: Description - :rtype: list[dict[str, Any]] + Group evaluation traces by question_id for horizontal comparison + + Args: + traces: List of trace dicts, each containing question_id, question, + ground_truth_answer, llm_answer, trace_id, and scores. Returns: List of grouped traces sorted by question_id: @@ -375,13 +375,17 @@ def group_traces_by_question_id( "scores": [[...], [...]] } ] + + Raises: + ValueError: If traces lack question_id (required for grouping). 
""" # weather question_id exists in the traces - if traces and (traces[0].get("question_id") is None or traces[0].get("question_id") == ""): - raise ValueError( - "Grouped export format is not available for this evaluation.") - + if traces and ( + traces[0].get("question_id") is None or traces[0].get("question_id") == "" + ): + raise ValueError("Grouped export format is not available for this evaluation.") + groups: dict[int, list[dict[str, Any]]] = {} for trace in traces: @@ -389,23 +393,26 @@ def group_traces_by_question_id( if question_id not in groups: groups[question_id] = [] groups[question_id].append(trace) - + result: list[dict[str, Any]] = [] for question_id in sorted(groups.keys()): group_traces = groups[question_id] first = group_traces[0] - result.append({ - "question_id": question_id, - "question": first.get("question", ""), - "ground_truth_answer": first.get("ground_truth_answer", ""), - "llm_answers": [t.get("llm_answer", "") for t in group_traces], - "trace_ids": [t.get("trace_id", "") for t in group_traces], - "scores": [t.get("scores", []) for t in group_traces], - }) + result.append( + { + "question_id": question_id, + "question": first.get("question", ""), + "ground_truth_answer": first.get("ground_truth_answer", ""), + "llm_answers": [t.get("llm_answer", "") for t in group_traces], + "trace_ids": [t.get("trace_id", "") for t in group_traces], + "scores": [t.get("scores", []) for t in group_traces], + } + ) logger.info(f"[group_traces_by_question_id] Created {len(result)} groups") return result + def resolve_model_from_config( session: Session, eval_run: EvaluationRun, diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 9d997175f..bf21fc3f0 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -1070,6 +1070,7 @@ def test_get_evaluation_run_grouped_format_success( and isinstance(traces[0]["trace_ids"], list) ) + class TestGetDataset: """Test GET /evaluations/datasets/{dataset_id} endpoint.""" From cae0263d30d7ad7d82209145ac5e10546d5abb49 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Wed, 28 Jan 2026 14:01:52 +0530 Subject: [PATCH 3/5] Fix trailing whitespace --- backend/app/crud/evaluations/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 94858d1a8..1c3b658d1 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -375,7 +375,7 @@ def group_traces_by_question_id( "scores": [[...], [...]] } ] - + Raises: ValueError: If traces lack question_id (required for grouping). 
""" From e6de3d023217a4cf5f31f15393da39d09fe8eb27 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 29 Jan 2026 11:28:52 +0530 Subject: [PATCH 4/5] Refactor: Move import of group_traces_by_question_id to the top and update docstring for clarity --- backend/app/api/routes/evaluations/evaluation.py | 3 +-- backend/app/crud/evaluations/core.py | 11 ++--------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index 55db7514c..6ae68b461 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -13,6 +13,7 @@ from app.api.deps import AuthContextDep, SessionDep from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud +from app.crud.evaluations.core import group_traces_by_question_id from app.models.evaluation import EvaluationRunPublic from app.api.permissions import Permission, require_permission from app.services.evaluations import ( @@ -160,8 +161,6 @@ def get_evaluation_run_status( ) # Formatter = grouped if export_format == "grouped" and eval_run.score and "traces" in eval_run.score: - from app.crud.evaluations.core import group_traces_by_question_id - try: grouped_traces = group_traces_by_question_id(eval_run.score["traces"]) eval_run.score["traces"] = grouped_traces diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 1c3b658d1..6d17afe60 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -357,11 +357,7 @@ def group_traces_by_question_id( traces: list[dict[str, Any]], ) -> list[dict[str, Any]]: """ - Group evaluation traces by question_id for horizontal comparison - - Args: - traces: List of trace dicts, each containing question_id, question, - ground_truth_answer, llm_answer, trace_id, and scores. + Group evaluation traces by question_id for horizontal comparison. Returns: List of grouped traces sorted by question_id: @@ -375,12 +371,9 @@ def group_traces_by_question_id( "scores": [[...], [...]] } ] - - Raises: - ValueError: If traces lack question_id (required for grouping). """ - # weather question_id exists in the traces + # whether question_id exists in the traces if traces and ( traces[0].get("question_id") is None or traces[0].get("question_id") == "" ): From fdc4582b01f1747e99ab9d8d68f7261eea036b29 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 29 Jan 2026 14:55:24 +0530 Subject: [PATCH 5/5] Docs: Improve export_format parameter description for clarity --- backend/app/api/docs/evaluation/get_evaluation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/api/docs/evaluation/get_evaluation.md b/backend/app/api/docs/evaluation/get_evaluation.md index 4aea35a9d..2c8e96351 100644 --- a/backend/app/api/docs/evaluation/get_evaluation.md +++ b/backend/app/api/docs/evaluation/get_evaluation.md @@ -5,7 +5,7 @@ Returns comprehensive evaluation information including processing status, config **Query Parameters:** * `get_trace_info` (optional, default: false) - Include Langfuse trace scores with Q&A context. Data is fetched from Langfuse on first request and cached for subsequent calls. Only available for completed evaluations. * `resync_score` (optional, default: false) - Clear cached scores and re-fetch from Langfuse. Useful when evaluators have been updated. 
Requires `get_trace_info=true`. -* `export_format` (optional, default: row) - Controls the structure of traces in the response. Requires `get_trace_info=true` when set to "grouped". enum=['row','grouped'] +* `export_format` (optional, default: row) - Controls the structure of traces in the response. Requires `get_trace_info=true` when set to "grouped". Allowed values: `row`, `grouped`. **Score Format** (`get_trace_info=true`,`export_format=row`):
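
---

For reviewers who want to try the new parameter end to end, below is a minimal client-side sketch that fetches a completed evaluation run in grouped form and walks the collated answers. It is a sketch under stated assumptions, not part of this change: the base URL (`http://localhost:8000`) and the `X-API-Key` header name do not appear in this diff and should be adjusted to your deployment. The endpoint path, query parameters, and response shape follow `test_get_evaluation_run_grouped_format_success` above; note that, per the route handler and that test, the grouped entries come back under `score["traces"]`.

```python
"""Minimal sketch: fetch an evaluation run with export_format=grouped.

Assumptions (not taken from this diff): the backend is reachable at
http://localhost:8000 and authenticates via an `X-API-Key` header.
"""

import httpx

BASE_URL = "http://localhost:8000"            # assumed deployment URL
HEADERS = {"X-API-Key": "<your-api-key>"}     # assumed auth header name
eval_run_id = 42                              # hypothetical evaluation run id

response = httpx.get(
    f"{BASE_URL}/api/v1/evaluations/{eval_run_id}",
    params={"get_trace_info": "true", "export_format": "grouped"},
    headers=HEADERS,
    timeout=30,
)
response.raise_for_status()
payload = response.json()

if payload["success"]:
    # With export_format=grouped, score["traces"] holds one entry per
    # question_id, with the per-run answers collated side by side.
    for group in payload["data"]["score"]["traces"]:
        print(f"Q{group['question_id']}: {group['question']}")
        print(f"  ground truth: {group['ground_truth_answer']}")
        for trace_id, answer in zip(group["trace_ids"], group["llm_answers"]):
            print(f"  [{trace_id}] {answer}")
else:
    print("Request failed:", payload.get("error"))
```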
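
The grouping helper can also be exercised in isolation, which may help when reviewing the edge cases (multiple runs per question, single-run questions, traces without a `question_id`). A small sketch follows, assuming it runs inside the backend environment where `app.crud.evaluations.core` and its dependencies import cleanly; the sample traces are the ones used in the new test.

```python
# Sketch: exercising group_traces_by_question_id directly with in-memory
# trace dicts shaped like the ones stored in EvaluationRun.score["traces"].
# Assumes the backend package (and its langfuse/sqlmodel imports) is available.
from app.crud.evaluations.core import group_traces_by_question_id

traces = [
    {
        "trace_id": "trace-1a",
        "question_id": 1,
        "question": "What is Python?",
        "ground_truth_answer": "A programming language",
        "llm_answer": "Python is a high-level programming language",
        "scores": [{"name": "cosine_similarity", "value": 0.82, "data_type": "NUMERIC"}],
    },
    {
        "trace_id": "trace-1b",
        "question_id": 1,
        "question": "What is Python?",
        "ground_truth_answer": "A programming language",
        "llm_answer": "Python is an interpreted language",
        "scores": [{"name": "cosine_similarity", "value": 0.75, "data_type": "NUMERIC"}],
    },
    {
        "trace_id": "trace-2a",
        "question_id": 2,
        "question": "What is Java?",
        "ground_truth_answer": "An OOP language",
        "llm_answer": "Java is a statically typed language",
        "scores": [{"name": "cosine_similarity", "value": 0.80, "data_type": "NUMERIC"}],
    },
]

grouped = group_traces_by_question_id(traces)
assert len(grouped) == 2                       # one group per question_id
assert grouped[0]["llm_answers"] == [
    "Python is a high-level programming language",
    "Python is an interpreted language",
]
assert grouped[0]["trace_ids"] == ["trace-1a", "trace-1b"]
assert grouped[1]["question_id"] == 2          # groups are sorted by question_id

# Traces without a question_id cannot be grouped; the route catches this
# ValueError and returns a failure_response instead of a 500.
try:
    group_traces_by_question_id([{"trace_id": "t", "question_id": None}])
except ValueError as exc:
    print(exc)  # "Grouped export format is not available for this evaluation."
```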