diff --git a/.semgrep/rules/idor-team-scoped-models.yaml b/.semgrep/rules/idor-team-scoped-models.yaml index 1d2292017f85..b0a88f13e341 100644 --- a/.semgrep/rules/idor-team-scoped-models.yaml +++ b/.semgrep/rules/idor-team-scoped-models.yaml @@ -183,6 +183,7 @@ rules: |Task |TaskRun |EmailChannel + |EvaluationReport |Text |Threshold |Ticket @@ -372,6 +373,7 @@ rules: |Task |TaskRun |EmailChannel + |EvaluationReport |Text |Threshold |Ticket diff --git a/frontend/src/generated/core/api.schemas.ts b/frontend/src/generated/core/api.schemas.ts index 165e42ff1e50..0355024a3771 100644 --- a/frontend/src/generated/core/api.schemas.ts +++ b/frontend/src/generated/core/api.schemas.ts @@ -1085,9 +1085,10 @@ export const TargetTypeEnumApi = { * `monthly` - Monthly * `yearly` - Yearly */ -export type FrequencyEnumApi = (typeof FrequencyEnumApi)[keyof typeof FrequencyEnumApi] +export type SubscriptionFrequencyEnumApi = + (typeof SubscriptionFrequencyEnumApi)[keyof typeof SubscriptionFrequencyEnumApi] -export const FrequencyEnumApi = { +export const SubscriptionFrequencyEnumApi = { Daily: 'daily', Weekly: 'weekly', Monthly: 'monthly', @@ -1131,7 +1132,7 @@ export interface SubscriptionApi { dashboard_export_insights?: number[] target_type: TargetTypeEnumApi target_value: string - frequency: FrequencyEnumApi + frequency: SubscriptionFrequencyEnumApi /** * @minimum -2147483648 * @maximum 2147483647 @@ -1196,7 +1197,7 @@ export interface PatchedSubscriptionApi { dashboard_export_insights?: number[] target_type?: TargetTypeEnumApi target_value?: string - frequency?: FrequencyEnumApi + frequency?: SubscriptionFrequencyEnumApi /** * @minimum -2147483648 * @maximum 2147483647 diff --git a/posthog/api/__init__.py b/posthog/api/__init__.py index b86dc743142c..5853a4a8a4a6 100644 --- a/posthog/api/__init__.py +++ b/posthog/api/__init__.py @@ -62,6 +62,7 @@ DatasetItemViewSet, DatasetViewSet, EvaluationConfigViewSet, + EvaluationReportViewSet, EvaluationRunViewSet, EvaluationViewSet, LLMAnalyticsClusteringRunViewSet, @@ -1380,6 +1381,13 @@ def register_grandfathered_environment_nested_viewset( ["team_id"], ) +environments_router.register( + r"llm_analytics/evaluation_reports", + EvaluationReportViewSet, + "environment_llm_analytics_evaluation_reports", + ["team_id"], +) + environments_router.register( r"change_requests", approval_api.ChangeRequestViewSet, diff --git a/products/llm_analytics/backend/api/__init__.py b/products/llm_analytics/backend/api/__init__.py index 9fd2af0eb63e..f6b235ddf3e0 100644 --- a/products/llm_analytics/backend/api/__init__.py +++ b/products/llm_analytics/backend/api/__init__.py @@ -3,6 +3,7 @@ from .clustering_job import ClusteringJobViewSet from .datasets import DatasetItemViewSet, DatasetViewSet from .evaluation_config import EvaluationConfigViewSet +from .evaluation_reports import EvaluationReportViewSet from .evaluation_runs import EvaluationRunViewSet from .evaluation_summary import LLMEvaluationSummaryViewSet from .evaluations import EvaluationViewSet @@ -31,6 +32,7 @@ "DatasetViewSet", "DatasetItemViewSet", "EvaluationViewSet", + "EvaluationReportViewSet", "EvaluationRunViewSet", "EvaluationConfigViewSet", "LLMProviderKeyViewSet", diff --git a/products/llm_analytics/backend/api/evaluation_reports.py b/products/llm_analytics/backend/api/evaluation_reports.py new file mode 100644 index 000000000000..7abe92445983 --- /dev/null +++ b/products/llm_analytics/backend/api/evaluation_reports.py @@ -0,0 +1,201 @@ +"""API endpoints for evaluation report configuration and report run 
history.""" + +import datetime as dt + +from django.conf import settings +from django.db.models import QuerySet + +import structlog +from asgiref.sync import async_to_sync +from drf_spectacular.utils import extend_schema +from rest_framework import serializers, status, viewsets +from rest_framework.decorators import action +from rest_framework.request import Request +from rest_framework.response import Response + +from posthog.api.routing import TeamAndOrgViewSetMixin +from posthog.permissions import AccessControlPermission + +from products.llm_analytics.backend.api.metrics import llma_track_latency +from products.llm_analytics.backend.models.evaluation_reports import EvaluationReport, EvaluationReportRun + +logger = structlog.get_logger(__name__) + + +class EvaluationReportSerializer(serializers.ModelSerializer): + class Meta: + model = EvaluationReport + fields = [ + "id", + "evaluation", + "frequency", + "byweekday", + "start_date", + "next_delivery_date", + "delivery_targets", + "max_sample_size", + "enabled", + "deleted", + "last_delivered_at", + "report_prompt_guidance", + "trigger_threshold", + "cooldown_minutes", + "daily_run_cap", + "created_by", + "created_at", + ] + read_only_fields = ["id", "next_delivery_date", "last_delivered_at", "created_by", "created_at"] + + def validate_evaluation(self, value): + # Prevent creating a report in team A that references team B's evaluation: + # the FK queryset is unscoped, so a user with access to multiple teams could + # otherwise cross tenant boundaries by passing a foreign evaluation id. + team = self.context["get_team"]() + if value.team_id != team.id: + raise serializers.ValidationError("Evaluation does not belong to this team.") + return value + + def validate(self, attrs): + attrs = super().validate(attrs) + frequency = attrs.get("frequency") or (self.instance.frequency if self.instance else None) + if frequency == EvaluationReport.Frequency.EVERY_N: + threshold = ( + attrs.get("trigger_threshold") + if "trigger_threshold" in attrs + else (self.instance.trigger_threshold if self.instance else None) + ) + if threshold is None: + raise serializers.ValidationError({"trigger_threshold": "Required when frequency is 'every_n'."}) + if threshold < EvaluationReport.TRIGGER_THRESHOLD_MIN: + raise serializers.ValidationError( + {"trigger_threshold": f"Minimum is {EvaluationReport.TRIGGER_THRESHOLD_MIN}."} + ) + if threshold > EvaluationReport.TRIGGER_THRESHOLD_MAX: + raise serializers.ValidationError( + {"trigger_threshold": f"Maximum is {EvaluationReport.TRIGGER_THRESHOLD_MAX}."} + ) + cooldown = ( + attrs.get("cooldown_minutes") + if "cooldown_minutes" in attrs + else (self.instance.cooldown_minutes if self.instance else EvaluationReport.COOLDOWN_MINUTES_DEFAULT) + ) + if cooldown < EvaluationReport.COOLDOWN_MINUTES_MIN: + raise serializers.ValidationError( + {"cooldown_minutes": f"Minimum is {EvaluationReport.COOLDOWN_MINUTES_MIN} minutes."} + ) + return attrs + + def validate_delivery_targets(self, value: list) -> list: + if not isinstance(value, list): + raise serializers.ValidationError("Delivery targets must be a list.") + for target in value: + if not isinstance(target, dict): + raise serializers.ValidationError("Each delivery target must be an object.") + target_type = target.get("type") + if target_type not in ("email", "slack"): + raise serializers.ValidationError(f"Invalid delivery target type: {target_type}") + if target_type == "email" and not target.get("value"): + raise serializers.ValidationError("Email delivery target must 
include a 'value' field.") + if target_type == "slack" and (not target.get("integration_id") or not target.get("channel")): + raise serializers.ValidationError("Slack delivery target must include 'integration_id' and 'channel'.") + return value + + def create(self, validated_data): + request = self.context["request"] + team = self.context["get_team"]() + validated_data["team"] = team + validated_data["created_by"] = request.user + return super().create(validated_data) + + +class EvaluationReportRunSerializer(serializers.ModelSerializer): + class Meta: + model = EvaluationReportRun + fields = [ + "id", + "report", + "content", + "metadata", + "period_start", + "period_end", + "delivery_status", + "delivery_errors", + "created_at", + ] + read_only_fields = fields + + +class EvaluationReportViewSet(TeamAndOrgViewSetMixin, viewsets.ModelViewSet): + """CRUD for evaluation report configurations + report run history.""" + + scope_object = "llm_analytics" + permission_classes = [AccessControlPermission] + serializer_class = EvaluationReportSerializer + queryset = EvaluationReport.objects.all() + + def safely_get_queryset(self, queryset: QuerySet[EvaluationReport]) -> QuerySet[EvaluationReport]: + queryset = queryset.filter(team_id=self.team_id).order_by("-created_at") + if self.action not in ("update", "partial_update"): + queryset = queryset.filter(deleted=False) + return queryset + + @llma_track_latency("llma_evaluation_reports_list") + def list(self, request: Request, *args, **kwargs) -> Response: + return super().list(request, *args, **kwargs) + + @llma_track_latency("llma_evaluation_reports_create") + def create(self, request: Request, *args, **kwargs) -> Response: + return super().create(request, *args, **kwargs) + + @llma_track_latency("llma_evaluation_reports_retrieve") + def retrieve(self, request: Request, *args, **kwargs) -> Response: + return super().retrieve(request, *args, **kwargs) + + @llma_track_latency("llma_evaluation_reports_update") + def update(self, request: Request, *args, **kwargs) -> Response: + return super().update(request, *args, **kwargs) + + @llma_track_latency("llma_evaluation_reports_partial_update") + def partial_update(self, request: Request, *args, **kwargs) -> Response: + return super().partial_update(request, *args, **kwargs) + + def perform_destroy(self, instance): + instance.deleted = True + instance.save(update_fields=["deleted"]) + + @action(detail=True, methods=["get"], url_path="runs") + @llma_track_latency("llma_evaluation_report_runs_list") + def runs(self, request: Request, **kwargs) -> Response: + """List report runs (history) for this report.""" + report = self.get_object() + runs = EvaluationReportRun.objects.filter(report=report).order_by("-created_at")[:50] + serializer = EvaluationReportRunSerializer(runs, many=True) + return Response(serializer.data) + + @extend_schema(request=None, responses={202: None}) + @action(detail=True, methods=["post"], url_path="generate") + @llma_track_latency("llma_evaluation_report_generate") + def generate(self, request: Request, **kwargs) -> Response: + """Trigger immediate report generation.""" + report = self.get_object() + + try: + from posthog.temporal.common.client import sync_connect + from posthog.temporal.llm_analytics.eval_reports.constants import GENERATE_EVAL_REPORT_WORKFLOW_NAME + from posthog.temporal.llm_analytics.eval_reports.types import GenerateAndDeliverEvalReportWorkflowInput + + client = sync_connect() + async_to_sync(client.start_workflow)( + GENERATE_EVAL_REPORT_WORKFLOW_NAME, + 
GenerateAndDeliverEvalReportWorkflowInput(report_id=str(report.id), manual=True), + id=f"eval-report-manual-{report.id}-{dt.datetime.now(tz=dt.UTC).timestamp():.0f}", + task_queue=settings.GENERAL_PURPOSE_TASK_QUEUE, + ) + except Exception: + logger.exception("Failed to trigger evaluation report generation", report_id=str(report.id)) + return Response( + {"error": "Failed to trigger report generation"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + return Response(status=status.HTTP_202_ACCEPTED) diff --git a/products/llm_analytics/backend/api/test/test_evaluation_reports.py b/products/llm_analytics/backend/api/test/test_evaluation_reports.py new file mode 100644 index 000000000000..59f4db44ab69 --- /dev/null +++ b/products/llm_analytics/backend/api/test/test_evaluation_reports.py @@ -0,0 +1,212 @@ +import datetime as dt + +from posthog.test.base import APIBaseTest +from unittest.mock import AsyncMock, MagicMock, patch + +from django.utils import timezone + +from rest_framework import status + +from products.llm_analytics.backend.models.evaluation_reports import EvaluationReport, EvaluationReportRun +from products.llm_analytics.backend.models.evaluations import Evaluation + + +class TestEvaluationReportApi(APIBaseTest): + def setUp(self): + super().setUp() + self.evaluation = Evaluation.objects.create( + team=self.team, + name="Test Eval", + evaluation_type="llm_judge", + evaluation_config={"prompt": "test"}, + output_type="boolean", + output_config={}, + enabled=True, + created_by=self.user, + conditions=[{"id": "c1", "rollout_percentage": 100, "properties": []}], + ) + self.base_url = f"/api/environments/{self.team.id}/llm_analytics/evaluation_reports/" + + def _create_report(self, **kwargs) -> EvaluationReport: + defaults = { + "team": self.team, + "evaluation": self.evaluation, + "frequency": "daily", + "start_date": timezone.now(), + "delivery_targets": [{"type": "email", "value": "test@example.com"}], + "created_by": self.user, + } + defaults.update(kwargs) + return EvaluationReport.objects.create(**defaults) + + def test_unauthenticated_user_cannot_access(self): + self.client.logout() + response = self.client.get(self.base_url) + self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) + + def test_list_reports(self): + self._create_report() + self._create_report() + response = self.client.get(self.base_url) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.json()["results"]), 2) + + def test_list_excludes_deleted(self): + self._create_report() + self._create_report(deleted=True) + response = self.client.get(self.base_url) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.json()["results"]), 1) + + def test_create_report(self): + response = self.client.post( + self.base_url, + { + "evaluation": str(self.evaluation.id), + "frequency": "daily", + "start_date": timezone.now().isoformat(), + "delivery_targets": [{"type": "email", "value": "test@example.com"}], + }, + format="json", + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + self.assertEqual(EvaluationReport.objects.count(), 1) + report = EvaluationReport.objects.first() + self.assertEqual(report.team_id, self.team.id) + self.assertEqual(report.created_by_id, self.user.id) + + def test_create_report_sets_next_delivery_date(self): + response = self.client.post( + self.base_url, + { + "evaluation": str(self.evaluation.id), + "frequency": "hourly", + "start_date": timezone.now().isoformat(), + 
"delivery_targets": [{"type": "email", "value": "test@example.com"}], + }, + format="json", + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + self.assertIsNotNone(response.json()["next_delivery_date"]) + + def test_create_allows_empty_delivery_targets(self): + response = self.client.post( + self.base_url, + { + "evaluation": str(self.evaluation.id), + "frequency": "daily", + "start_date": timezone.now().isoformat(), + "delivery_targets": [], + }, + format="json", + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + + def test_validate_email_target(self): + response = self.client.post( + self.base_url, + { + "evaluation": str(self.evaluation.id), + "frequency": "daily", + "start_date": timezone.now().isoformat(), + "delivery_targets": [{"type": "email"}], + }, + format="json", + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + def test_validate_slack_target(self): + response = self.client.post( + self.base_url, + { + "evaluation": str(self.evaluation.id), + "frequency": "daily", + "start_date": timezone.now().isoformat(), + "delivery_targets": [{"type": "slack"}], + }, + format="json", + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + def test_validate_slack_target_valid(self): + response = self.client.post( + self.base_url, + { + "evaluation": str(self.evaluation.id), + "frequency": "daily", + "start_date": timezone.now().isoformat(), + "delivery_targets": [{"type": "slack", "integration_id": 1, "channel": "#reports"}], + }, + format="json", + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + + def test_validate_invalid_target_type(self): + response = self.client.post( + self.base_url, + { + "evaluation": str(self.evaluation.id), + "frequency": "daily", + "start_date": timezone.now().isoformat(), + "delivery_targets": [{"type": "webhook"}], + }, + format="json", + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + def test_retrieve_report(self): + report = self._create_report() + response = self.client.get(f"{self.base_url}{report.id}/") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json()["id"], str(report.id)) + + def test_update_report(self): + report = self._create_report() + response = self.client.patch( + f"{self.base_url}{report.id}/", + {"frequency": "weekly"}, + format="json", + ) + self.assertEqual(response.status_code, status.HTTP_200_OK) + report.refresh_from_db() + self.assertEqual(report.frequency, "weekly") + + def test_delete_report_soft_deletes(self): + report = self._create_report() + response = self.client.delete(f"{self.base_url}{report.id}/") + self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) + report.refresh_from_db() + self.assertTrue(report.deleted) + self.assertEqual(EvaluationReport.objects.filter(deleted=False).count(), 0) + + def test_runs_action(self): + report = self._create_report() + EvaluationReportRun.objects.create( + report=report, + content={}, + metadata={}, + period_start=timezone.now() - dt.timedelta(hours=1), + period_end=timezone.now(), + ) + response = self.client.get(f"{self.base_url}{report.id}/runs/") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.json()), 1) + + @patch("posthog.temporal.common.client.sync_connect") + def test_generate_action(self, mock_connect): + report = self._create_report() + mock_client = MagicMock() + mock_client.start_workflow = AsyncMock() + mock_connect.return_value = 
mock_client + + response = self.client.post(f"{self.base_url}{report.id}/generate/") + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + mock_client.start_workflow.assert_called_once() + + @patch("posthog.temporal.common.client.sync_connect") + def test_generate_action_handles_failure(self, mock_connect): + report = self._create_report() + mock_connect.side_effect = Exception("temporal down") + + response = self.client.post(f"{self.base_url}{report.id}/generate/") + self.assertEqual(response.status_code, status.HTTP_500_INTERNAL_SERVER_ERROR) diff --git a/products/llm_analytics/backend/migrations/0023_evaluation_reports.py b/products/llm_analytics/backend/migrations/0023_evaluation_reports.py new file mode 100644 index 000000000000..30df2cb70a41 --- /dev/null +++ b/products/llm_analytics/backend/migrations/0023_evaluation_reports.py @@ -0,0 +1,189 @@ +import django.db.models.deletion +import django.contrib.postgres.fields +from django.conf import settings +from django.db import migrations, models + +import posthog.models.utils + + +class Migration(migrations.Migration): + dependencies = [ + ("posthog", "1043_add_15_minute_interval_to_batch_exports"), + ("llm_analytics", "0022_reviewqueue_reviewqueueitem_and_more"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name="EvaluationReport", + fields=[ + ( + "id", + models.UUIDField( + default=posthog.models.utils.UUIDT, + editable=False, + primary_key=True, + serialize=False, + ), + ), + ( + "frequency", + models.CharField( + choices=[ + ("hourly", "Hourly"), + ("daily", "Daily"), + ("weekly", "Weekly"), + ("every_n", "Every N"), + ], + default="every_n", + max_length=10, + ), + ), + ( + "byweekday", + django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("monday", "Monday"), + ("tuesday", "Tuesday"), + ("wednesday", "Wednesday"), + ("thursday", "Thursday"), + ("friday", "Friday"), + ("saturday", "Saturday"), + ("sunday", "Sunday"), + ], + max_length=10, + ), + blank=True, + default=None, + null=True, + size=None, + ), + ), + ("start_date", models.DateTimeField()), + ("next_delivery_date", models.DateTimeField(blank=True, null=True)), + ("delivery_targets", models.JSONField(default=list)), + ("max_sample_size", models.IntegerField(default=200)), + ("enabled", models.BooleanField(default=True)), + ("deleted", models.BooleanField(default=False)), + ("last_delivered_at", models.DateTimeField(blank=True, null=True)), + ( + "trigger_threshold", + models.IntegerField( + blank=True, + default=100, + help_text="Number of new eval results that triggers a report", + null=True, + ), + ), + ( + "cooldown_minutes", + models.IntegerField( + default=60, + help_text="Minimum minutes between count-triggered reports", + ), + ), + ( + "daily_run_cap", + models.IntegerField( + default=10, + help_text="Maximum count-triggered report runs per calendar day (UTC)", + ), + ), + ("report_prompt_guidance", models.TextField(blank=True, default="")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "team", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="posthog.team", + ), + ), + ( + "evaluation", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="reports", + to="llm_analytics.evaluation", + ), + ), + ( + "created_by", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "ordering": 
["-created_at", "id"], + }, + ), + migrations.AddIndex( + model_name="evaluationreport", + index=models.Index( + fields=["team", "-created_at", "id"], + name="llm_analyti_team_id_b9ccef_idx", + ), + ), + migrations.AddIndex( + model_name="evaluationreport", + index=models.Index( + fields=["next_delivery_date", "enabled", "deleted"], + name="llm_analyti_next_de_a58933_idx", + ), + ), + migrations.CreateModel( + name="EvaluationReportRun", + fields=[ + ( + "id", + models.UUIDField( + default=posthog.models.utils.UUIDT, + editable=False, + primary_key=True, + serialize=False, + ), + ), + ("content", models.JSONField(default=dict)), + ("metadata", models.JSONField(default=dict)), + ("period_start", models.DateTimeField()), + ("period_end", models.DateTimeField()), + ( + "delivery_status", + models.CharField( + choices=[ + ("pending", "Pending"), + ("delivered", "Delivered"), + ("partial_failure", "Partial Failure"), + ("failed", "Failed"), + ], + default="pending", + max_length=20, + ), + ), + ("delivery_errors", models.JSONField(default=list)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "report", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="runs", + to="llm_analytics.evaluationreport", + ), + ), + ], + options={ + "ordering": ["-created_at"], + }, + ), + migrations.AddIndex( + model_name="evaluationreportrun", + index=models.Index( + fields=["report", "-created_at"], + name="llm_analyti_report__f6c41e_idx", + ), + ), + ] diff --git a/products/llm_analytics/backend/migrations/max_migration.txt b/products/llm_analytics/backend/migrations/max_migration.txt index 0d1958d3626f..4e6dfd20ef90 100644 --- a/products/llm_analytics/backend/migrations/max_migration.txt +++ b/products/llm_analytics/backend/migrations/max_migration.txt @@ -1 +1 @@ -0022_reviewqueue_reviewqueueitem_and_more +0023_evaluation_reports diff --git a/products/llm_analytics/backend/models/__init__.py b/products/llm_analytics/backend/models/__init__.py index bb525caf87da..09a989e66c7e 100644 --- a/products/llm_analytics/backend/models/__init__.py +++ b/products/llm_analytics/backend/models/__init__.py @@ -2,6 +2,7 @@ from .clustering_job import ClusteringJob from .datasets import Dataset, DatasetItem from .evaluation_config import EvaluationConfig +from .evaluation_reports import EvaluationReport, EvaluationReportRun from .evaluations import Evaluation from .model_configuration import LLMModelConfiguration from .provider_keys import LLMProvider, LLMProviderKey @@ -14,6 +15,8 @@ "ClusteringJob", "Evaluation", "EvaluationConfig", + "EvaluationReport", + "EvaluationReportRun", "Dataset", "DatasetItem", "LLMModelConfiguration", diff --git a/products/llm_analytics/backend/models/evaluation_reports.py b/products/llm_analytics/backend/models/evaluation_reports.py new file mode 100644 index 000000000000..eaddee81c3cd --- /dev/null +++ b/products/llm_analytics/backend/models/evaluation_reports.py @@ -0,0 +1,180 @@ +from datetime import timedelta +from typing import Literal, cast + +from django.contrib.postgres.fields import ArrayField +from django.db import models +from django.utils import timezone + +from dateutil.rrule import DAILY, FR, HOURLY, MO, SA, SU, TH, TU, WE, WEEKLY, rrule + +from posthog.models.utils import UUIDTModel + +RRULE_WEEKDAY_MAP = { + "monday": MO, + "tuesday": TU, + "wednesday": WE, + "thursday": TH, + "friday": FR, + "saturday": SA, + "sunday": SU, +} + + +class EvaluationReport(UUIDTModel): + class Frequency(models.TextChoices): + HOURLY = "hourly" + DAILY 
= "daily" + WEEKLY = "weekly" + EVERY_N = "every_n" + + TRIGGER_THRESHOLD_MIN = 10 + TRIGGER_THRESHOLD_MAX = 10_000 + TRIGGER_THRESHOLD_DEFAULT = 100 + COOLDOWN_MINUTES_MIN = 60 + COOLDOWN_MINUTES_DEFAULT = 60 + DAILY_RUN_CAP_DEFAULT = 10 + + class ByWeekDay(models.TextChoices): + MONDAY = "monday" + TUESDAY = "tuesday" + WEDNESDAY = "wednesday" + THURSDAY = "thursday" + FRIDAY = "friday" + SATURDAY = "saturday" + SUNDAY = "sunday" + + class Meta: + ordering = ["-created_at", "id"] + indexes = [ + models.Index(fields=["team", "-created_at", "id"]), + models.Index(fields=["next_delivery_date", "enabled", "deleted"]), + ] + + team = models.ForeignKey("posthog.Team", on_delete=models.CASCADE) + evaluation = models.ForeignKey( + "llm_analytics.Evaluation", + on_delete=models.CASCADE, + related_name="reports", + ) + + frequency = models.CharField(max_length=10, choices=Frequency.choices, default=Frequency.EVERY_N) + byweekday: ArrayField = ArrayField( + models.CharField(max_length=10, choices=ByWeekDay.choices), + null=True, + blank=True, + default=None, + ) + start_date = models.DateTimeField() + next_delivery_date = models.DateTimeField(null=True, blank=True) + + delivery_targets = models.JSONField(default=list) + max_sample_size = models.IntegerField(default=200) + enabled = models.BooleanField(default=True) + deleted = models.BooleanField(default=False) + last_delivered_at = models.DateTimeField(null=True, blank=True) + + # Count-based trigger settings (only used when frequency='every_n') + trigger_threshold = models.IntegerField( + null=True, + blank=True, + default=100, + help_text="Number of new eval results that triggers a report", + ) + cooldown_minutes = models.IntegerField( + default=60, + help_text="Minimum minutes between count-triggered reports", + ) + daily_run_cap = models.IntegerField( + default=10, + help_text="Maximum count-triggered report runs per calendar day (UTC)", + ) + + # Optional per-report custom guidance appended to the agent's system prompt. + # Lets users steer focus/scope/section choices without touching the base prompt. + report_prompt_guidance = models.TextField(blank=True, default="") + + created_by = models.ForeignKey("posthog.User", on_delete=models.SET_NULL, null=True, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + + @property + def is_count_triggered(self) -> bool: + return self.frequency == self.Frequency.EVERY_N + + @property + def rrule(self): + if self.is_count_triggered: + raise ValueError("rrule is not available for count-triggered reports (frequency='every_n').") + freq_map: dict[str, int] = { + self.Frequency.HOURLY: HOURLY, + self.Frequency.DAILY: DAILY, + self.Frequency.WEEKLY: WEEKLY, + } + freq = cast(Literal[0, 1, 2, 3, 4, 5, 6], freq_map[self.frequency]) + return rrule( + freq=freq, + dtstart=self.start_date, + byweekday=_to_rrule_weekdays(self.byweekday) if self.byweekday else None, + ) + + SCHEDULE_FIELDS = ("frequency", "byweekday", "start_date") + + def set_next_delivery_date(self, from_dt=None): + if self.is_count_triggered: + # Count-based reports don't have a time-based schedule. + # next_delivery_date is unused — the 5-minute poll checks eval counts. 
+ self.next_delivery_date = None + return + now = timezone.now() + timedelta(minutes=15) + self.next_delivery_date = self.rrule.after(dt=max(from_dt or now, now), inc=False) + + def save(self, *args, **kwargs): + recalc = not self.id or not self.next_delivery_date + if not recalc and self.id: + # If any schedule field changed, recompute next_delivery_date so the + # new cadence takes effect immediately rather than after the stale timestamp. + try: + old = type(self).objects.only(*self.SCHEDULE_FIELDS).get(pk=self.pk) + if any(getattr(old, f) != getattr(self, f) for f in self.SCHEDULE_FIELDS): + recalc = True + except type(self).DoesNotExist: + recalc = True + if recalc: + self.set_next_delivery_date() + if "update_fields" in kwargs and kwargs["update_fields"] is not None: + kwargs["update_fields"].append("next_delivery_date") + super().save(*args, **kwargs) + + +class EvaluationReportRun(UUIDTModel): + class DeliveryStatus(models.TextChoices): + PENDING = "pending" + DELIVERED = "delivered" + PARTIAL_FAILURE = "partial_failure" + FAILED = "failed" + + class Meta: + ordering = ["-created_at"] + indexes = [ + models.Index(fields=["report", "-created_at"]), + ] + + report = models.ForeignKey( + EvaluationReport, + on_delete=models.CASCADE, + related_name="runs", + ) + content = models.JSONField(default=dict) + metadata = models.JSONField(default=dict) + period_start = models.DateTimeField() + period_end = models.DateTimeField() + delivery_status = models.CharField( + max_length=20, + choices=DeliveryStatus.choices, + default=DeliveryStatus.PENDING, + ) + delivery_errors = models.JSONField(default=list) + created_at = models.DateTimeField(auto_now_add=True) + + +def _to_rrule_weekdays(weekdays: list[str]): + return {RRULE_WEEKDAY_MAP[x] for x in weekdays if x in RRULE_WEEKDAY_MAP} diff --git a/products/llm_analytics/backend/models/test/test_evaluation_reports.py b/products/llm_analytics/backend/models/test/test_evaluation_reports.py new file mode 100644 index 000000000000..182182f7e999 --- /dev/null +++ b/products/llm_analytics/backend/models/test/test_evaluation_reports.py @@ -0,0 +1,171 @@ +import datetime as dt + +from posthog.test.base import BaseTest + +from django.utils import timezone + +from parameterized import parameterized + +from products.llm_analytics.backend.models.evaluation_reports import ( + RRULE_WEEKDAY_MAP, + EvaluationReport, + EvaluationReportRun, + _to_rrule_weekdays, +) +from products.llm_analytics.backend.models.evaluations import Evaluation + + +class TestToRruleWeekdays(BaseTest): + def test_single_day(self): + result = _to_rrule_weekdays(["monday"]) + self.assertEqual(len(result), 1) + + def test_multiple_days(self): + result = _to_rrule_weekdays(["monday", "wednesday", "friday"]) + self.assertEqual(len(result), 3) + + def test_ignores_invalid_days(self): + result = _to_rrule_weekdays(["monday", "invalid", "friday"]) + self.assertEqual(len(result), 2) + + def test_empty_list(self): + result = _to_rrule_weekdays([]) + self.assertEqual(len(result), 0) + + @parameterized.expand(list(RRULE_WEEKDAY_MAP.keys())) + def test_all_valid_weekdays(self, day): + result = _to_rrule_weekdays([day]) + self.assertEqual(len(result), 1) + + +class TestEvaluationReportModel(BaseTest): + def _create_evaluation(self) -> Evaluation: + return Evaluation.objects.create( + team=self.team, + name="Test Eval", + evaluation_type="llm_judge", + evaluation_config={"prompt": "test"}, + output_type="boolean", + output_config={}, + enabled=True, + created_by=self.user, + conditions=[{"id": "c1", 
"rollout_percentage": 100, "properties": []}], + ) + + def test_save_sets_next_delivery_date_on_create(self): + now = timezone.now() + evaluation = self._create_evaluation() + report = EvaluationReport.objects.create( + team=self.team, + evaluation=evaluation, + frequency="hourly", + start_date=now - dt.timedelta(hours=2), + delivery_targets=[{"type": "email", "value": "test@example.com"}], + ) + self.assertIsNotNone(report.next_delivery_date) + self.assertGreater(report.next_delivery_date, now) + + def test_hourly_rrule(self): + now = timezone.now() + evaluation = self._create_evaluation() + start = now - dt.timedelta(hours=5) + report = EvaluationReport( + team=self.team, + evaluation=evaluation, + frequency="hourly", + start_date=start, + delivery_targets=[], + ) + next_occurrence = report.rrule.after(now, inc=False) + self.assertIsNotNone(next_occurrence) + self.assertEqual(next_occurrence.minute, start.minute) + + def test_daily_rrule(self): + now = timezone.now() + evaluation = self._create_evaluation() + start = now - dt.timedelta(days=2) + report = EvaluationReport( + team=self.team, + evaluation=evaluation, + frequency="daily", + start_date=start, + delivery_targets=[], + ) + next_occurrence = report.rrule.after(now, inc=False) + self.assertIsNotNone(next_occurrence) + self.assertEqual(next_occurrence.hour, start.hour) + + def test_weekly_rrule_with_byweekday(self): + now = timezone.now() + evaluation = self._create_evaluation() + report = EvaluationReport( + team=self.team, + evaluation=evaluation, + frequency="weekly", + byweekday=["monday", "friday"], + start_date=now - dt.timedelta(weeks=1), + delivery_targets=[], + ) + next_occurrence = report.rrule.after(now, inc=False) + self.assertIsNotNone(next_occurrence) + self.assertIn(next_occurrence.weekday(), [0, 4]) + + def test_set_next_delivery_date_uses_15min_buffer(self): + now = timezone.now() + evaluation = self._create_evaluation() + report = EvaluationReport( + team=self.team, + evaluation=evaluation, + frequency="hourly", + start_date=now - dt.timedelta(hours=1), + delivery_targets=[], + ) + report.set_next_delivery_date() + self.assertGreater(report.next_delivery_date, now + dt.timedelta(minutes=14)) + + def test_set_next_delivery_date_from_custom_dt(self): + now = timezone.now() + evaluation = self._create_evaluation() + from_dt = now + dt.timedelta(hours=2) + report = EvaluationReport( + team=self.team, + evaluation=evaluation, + frequency="hourly", + start_date=now - dt.timedelta(hours=1), + delivery_targets=[], + ) + report.set_next_delivery_date(from_dt=from_dt) + self.assertGreater(report.next_delivery_date, from_dt) + + +class TestEvaluationReportRunModel(BaseTest): + def test_create_report_run(self): + now = timezone.now() + evaluation = Evaluation.objects.create( + team=self.team, + name="Test Eval", + evaluation_type="llm_judge", + evaluation_config={"prompt": "test"}, + output_type="boolean", + output_config={}, + enabled=True, + created_by=self.user, + conditions=[{"id": "c1", "rollout_percentage": 100, "properties": []}], + ) + report = EvaluationReport.objects.create( + team=self.team, + evaluation=evaluation, + frequency="daily", + start_date=now, + delivery_targets=[{"type": "email", "value": "test@example.com"}], + ) + run = EvaluationReportRun.objects.create( + report=report, + content={"executive_summary": {"content": "test", "referenced_generation_ids": []}}, + metadata={"total_runs": 10, "pass_rate": 80.0}, + period_start=now - dt.timedelta(hours=1), + period_end=now, + ) + 
self.assertEqual(run.delivery_status, "pending") + self.assertEqual(run.delivery_errors, []) + self.assertEqual(run.report, report) diff --git a/products/llm_analytics/frontend/generated/api.schemas.ts b/products/llm_analytics/frontend/generated/api.schemas.ts index 06af22ad3d5b..bee6e11bae36 100644 --- a/products/llm_analytics/frontend/generated/api.schemas.ts +++ b/products/llm_analytics/frontend/generated/api.schemas.ts @@ -319,6 +319,140 @@ export interface ClusteringRunRequestApi { clustering_job_id?: string | null } +/** + * * `hourly` - Hourly + * `daily` - Daily + * `weekly` - Weekly + * `every_n` - Every N + */ +export type EvaluationReportFrequencyEnumApi = + (typeof EvaluationReportFrequencyEnumApi)[keyof typeof EvaluationReportFrequencyEnumApi] + +export const EvaluationReportFrequencyEnumApi = { + Hourly: 'hourly', + Daily: 'daily', + Weekly: 'weekly', + EveryN: 'every_n', +} as const + +/** + * * `monday` - Monday + * `tuesday` - Tuesday + * `wednesday` - Wednesday + * `thursday` - Thursday + * `friday` - Friday + * `saturday` - Saturday + * `sunday` - Sunday + */ +export type ByweekdayEnumApi = (typeof ByweekdayEnumApi)[keyof typeof ByweekdayEnumApi] + +export const ByweekdayEnumApi = { + Monday: 'monday', + Tuesday: 'tuesday', + Wednesday: 'wednesday', + Thursday: 'thursday', + Friday: 'friday', + Saturday: 'saturday', + Sunday: 'sunday', +} as const + +export interface EvaluationReportApi { + readonly id: string + evaluation: string + frequency?: EvaluationReportFrequencyEnumApi + /** @nullable */ + byweekday?: ByweekdayEnumApi[] | null + start_date: string + /** @nullable */ + readonly next_delivery_date: string | null + delivery_targets?: unknown + /** + * @minimum -2147483648 + * @maximum 2147483647 + */ + max_sample_size?: number + enabled?: boolean + deleted?: boolean + /** @nullable */ + readonly last_delivered_at: string | null + report_prompt_guidance?: string + /** + * Number of new eval results that triggers a report + * @minimum -2147483648 + * @maximum 2147483647 + * @nullable + */ + trigger_threshold?: number | null + /** + * Minimum minutes between count-triggered reports + * @minimum -2147483648 + * @maximum 2147483647 + */ + cooldown_minutes?: number + /** + * Maximum count-triggered report runs per calendar day (UTC) + * @minimum -2147483648 + * @maximum 2147483647 + */ + daily_run_cap?: number + /** @nullable */ + readonly created_by: number | null + readonly created_at: string +} + +export interface PaginatedEvaluationReportListApi { + count: number + /** @nullable */ + next?: string | null + /** @nullable */ + previous?: string | null + results: EvaluationReportApi[] +} + +export interface PatchedEvaluationReportApi { + readonly id?: string + evaluation?: string + frequency?: EvaluationReportFrequencyEnumApi + /** @nullable */ + byweekday?: ByweekdayEnumApi[] | null + start_date?: string + /** @nullable */ + readonly next_delivery_date?: string | null + delivery_targets?: unknown + /** + * @minimum -2147483648 + * @maximum 2147483647 + */ + max_sample_size?: number + enabled?: boolean + deleted?: boolean + /** @nullable */ + readonly last_delivered_at?: string | null + report_prompt_guidance?: string + /** + * Number of new eval results that triggers a report + * @minimum -2147483648 + * @maximum 2147483647 + * @nullable + */ + trigger_threshold?: number | null + /** + * Minimum minutes between count-triggered reports + * @minimum -2147483648 + * @maximum 2147483647 + */ + cooldown_minutes?: number + /** + * Maximum count-triggered report runs per 
calendar day (UTC) + * @minimum -2147483648 + * @maximum 2147483647 + */ + daily_run_cap?: number + /** @nullable */ + readonly created_by?: number | null + readonly created_at?: string +} + /** * * `all` - all * `pass` - pass @@ -1281,6 +1415,17 @@ export type LlmAnalyticsClusteringJobsListParams = { offset?: number } +export type LlmAnalyticsEvaluationReportsListParams = { + /** + * Number of results to return per page. + */ + limit?: number + /** + * The initial index from which to return the results. + */ + offset?: number +} + export type LlmAnalyticsEvaluationSummaryCreate400 = { [key: string]: unknown } export type LlmAnalyticsEvaluationSummaryCreate403 = { [key: string]: unknown } diff --git a/products/llm_analytics/frontend/generated/api.ts b/products/llm_analytics/frontend/generated/api.ts index 23c7251483a5..25424ef6433c 100644 --- a/products/llm_analytics/frontend/generated/api.ts +++ b/products/llm_analytics/frontend/generated/api.ts @@ -18,6 +18,7 @@ import type { DatasetItemsListParams, DatasetsListParams, EvaluationApi, + EvaluationReportApi, EvaluationSummaryRequestApi, EvaluationSummaryResponseApi, EvaluationsListParams, @@ -27,6 +28,7 @@ import type { LLMPromptResolveResponseApi, LLMProviderKeyApi, LlmAnalyticsClusteringJobsListParams, + LlmAnalyticsEvaluationReportsListParams, LlmAnalyticsProviderKeysListParams, LlmAnalyticsReviewQueueItemsListParams, LlmAnalyticsReviewQueuesListParams, @@ -39,6 +41,7 @@ import type { PaginatedDatasetItemListApi, PaginatedDatasetListApi, PaginatedEvaluationListApi, + PaginatedEvaluationReportListApi, PaginatedLLMPromptListListApi, PaginatedLLMProviderKeyListApi, PaginatedReviewQueueItemListApi, @@ -48,6 +51,7 @@ import type { PatchedClusteringJobApi, PatchedDatasetApi, PatchedDatasetItemApi, + PatchedEvaluationReportApi, PatchedLLMPromptPublishApi, PatchedLLMProviderKeyApi, PatchedReviewQueueItemUpdateApi, @@ -385,6 +389,173 @@ export const llmAnalyticsEvaluationConfigSetActiveKeyCreate = async ( }) } +/** + * CRUD for evaluation report configurations + report run history. + */ +export const getLlmAnalyticsEvaluationReportsListUrl = ( + projectId: string, + params?: LlmAnalyticsEvaluationReportsListParams +) => { + const normalizedParams = new URLSearchParams() + + Object.entries(params || {}).forEach(([key, value]) => { + if (value !== undefined) { + normalizedParams.append(key, value === null ? 'null' : value.toString()) + } + }) + + const stringifiedParams = normalizedParams.toString() + + return stringifiedParams.length > 0 + ? `/api/environments/${projectId}/llm_analytics/evaluation_reports/?${stringifiedParams}` + : `/api/environments/${projectId}/llm_analytics/evaluation_reports/` +} + +export const llmAnalyticsEvaluationReportsList = async ( + projectId: string, + params?: LlmAnalyticsEvaluationReportsListParams, + options?: RequestInit +): Promise => { + return apiMutator(getLlmAnalyticsEvaluationReportsListUrl(projectId, params), { + ...options, + method: 'GET', + }) +} + +/** + * CRUD for evaluation report configurations + report run history. 
+ */ +export const getLlmAnalyticsEvaluationReportsCreateUrl = (projectId: string) => { + return `/api/environments/${projectId}/llm_analytics/evaluation_reports/` +} + +export const llmAnalyticsEvaluationReportsCreate = async ( + projectId: string, + evaluationReportApi: NonReadonly, + options?: RequestInit +): Promise => { + return apiMutator(getLlmAnalyticsEvaluationReportsCreateUrl(projectId), { + ...options, + method: 'POST', + headers: { 'Content-Type': 'application/json', ...options?.headers }, + body: JSON.stringify(evaluationReportApi), + }) +} + +/** + * CRUD for evaluation report configurations + report run history. + */ +export const getLlmAnalyticsEvaluationReportsRetrieveUrl = (projectId: string, id: string) => { + return `/api/environments/${projectId}/llm_analytics/evaluation_reports/${id}/` +} + +export const llmAnalyticsEvaluationReportsRetrieve = async ( + projectId: string, + id: string, + options?: RequestInit +): Promise => { + return apiMutator(getLlmAnalyticsEvaluationReportsRetrieveUrl(projectId, id), { + ...options, + method: 'GET', + }) +} + +/** + * CRUD for evaluation report configurations + report run history. + */ +export const getLlmAnalyticsEvaluationReportsUpdateUrl = (projectId: string, id: string) => { + return `/api/environments/${projectId}/llm_analytics/evaluation_reports/${id}/` +} + +export const llmAnalyticsEvaluationReportsUpdate = async ( + projectId: string, + id: string, + evaluationReportApi: NonReadonly, + options?: RequestInit +): Promise => { + return apiMutator(getLlmAnalyticsEvaluationReportsUpdateUrl(projectId, id), { + ...options, + method: 'PUT', + headers: { 'Content-Type': 'application/json', ...options?.headers }, + body: JSON.stringify(evaluationReportApi), + }) +} + +/** + * CRUD for evaluation report configurations + report run history. + */ +export const getLlmAnalyticsEvaluationReportsPartialUpdateUrl = (projectId: string, id: string) => { + return `/api/environments/${projectId}/llm_analytics/evaluation_reports/${id}/` +} + +export const llmAnalyticsEvaluationReportsPartialUpdate = async ( + projectId: string, + id: string, + patchedEvaluationReportApi: NonReadonly, + options?: RequestInit +): Promise => { + return apiMutator(getLlmAnalyticsEvaluationReportsPartialUpdateUrl(projectId, id), { + ...options, + method: 'PATCH', + headers: { 'Content-Type': 'application/json', ...options?.headers }, + body: JSON.stringify(patchedEvaluationReportApi), + }) +} + +/** + * CRUD for evaluation report configurations + report run history. + */ +export const getLlmAnalyticsEvaluationReportsDestroyUrl = (projectId: string, id: string) => { + return `/api/environments/${projectId}/llm_analytics/evaluation_reports/${id}/` +} + +export const llmAnalyticsEvaluationReportsDestroy = async ( + projectId: string, + id: string, + options?: RequestInit +): Promise => { + return apiMutator(getLlmAnalyticsEvaluationReportsDestroyUrl(projectId, id), { + ...options, + method: 'DELETE', + }) +} + +/** + * Trigger immediate report generation. 
+ */ +export const getLlmAnalyticsEvaluationReportsGenerateCreateUrl = (projectId: string, id: string) => { + return `/api/environments/${projectId}/llm_analytics/evaluation_reports/${id}/generate/` +} + +export const llmAnalyticsEvaluationReportsGenerateCreate = async ( + projectId: string, + id: string, + options?: RequestInit +): Promise => { + return apiMutator(getLlmAnalyticsEvaluationReportsGenerateCreateUrl(projectId, id), { + ...options, + method: 'POST', + }) +} + +/** + * List report runs (history) for this report. + */ +export const getLlmAnalyticsEvaluationReportsRunsRetrieveUrl = (projectId: string, id: string) => { + return `/api/environments/${projectId}/llm_analytics/evaluation_reports/${id}/runs/` +} + +export const llmAnalyticsEvaluationReportsRunsRetrieve = async ( + projectId: string, + id: string, + options?: RequestInit +): Promise => { + return apiMutator(getLlmAnalyticsEvaluationReportsRunsRetrieveUrl(projectId, id), { + ...options, + method: 'GET', + }) +} + /** * Generate an AI-powered summary of evaluation results. diff --git a/products/llm_analytics/mcp/tools.yaml b/products/llm_analytics/mcp/tools.yaml index ab571a3f18f6..fda008946a05 100644 --- a/products/llm_analytics/mcp/tools.yaml +++ b/products/llm_analytics/mcp/tools.yaml @@ -43,10 +43,9 @@ tools: idempotent: true title: List clustering jobs description: > - List all clustering job configurations for the current team (max 5 per team). - Each job defines an analysis level (trace or generation) and event filters that - scope which traces are included in clustering runs. Cluster results are stored - as $ai_trace_clusters and $ai_generation_clusters events — use docs-search or + List all clustering job configurations for the current team (max 5 per team). Each job defines an analysis + level (trace or generation) and event filters that scope which traces are included in clustering runs. + Cluster results are stored as $ai_trace_clusters and $ai_generation_clusters events — use docs-search or execute-sql to query them. llm-analytics-clustering-jobs-create: operation: llm_analytics_clustering_jobs_create @@ -69,19 +68,15 @@ tools: readOnly: false destructive: false idempotent: true - requires_ai_consent: true title: Summarize evaluation results description: > - Generate an AI-powered summary of LLM evaluation results for a given - evaluation config. Pass an evaluation_id and an optional filter - ("all", "pass", "fail", or "na") to scope which runs are analyzed. - Returns an overall assessment, pattern groups for passing, failing, - and N/A runs (each with title, description, frequency, and example - generation IDs), actionable recommendations, and run statistics. - Optionally pass generation_ids to restrict the analysis to specific - runs. Results are cached for one hour — use force_refresh to - recompute. Rate-limited; requires AI data processing approval for - the organization. + Generate an AI-powered summary of LLM evaluation results for a given evaluation config. Pass an + evaluation_id and an optional filter ("all", "pass", "fail", or "na") to scope which runs are analyzed. + Returns an overall assessment, pattern groups for passing, failing, and N/A runs (each with title, + description, frequency, and example generation IDs), actionable recommendations, and run statistics. + Optionally pass generation_ids to restrict the analysis to specific runs. Results are cached for one hour — + use force_refresh to recompute. Rate-limited; requires AI data processing approval for the organization. 
+ requires_ai_consent: true llm-analytics-models-retrieve: operation: llm_analytics_models_retrieve enabled: false @@ -120,11 +115,10 @@ tools: idempotent: true title: Analyze sentiment description: > - Classify sentiment of LLM trace or generation user messages as positive, - neutral, or negative. Pass a list of trace or generation IDs and an - analysis_level ("trace" or "generation"). Returns per-ID sentiment labels - with confidence scores and per-message breakdowns. Results are cached — - use force_refresh to recompute. Rate-limited. + Classify sentiment of LLM trace or generation user messages as positive, neutral, or negative. Pass a list + of trace or generation IDs and an analysis_level ("trace" or "generation"). Returns per-ID sentiment labels + with confidence scores and per-message breakdowns. Results are cached — use force_refresh to recompute. + Rate-limited. llm-analytics-summarization-create: operation: llm_analytics_summarization_create enabled: true @@ -136,11 +130,10 @@ tools: idempotent: true title: Summarize trace or generation description: > - Generate an AI-powered summary of an LLM trace or generation. Pass a - trace_id or generation_id with a date_from — the backend fetches the data - and returns a structured summary with title, flow diagram, summary bullets, - and interesting notes. Results are cached. Use mode "minimal" (default) for - 3-5 points or "detailed" for 5-10 points. Rate-limited. + Generate an AI-powered summary of an LLM trace or generation. Pass a trace_id or generation_id with a + date_from — the backend fetches the data and returns a structured summary with title, flow diagram, summary + bullets, and interesting notes. Results are cached. Use mode "minimal" (default) for 3-5 points or + "detailed" for 5-10 points. Rate-limited. llm-analytics-summarization-batch-check-create: operation: llm_analytics_summarization_batch_check_create enabled: false @@ -215,8 +208,8 @@ tools: idempotent: true title: Get clustering job description: > - Retrieve a specific clustering job configuration by ID. Returns the job name, - analysis level (trace or generation), event filters, enabled status, and timestamps. + Retrieve a specific clustering job configuration by ID. Returns the job name, analysis level (trace or + generation), event filters, enabled status, and timestamps. 
llm-analytics-clustering-jobs-update: operation: llm_analytics_clustering_jobs_update enabled: false @@ -298,3 +291,27 @@ tools: llm-analytics-provider-keys-assign-create: operation: llm_analytics_provider_keys_assign_create enabled: false + llm-analytics-evaluation-reports-list: + operation: llm_analytics_evaluation_reports_list + enabled: false + llm-analytics-evaluation-reports-create: + operation: llm_analytics_evaluation_reports_create + enabled: false + llm-analytics-evaluation-reports-retrieve: + operation: llm_analytics_evaluation_reports_retrieve + enabled: false + llm-analytics-evaluation-reports-update: + operation: llm_analytics_evaluation_reports_update + enabled: false + llm-analytics-evaluation-reports-partial-update: + operation: llm_analytics_evaluation_reports_partial_update + enabled: false + llm-analytics-evaluation-reports-destroy: + operation: llm_analytics_evaluation_reports_destroy + enabled: false + llm-analytics-evaluation-reports-generate-create: + operation: llm_analytics_evaluation_reports_generate_create + enabled: false + llm-analytics-evaluation-reports-runs-retrieve: + operation: llm_analytics_evaluation_reports_runs_retrieve + enabled: false diff --git a/services/mcp/src/api/generated.ts b/services/mcp/src/api/generated.ts index 388914a4efc5..aedcbfb5bd93 100644 --- a/services/mcp/src/api/generated.ts +++ b/services/mcp/src/api/generated.ts @@ -14467,6 +14467,66 @@ export namespace Schemas { example_generation_ids: string[]; } + /** + * * `hourly` - Hourly + * `daily` - Daily + * `weekly` - Weekly + * `every_n` - Every N + */ + export type EvaluationReportFrequencyEnum = typeof EvaluationReportFrequencyEnum[keyof typeof EvaluationReportFrequencyEnum]; + + + export const EvaluationReportFrequencyEnum = { + Hourly: 'hourly', + Daily: 'daily', + Weekly: 'weekly', + EveryN: 'every_n', + } as const; + + export interface EvaluationReport { + readonly id: string; + evaluation: string; + frequency?: EvaluationReportFrequencyEnum; + /** @nullable */ + byweekday?: ByweekdayEnum[] | null; + start_date: string; + /** @nullable */ + readonly next_delivery_date: string | null; + delivery_targets?: unknown; + /** + * @minimum -2147483648 + * @maximum 2147483647 + */ + max_sample_size?: number; + enabled?: boolean; + deleted?: boolean; + /** @nullable */ + readonly last_delivered_at: string | null; + report_prompt_guidance?: string; + /** + * Number of new eval results that triggers a report + * @minimum -2147483648 + * @maximum 2147483647 + * @nullable + */ + trigger_threshold?: number | null; + /** + * Minimum minutes between count-triggered reports + * @minimum -2147483648 + * @maximum 2147483647 + */ + cooldown_minutes?: number; + /** + * Maximum count-triggered report runs per calendar day (UTC) + * @minimum -2147483648 + * @maximum 2147483647 + */ + daily_run_cap?: number; + /** @nullable */ + readonly created_by: number | null; + readonly created_at: string; + } + /** * * `all` - all * `pass` - pass @@ -16195,22 +16255,6 @@ export namespace Schemas { refreshing: boolean; } - /** - * * `daily` - Daily - * `weekly` - Weekly - * `monthly` - Monthly - * `yearly` - Yearly - */ - export type FrequencyEnum = typeof FrequencyEnum[keyof typeof FrequencyEnum]; - - - export const FrequencyEnum = { - Daily: 'daily', - Weekly: 'weekly', - Monthly: 'monthly', - Yearly: 'yearly', - } as const; - export type GenerateRequestStepsItem = {[key: string]: unknown}; export interface GenerateRequest { @@ -20494,6 +20538,15 @@ export namespace Schemas { results: Evaluation[]; } + export 
interface PaginatedEvaluationReportList { + count: number; + /** @nullable */ + next?: string | null; + /** @nullable */ + previous?: string | null; + results: EvaluationReport[]; + } + export interface PaginatedEventSchemaList { count: number; /** @nullable */ @@ -21719,6 +21772,22 @@ export namespace Schemas { Webhook: 'webhook', } as const; + /** + * * `daily` - Daily + * `weekly` - Weekly + * `monthly` - Monthly + * `yearly` - Yearly + */ + export type SubscriptionFrequencyEnum = typeof SubscriptionFrequencyEnum[keyof typeof SubscriptionFrequencyEnum]; + + + export const SubscriptionFrequencyEnum = { + Daily: 'daily', + Weekly: 'weekly', + Monthly: 'monthly', + Yearly: 'yearly', + } as const; + /** * Standard Subscription serializer. */ @@ -21735,7 +21804,7 @@ export namespace Schemas { dashboard_export_insights?: number[]; target_type: TargetTypeEnum; target_value: string; - frequency: FrequencyEnum; + frequency: SubscriptionFrequencyEnum; /** * @minimum -2147483648 * @maximum 2147483647 @@ -23913,6 +23982,50 @@ export namespace Schemas { deleted?: boolean; } + export interface PatchedEvaluationReport { + readonly id?: string; + evaluation?: string; + frequency?: EvaluationReportFrequencyEnum; + /** @nullable */ + byweekday?: ByweekdayEnum[] | null; + start_date?: string; + /** @nullable */ + readonly next_delivery_date?: string | null; + delivery_targets?: unknown; + /** + * @minimum -2147483648 + * @maximum 2147483647 + */ + max_sample_size?: number; + enabled?: boolean; + deleted?: boolean; + /** @nullable */ + readonly last_delivered_at?: string | null; + report_prompt_guidance?: string; + /** + * Number of new eval results that triggers a report + * @minimum -2147483648 + * @maximum 2147483647 + * @nullable + */ + trigger_threshold?: number | null; + /** + * Minimum minutes between count-triggered reports + * @minimum -2147483648 + * @maximum 2147483647 + */ + cooldown_minutes?: number; + /** + * Maximum count-triggered report runs per calendar day (UTC) + * @minimum -2147483648 + * @maximum 2147483647 + */ + daily_run_cap?: number; + /** @nullable */ + readonly created_by?: number | null; + readonly created_at?: string; + } + export interface PatchedEventSchema { readonly id?: string; event_definition?: string; @@ -25550,7 +25663,7 @@ export namespace Schemas { dashboard_export_insights?: number[]; target_type?: TargetTypeEnum; target_value?: string; - frequency?: FrequencyEnum; + frequency?: SubscriptionFrequencyEnum; /** * @minimum -2147483648 * @maximum 2147483647 @@ -33585,6 +33698,17 @@ export namespace Schemas { offset?: number; }; + export type LlmAnalyticsEvaluationReportsListParams = { + /** + * Number of results to return per page. + */ + limit?: number; + /** + * The initial index from which to return the results. + */ + offset?: number; + }; + export type LlmAnalyticsEvaluationSummaryCreate400 = {[key: string]: unknown}; export type LlmAnalyticsEvaluationSummaryCreate403 = {[key: string]: unknown};
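For context, a minimal sketch of how a client could exercise the evaluation-report endpoints this diff introduces. Everything below is illustrative: the host, project id, credentials, and evaluation id are placeholders, and `requests` merely stands in for whatever HTTP client a consumer actually uses.

import requests  # assumption: any HTTP client works; requests is used for brevity

HOST = "https://app.posthog.example"  # placeholder host
PROJECT_ID = 1  # placeholder environment id
HEADERS = {"Authorization": "Bearer <personal-api-key>"}  # placeholder credentials

base = f"{HOST}/api/environments/{PROJECT_ID}/llm_analytics/evaluation_reports/"

# Create a daily report for an existing evaluation (the uuid is a placeholder).
report = requests.post(
    base,
    headers=HEADERS,
    json={
        "evaluation": "<evaluation-uuid>",
        "frequency": "daily",
        "start_date": "2026-01-01T00:00:00Z",
        "delivery_targets": [{"type": "email", "value": "team@example.com"}],
    },
).json()

# Trigger an immediate run: the viewset responds 202 and hands off to a Temporal workflow.
requests.post(f"{base}{report['id']}/generate/", headers=HEADERS)

# Fetch run history; the `runs` action returns at most the 50 most recent runs.
runs = requests.get(f"{base}{report['id']}/runs/", headers=HEADERS).json()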