diff --git a/frontend/src/lib/constants.tsx b/frontend/src/lib/constants.tsx index 7ba1766db390..0bb970db07a6 100644 --- a/frontend/src/lib/constants.tsx +++ b/frontend/src/lib/constants.tsx @@ -324,6 +324,7 @@ export const FEATURE_FLAGS = { LLM_ANALYTICS_TRACE_NAVIGATION: 'llm-analytics-trace-navigation', // owner: #team-llm-analytics LLM_ANALYTICS_EVALUATIONS_CUSTOM_MODELS: 'llm-analytics-evaluations-custom-models', // owner: #team-llm-analytics LLM_ANALYTICS_EVALUATIONS_HOG_CODE: 'llm-analytics-evaluations-hog-code', // owner: #team-llm-analytics + LLM_ANALYTICS_EVALUATIONS_REPORTS: 'llm-analytics-evaluations-reports', // owner: #team-llm-analytics LLM_ANALYTICS_EVALUATIONS_SUMMARY: 'llm-analytics-evaluations-summary', // owner: #team-llm-analytics LLM_ANALYTICS_SESSION_SUMMARIZATION: 'llm-analytics-session-summarization', // owner: #team-llm-analytics LLM_ANALYTICS_CLUSTERS_TAB: 'llm-analytics-clusters-tab', // owner: #team-llm-analytics @@ -392,7 +393,6 @@ export const FEATURE_FLAGS = { ONBOARDING_WIZARD_PROMINENCE: 'onboarding-wizard-prominence', // owner: #team-growth multivariate=control,wizard-hero,wizard-tab,wizard-only ONBOARDING_WIZARD_INSTALLATION_IMPROVED_COPY: 'onboarding-wizard-installation-improved-copy', // owner: @fercgomes #team-growth multivariate=control,test ONBOARDING_MOBILE_INSTALL_HELPER: 'onboarding-mobile-install-helper', // owner: @fercgomes #team-growth multivariate=control,test — target $device_type=Mobile at the flag level - ONBOARDING_DATA_WAREHOUSE_VALUE_PROP: 'onboarding-data-warehouse-value-prop', // owner: @fercgomes #team-growth multivariate=control,table,query OWNER_ONLY_BILLING: 'owner-only-billing', // owner: @pawelcebula #team-billing POST_ONBOARDING_MODAL_EXPERIMENT: 'post-onboarding-modal-experiment', // owner: @fercgomes #team-growth multivariate=control,test PASSKEY_SIGNUP_ENABLED: 'passkey-signup-enabled', // owner: @reecejones #team-platform-features @@ -445,7 +445,6 @@ export const FEATURE_FLAGS = { SURVEYS_FORM_BUILDER: 'surveys-form-builder', // owner: @adboio #team-surveys SURVEY_HEADLINE_SUMMARY: 'survey-headline-summary', // owner: @adboio #team-surveys SURVEYS_INSIGHT_BUTTON_EXPERIMENT: 'ask-users-why-ai-vs-quickcreate', // owner: @adboio #team-surveys multivariate=true - SURVEYS_TOOLBAR: 'surveys-toolbar', // owner: @fcgomes SURVEYS_WEB_ANALYTICS_CROSS_SELL: 'surveys-in-web-analytics', // owner: @adboio #team-surveys TASK_SUMMARIES: 'task-summaries', // owner: #team-llm-analytics TASK_TOOL: 'phai-task-tool', // owner: @kappa90 #team-posthog-ai diff --git a/products/llm_analytics/frontend/evaluations/LLMAnalyticsEvaluation.tsx b/products/llm_analytics/frontend/evaluations/LLMAnalyticsEvaluation.tsx index 06d6a895b706..12492391d81f 100644 --- a/products/llm_analytics/frontend/evaluations/LLMAnalyticsEvaluation.tsx +++ b/products/llm_analytics/frontend/evaluations/LLMAnalyticsEvaluation.tsx @@ -35,6 +35,8 @@ import { modelPickerLogic } from '../modelPickerLogic' import { providerKeyStateIssueDescription, providerLabel } from '../settings/providerKeyStateUtils' import { EvaluationCodeEditor } from './components/EvaluationCodeEditor' import { EvaluationPromptEditor } from './components/EvaluationPromptEditor' +import { EvaluationReportConfig } from './components/EvaluationReportConfig' +import { EvaluationReportsTab } from './components/EvaluationReportsTab' import { EvaluationRunsTable } from './components/EvaluationRunsTable' import { EvaluationTriggers } from './components/EvaluationTriggers' import { LLMEvaluationLogicProps, llmEvaluationLogic } from './llmEvaluationLogic' @@ -297,6 +299,18 @@ export function LLMAnalyticsEvaluation(): JSX.Element { ), }, + !isNewEvaluation && + !!featureFlags[FEATURE_FLAGS.LLM_ANALYTICS_EVALUATIONS_REPORTS] && { + key: 'reports', + label: 'Reports', + 'data-attr': 'llma-evaluation-reports-tab', + content: ( + setActiveTab('configuration')} + /> + ), + }, { key: 'configuration', label: 'Configuration', @@ -453,7 +467,20 @@ export function LLMAnalyticsEvaluation(): JSX.Element {

+ + {/* Scheduled Reports (inline config for new evaluations) */} + {isNewEvaluation && + featureFlags[FEATURE_FLAGS.LLM_ANALYTICS_EVALUATIONS_REPORTS] && ( + + )} + + {/* Scheduled Reports (for existing evaluations, outside the form) */} + {!isNewEvaluation && featureFlags[FEATURE_FLAGS.LLM_ANALYTICS_EVALUATIONS_REPORTS] && ( +
+ +
+ )} ), }, diff --git a/products/llm_analytics/frontend/evaluations/components/EvaluationReportConfig.tsx b/products/llm_analytics/frontend/evaluations/components/EvaluationReportConfig.tsx new file mode 100644 index 000000000000..49eac5149bfa --- /dev/null +++ b/products/llm_analytics/frontend/evaluations/components/EvaluationReportConfig.tsx @@ -0,0 +1,459 @@ +import { useActions, useValues } from 'kea' +import { useEffect, useState } from 'react' + +import { LemonButton, LemonDialog, LemonInput, LemonSelect, LemonSwitch, LemonTextArea } from '@posthog/lemon-ui' + +import { IntegrationChoice } from 'lib/components/CyclotronJob/integrations/IntegrationChoice' +import { integrationsLogic } from 'lib/integrations/integrationsLogic' +import { SlackChannelPicker, SlackNotConfiguredBanner } from 'lib/integrations/SlackIntegrationHelpers' + +import { evaluationReportLogic } from '../evaluationReportLogic' +import type { EvaluationReportDeliveryTarget, EvaluationReportFrequency } from '../types' + +const GUIDANCE_PLACEHOLDER = + "Optional guidance for the report agent. e.g. 'Focus on cost regressions across models', 'Compare latency between gpt-4o-mini and claude-sonnet', 'Keep it to 2 sections max'" + +const FREQUENCY_OPTIONS = [ + { value: 'hourly' as const, label: 'Hourly' }, + { value: 'daily' as const, label: 'Daily' }, + { value: 'weekly' as const, label: 'Weekly' }, + { value: 'every_n' as const, label: 'Every N evaluations' }, +] + +const TRIGGER_THRESHOLD_MIN = 10 +const TRIGGER_THRESHOLD_MAX = 10_000 +const TRIGGER_THRESHOLD_DEFAULT = 100 + +/** Threshold config shown when frequency is 'every_n' */ +function ThresholdConfig({ value, onChange }: { value: number; onChange: (value: number) => void }): JSX.Element { + return ( +
+ + onChange(Number(val))} + fullWidth + /> +

+ A report will be generated after this many new evaluation results arrive. Checked every 5 minutes. Min{' '} + {TRIGGER_THRESHOLD_MIN}, max {TRIGGER_THRESHOLD_MAX.toLocaleString()}. Cooldown: at most one report per + hour, up to 10 per day. +

+
+ ) +} + +/** Shared delivery targets configuration */ +function DeliveryTargetsConfig({ + emailValue, + onEmailChange, + slackIntegrationId, + onSlackIntegrationChange, + slackChannelValue, + onSlackChannelChange, +}: { + emailValue: string + onEmailChange: (value: string) => void + slackIntegrationId: number | null + onSlackIntegrationChange: (value: number | null) => void + slackChannelValue: string + onSlackChannelChange: (value: string) => void +}): JSX.Element { + const { slackIntegrations, integrations } = useValues(integrationsLogic) + + return ( + <> +
+ + +

Comma-separated email addresses

+
+
+ + {!slackIntegrations?.length ? ( + + ) : ( +
+ { + if (newValue !== slackIntegrationId) { + onSlackChannelChange('') + } + onSlackIntegrationChange(newValue) + }} + /> + {slackIntegrationId && + (() => { + const selectedIntegration = integrations?.find((i) => i.id === slackIntegrationId) + return selectedIntegration ? ( + onSlackChannelChange(val || '')} + integration={selectedIntegration} + /> + ) : null + })()} +
+ )} +
+ + ) +} + +/** Inline config shown during new evaluation creation */ +function PendingReportConfig({ evaluationId }: { evaluationId: string }): JSX.Element { + const { pendingConfig } = useValues(evaluationReportLogic({ evaluationId })) + const { + setPendingEnabled, + setPendingFrequency, + setPendingEmailValue, + setPendingSlackIntegrationId, + setPendingSlackChannelValue, + setPendingReportPromptGuidance, + setPendingTriggerThreshold, + } = useActions(evaluationReportLogic({ evaluationId })) + + return ( +
+
+
+

Scheduled reports

+

+ AI-generated analysis of evaluation results. Reports are always available in the Reports tab. + Optionally add email or Slack to get notified. +

+
+ +
+ + {pendingConfig.enabled && ( +
+
+ + val && setPendingFrequency(val)} + options={FREQUENCY_OPTIONS} + fullWidth + /> +
+ {pendingConfig.frequency === 'every_n' && ( + + )} + +
+ + +

+ Steers the agent's focus, section choices, or scope. Appended to the base prompt. +

+
+
+ )} +
+ ) +} + +/** Toggle-based report management for existing evaluations */ +function ExistingReportConfig({ evaluationId }: { evaluationId: string }): JSX.Element { + const logic = evaluationReportLogic({ evaluationId }) + const { activeReport, reportsLoading } = useValues(logic) + const { updateReport, deleteReport, createReport } = useActions(logic) + + // Local state: toggle controls form visibility, Save button creates the report. + // All fields are local-first so the user can change multiple things before saving. + const [formEnabled, setFormEnabled] = useState(false) + const [frequency, setFrequency] = useState('daily') + const [emailValue, setEmailValue] = useState('') + const [slackIntegrationId, setSlackIntegrationId] = useState(null) + const [slackChannelValue, setSlackChannelValue] = useState('') + const [guidance, setGuidance] = useState('') + const [triggerThreshold, setTriggerThreshold] = useState(TRIGGER_THRESHOLD_DEFAULT) + + // Seed local form state from the active report so the user can edit + // any field without having to disable + recreate the schedule. + useEffect(() => { + if (!activeReport) { + return + } + const emailTarget = activeReport.delivery_targets.find( + (t: EvaluationReportDeliveryTarget) => t.type === 'email' + ) + const slackTarget = activeReport.delivery_targets.find( + (t: EvaluationReportDeliveryTarget) => t.type === 'slack' + ) + setFrequency(activeReport.frequency) + setEmailValue(emailTarget?.value ?? '') + setSlackIntegrationId(slackTarget?.integration_id ?? null) + setSlackChannelValue(slackTarget?.channel ?? '') + setGuidance(activeReport.report_prompt_guidance ?? '') + setTriggerThreshold(activeReport.trigger_threshold ?? TRIGGER_THRESHOLD_DEFAULT) + }, [activeReport]) + + const isEnabled = !!activeReport || formEnabled + + const handleToggle = (checked: boolean): void => { + if (checked) { + setFormEnabled(true) + } else if (activeReport) { + LemonDialog.open({ + title: 'Disable scheduled reports?', + description: 'This will stop all future report deliveries. Past reports will be preserved.', + primaryButton: { + children: 'Disable', + status: 'danger', + onClick: () => deleteReport(activeReport.id), + }, + secondaryButton: { children: 'Cancel' }, + }) + } else { + setFormEnabled(false) + } + } + + const hasEmail = emailValue.trim().length > 0 + const hasSlack = slackIntegrationId !== null && slackChannelValue.length > 0 + + const handleSave = (): void => { + const targets: EvaluationReportDeliveryTarget[] = [] + if (hasEmail) { + targets.push({ type: 'email', value: emailValue.trim() }) + } + if (hasSlack) { + targets.push({ type: 'slack', integration_id: slackIntegrationId!, channel: slackChannelValue }) + } + createReport({ + evaluationId, + frequency, + delivery_targets: targets, + report_prompt_guidance: guidance, + trigger_threshold: frequency === 'every_n' ? triggerThreshold : null, + }) + setFormEnabled(false) + } + + return ( +
+
+
+

Scheduled reports

+

+ AI-generated analysis of evaluation results. Reports are always available in the Reports tab. + Optionally add email or Slack to get notified. +

+
+ +
+ + {activeReport ? ( +
+
+ + val && setFrequency(val)} + options={FREQUENCY_OPTIONS} + fullWidth + /> +
+ + {frequency === 'every_n' && ( + + )} + + + +
+ + +

+ Steers the agent's focus, section choices, or scope. Appended to the base prompt. +

+
+ + {(() => { + const currentEmail = + activeReport.delivery_targets.find( + (t: EvaluationReportDeliveryTarget) => t.type === 'email' + )?.value ?? '' + const currentSlack = activeReport.delivery_targets.find( + (t: EvaluationReportDeliveryTarget) => t.type === 'slack' + ) + const currentSlackIntegrationId: number | null = currentSlack?.integration_id ?? null + const currentSlackChannel = currentSlack?.channel ?? '' + const currentGuidance = activeReport.report_prompt_guidance ?? '' + const currentThreshold = activeReport.trigger_threshold ?? TRIGGER_THRESHOLD_DEFAULT + const frequencyDirty = frequency !== activeReport.frequency + const targetsDirty = + emailValue.trim() !== currentEmail || + slackIntegrationId !== currentSlackIntegrationId || + slackChannelValue !== currentSlackChannel + const guidanceDirty = guidance !== currentGuidance + const thresholdDirty = frequency === 'every_n' && triggerThreshold !== currentThreshold + const isDirty = frequencyDirty || targetsDirty || guidanceDirty || thresholdDirty + const hasAnyTarget = hasEmail || hasSlack + return ( +
+ { + const targets: EvaluationReportDeliveryTarget[] = [] + if (hasEmail) { + targets.push({ type: 'email', value: emailValue.trim() }) + } + if (hasSlack) { + targets.push({ + type: 'slack', + integration_id: slackIntegrationId!, + channel: slackChannelValue, + }) + } + const data: Record = { + frequency, + delivery_targets: targets, + report_prompt_guidance: guidance, + } + if (frequency === 'every_n') { + data.trigger_threshold = triggerThreshold + } + updateReport({ + reportId: activeReport.id, + data, + }) + }} + > + Save changes + +
+ ) + })()} + + {frequency === 'every_n' ? ( +
+ A report will be generated when{' '} + {activeReport.trigger_threshold ?? TRIGGER_THRESHOLD_DEFAULT} new evaluation results arrive. + Checked every 5 minutes. +
+ ) : ( + activeReport.next_delivery_date && ( +
+ Next delivery: {new Date(activeReport.next_delivery_date).toLocaleString()} +
+ ) + )} + +

Generated reports appear in the Reports tab.

+
+ ) : ( + formEnabled && ( +
+
+ + val && setFrequency(val)} + options={FREQUENCY_OPTIONS} + fullWidth + /> +
+ {frequency === 'every_n' && ( + + )} + +
+ + +

+ Steers the agent's focus, section choices, or scope. Appended to the base prompt. +

+
+
+ + Save report schedule + +
+
+ ) + )} +
+ ) +} + +export function EvaluationReportConfig({ evaluationId }: { evaluationId: string }): JSX.Element { + if (evaluationId === 'new') { + return + } + return +} diff --git a/products/llm_analytics/frontend/evaluations/components/EvaluationReportViewer.tsx b/products/llm_analytics/frontend/evaluations/components/EvaluationReportViewer.tsx new file mode 100644 index 000000000000..9b8d225ea399 --- /dev/null +++ b/products/llm_analytics/frontend/evaluations/components/EvaluationReportViewer.tsx @@ -0,0 +1,265 @@ +import { useMemo, useState } from 'react' + +import { LemonBadge, LemonButton, LemonCollapse, LemonDivider } from '@posthog/lemon-ui' + +import { LemonMarkdown } from 'lib/lemon-ui/LemonMarkdown' +import { urls } from 'scenes/urls' + +import type { + EvaluationReportMetrics, + EvaluationReportRun, + EvaluationReportRunContent, + EvaluationReportSection, +} from '../types' + +// Match any UUID in the content — surrounding punctuation (backticks, angle brackets, etc.) +// is stripped so we don't depend on how the LLM formats references. +const UUID_REGEX = /[`<]*([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})[`>]*/g + +// Rewrite `` backtick tokens into markdown links pointing to the correct +// trace URL. Uses the citations list to map generation_id → trace_id so the link +// opens the right trace with the generation highlighted. +function linkifyUuids(content: string, citationMap: Record): string { + return content.replace(UUID_REGEX, (_match, generationId: string) => { + const traceId = citationMap[generationId] + const url = traceId + ? urls.llmAnalyticsTrace(traceId, { event: generationId }) + : urls.llmAnalyticsTrace(generationId) + return `[\`${generationId.slice(0, 8)}...\`](${url})` + }) +} + +// Strip a leading markdown heading line if it matches the section title. +// The agent sometimes prefixes each section's content with its own heading, +// which duplicates the heading the renderer emits separately. +function stripRedundantLeadingHeading(content: string, sectionTitle: string): string { + const match = content.match(/^\s*(#{1,6})\s+(.+?)\s*(?:\r?\n|$)/) + if (!match) { + return content + } + const headingText = match[2].trim().toLowerCase() + if (headingText.startsWith(sectionTitle.toLowerCase())) { + return content.slice(match[0].length).replace(/^\s+/, '') + } + return content +} + +function ReportSectionContent({ + section, + citationMap, +}: { + section: EvaluationReportSection + citationMap: Record +}): JSX.Element { + const markdown = linkifyUuids(stripRedundantLeadingHeading(section.content, section.title), citationMap) + return ( + + {markdown} + + ) +} + +function formatPassRate(rate: number | null | undefined): string { + if (rate == null) { + return '—' + } + return `${rate.toFixed(2)}%` +} + +function MetricsCard({ metrics }: { metrics: EvaluationReportMetrics }): JSX.Element { + // Period-over-period delta (if we have a previous pass rate to compare) + let deltaEl: JSX.Element | null = null + if (metrics.previous_pass_rate != null) { + const diff = metrics.pass_rate - metrics.previous_pass_rate + const arrow = diff > 0 ? '▲' : diff < 0 ? '▼' : '—' + const color = diff > 0 ? 'text-success' : diff < 0 ? 'text-danger' : 'text-muted' + deltaEl = ( + + {arrow} {Math.abs(diff).toFixed(2)}pp vs previous + + ) + } + + return ( +
+
+
+
Pass rate
+
+ {formatPassRate(metrics.pass_rate)} + {deltaEl} +
+
+
+
Total runs
+
{metrics.total_runs}
+
+
+
Pass
+
{metrics.pass_count}
+
+
+
Fail
+
{metrics.fail_count}
+
+
+
N/A
+
{metrics.na_count}
+
+ {metrics.previous_total_runs != null && ( +
+
Previous runs
+
{metrics.previous_total_runs}
+
+ )} +
+
+ ) +} + +function DeliveryStatusBadge({ status }: { status: string }): JSX.Element { + const statusMap: Record = { + delivered: { label: 'Delivered', status: 'success' }, + pending: { label: 'Pending', status: 'muted' }, + partial_failure: { label: 'Partial failure', status: 'warning' }, + failed: { label: 'Failed', status: 'danger' }, + } + const info = statusMap[status] || { label: status, status: 'muted' as const } + return +} + +export function EvaluationReportViewer({ + reportRun, + onClose, + compact = false, +}: { + reportRun: EvaluationReportRun + onClose?: () => void + /** When true, hides the header/close row — useful when the parent already provides framing (e.g. an expanded table row). */ + compact?: boolean +}): JSX.Element { + const content = reportRun.content as EvaluationReportRunContent + const sections = content.sections ?? [] + const metrics = content.metrics + + // Build generation_id → trace_id lookup from citations for correct trace URLs + const citationMap = useMemo(() => { + const map: Record = {} + for (const c of content.citations ?? []) { + if (c.generation_id && c.trace_id) { + map[c.generation_id] = c.trace_id + } + } + return map + }, [content.citations]) + + // Default to executive summary (first section) expanded. Memoized so Expand/Collapse all + // buttons can set the list deterministically. + const sectionKeys = useMemo(() => sections.map((_, i) => i.toString()), [sections]) + const [expandedKeys, setExpandedKeys] = useState(sections.length > 0 ? ['0'] : []) + + const allExpanded = expandedKeys.length === sectionKeys.length && sectionKeys.length > 0 + const allCollapsed = expandedKeys.length === 0 + + return ( +
+ {!compact && ( + <> +
+
+

{content.title || 'Report'}

+ +
+ {onClose && ( + + Close + + )} +
+
+ Period: {new Date(reportRun.period_start).toLocaleString()} –{' '} + {new Date(reportRun.period_end).toLocaleString()} +
+ + + + )} + + {compact && ( +
+ {content.title ?

{content.title}

:
} + {sections.length > 0 && ( +
+ setExpandedKeys(sectionKeys)} + disabledReason={allExpanded ? 'All sections already expanded' : undefined} + > + Expand all + + setExpandedKeys([])} + disabledReason={allCollapsed ? 'All sections already collapsed' : undefined} + > + Collapse all + +
+ )} +
+ )} + + {metrics && } + + {sections.length > 0 && ( + <> + {!compact && ( +
+ setExpandedKeys(sectionKeys)} + disabledReason={allExpanded ? 'All sections already expanded' : undefined} + > + Expand all + + setExpandedKeys([])} + disabledReason={allCollapsed ? 'All sections already collapsed' : undefined} + > + Collapse all + +
+ )} + + setExpandedKeys(keys as string[])} + panels={sections.map((section, idx) => ({ + key: idx.toString(), + header: section.title, + content: , + }))} + /> + + )} + + {reportRun.delivery_errors.length > 0 && ( +
+

Delivery errors

+
    + {reportRun.delivery_errors.map((err, i) => ( +
  • {err}
  • + ))} +
+
+ )} +
+ ) +} diff --git a/products/llm_analytics/frontend/evaluations/components/EvaluationReportsTab.tsx b/products/llm_analytics/frontend/evaluations/components/EvaluationReportsTab.tsx new file mode 100644 index 000000000000..4fec882bb347 --- /dev/null +++ b/products/llm_analytics/frontend/evaluations/components/EvaluationReportsTab.tsx @@ -0,0 +1,156 @@ +import { useActions, useValues } from 'kea' + +import { IconInfo } from '@posthog/icons' +import { LemonButton, LemonTable, LemonTag, Tooltip } from '@posthog/lemon-ui' + +import { TZLabel } from 'lib/components/TZLabel' + +import { evaluationReportLogic } from '../evaluationReportLogic' +import type { EvaluationReportRun } from '../types' +import { EvaluationReportViewer } from './EvaluationReportViewer' + +interface EvaluationReportsTabProps { + evaluationId: string + /** Called when the user clicks the "Set up scheduled reports" CTA in the empty state. */ + onConfigureClick?: () => void +} + +const STATUS_STYLES: Record< + EvaluationReportRun['delivery_status'], + { label: string; type: 'success' | 'warning' | 'danger' | 'muted' } +> = { + delivered: { label: 'Delivered', type: 'success' }, + pending: { label: 'Pending', type: 'muted' }, + partial_failure: { label: 'Partial failure', type: 'warning' }, + failed: { label: 'Failed', type: 'danger' }, +} + +export function EvaluationReportsTab({ evaluationId, onConfigureClick }: EvaluationReportsTabProps): JSX.Element { + const logic = evaluationReportLogic({ evaluationId }) + const { reportRuns, reportRunsLoading, reportsLoading, activeReport, generateResultLoading } = useValues(logic) + const { generateReport, loadReportRuns } = useActions(logic) + + // No schedule configured at all → CTA pointing to the Configuration tab. + // Avoids hiding the Reports tab entirely so it stays discoverable. + if (!reportsLoading && !activeReport) { + return ( +
+
+

No scheduled reports yet

+

+ Scheduled reports deliver AI-generated analysis of this evaluation's results to email or Slack + on a recurring basis. +

+ {onConfigureClick && ( + + Set up scheduled reports + + )} +
+
+ ) + } + + return ( +
+
+

+ History of AI-generated reports for this evaluation. Click a row to expand the full report. Schedule + and delivery targets are configured in the Configuration tab. +

+ {activeReport && ( +
+ loadReportRuns(activeReport.id)} + loading={reportRunsLoading} + > + Refresh + + generateReport(activeReport.id)} + loading={generateResultLoading} + > + Generate now + +
+ )} +
+ + , + }, + { + title: 'Title', + key: 'title', + render: (_, run: EvaluationReportRun) => ( + + {run.content?.title || '–'} + + ), + }, + { + title: 'Pass rate', + key: 'pass_rate', + render: (_, run: EvaluationReportRun) => { + const pct = run.content?.metrics?.pass_rate ?? run.metadata?.pass_rate + return typeof pct === 'number' ? `${pct.toFixed(1)}%` : '–' + }, + }, + { + title: 'Runs', + key: 'total_runs', + render: (_, run: EvaluationReportRun) => + run.content?.metrics?.total_runs ?? run.metadata?.total_runs ?? '–', + }, + { + title: 'Status', + key: 'delivery_status', + render: (_, run: EvaluationReportRun) => { + const info = STATUS_STYLES[run.delivery_status] || { + label: run.delivery_status, + type: 'default' as const, + } + return ( + + {info.label} + + ) + }, + }, + { + key: 'info', + width: 0, + render: (_, run: EvaluationReportRun) => ( + + + + ), + }, + ]} + expandable={{ + noIndent: true, + expandedRowRender: (run: EvaluationReportRun) => ( +
+ +
+ ), + }} + emptyState="No reports generated yet" + size="small" + /> +
+ ) +} diff --git a/products/llm_analytics/frontend/evaluations/evaluationReportLogic.ts b/products/llm_analytics/frontend/evaluations/evaluationReportLogic.ts new file mode 100644 index 000000000000..0e7bb9647fcc --- /dev/null +++ b/products/llm_analytics/frontend/evaluations/evaluationReportLogic.ts @@ -0,0 +1,241 @@ +import { actions, afterMount, connect, kea, key, listeners, path, props, reducers, selectors } from 'kea' +import { loaders } from 'kea-loaders' + +import api from 'lib/api' +import { lemonToast } from 'lib/lemon-ui/LemonToast' +import { teamLogic } from 'scenes/teamLogic' + +import type { evaluationReportLogicType } from './evaluationReportLogicType' +import type { + EvaluationReport, + EvaluationReportDeliveryTarget, + EvaluationReportFrequency, + EvaluationReportRun, +} from './types' + +export interface EvaluationReportLogicProps { + evaluationId: string +} + +export interface PendingReportConfig { + enabled: boolean + frequency: EvaluationReportFrequency + emailValue: string + slackIntegrationId: number | null + slackChannelValue: string + reportPromptGuidance: string + triggerThreshold: number +} + +const DEFAULT_PENDING_CONFIG: PendingReportConfig = { + enabled: true, + frequency: 'every_n', + emailValue: '', + slackIntegrationId: null, + slackChannelValue: '', + reportPromptGuidance: '', + triggerThreshold: 100, +} + +export const evaluationReportLogic = kea([ + path(['products', 'llm_analytics', 'frontend', 'evaluations', 'evaluationReportLogic']), + props({} as EvaluationReportLogicProps), + key((props) => props.evaluationId), + connect({ + values: [teamLogic, ['currentTeamId']], + }), + + actions({ + // Pending config for new evaluations + setPendingEnabled: (enabled: boolean) => ({ enabled }), + setPendingFrequency: (frequency: EvaluationReportFrequency) => ({ frequency }), + setPendingEmailValue: (emailValue: string) => ({ emailValue }), + setPendingSlackIntegrationId: (integrationId: number | null) => ({ integrationId }), + setPendingSlackChannelValue: (channelValue: string) => ({ channelValue }), + setPendingReportPromptGuidance: (reportPromptGuidance: string) => ({ reportPromptGuidance }), + setPendingTriggerThreshold: (triggerThreshold: number) => ({ triggerThreshold }), + createPendingReport: (evaluationId: string) => ({ evaluationId }), + + // Existing report actions + selectReportRun: (reportRun: EvaluationReportRun | null) => ({ reportRun }), + }), + + reducers({ + pendingConfig: [ + DEFAULT_PENDING_CONFIG as PendingReportConfig, + { + setPendingEnabled: (state, { enabled }) => ({ ...state, enabled }), + setPendingFrequency: (state, { frequency }) => ({ ...state, frequency }), + setPendingEmailValue: (state, { emailValue }) => ({ ...state, emailValue }), + setPendingSlackIntegrationId: (state, { integrationId }) => ({ + ...state, + slackIntegrationId: integrationId, + slackChannelValue: integrationId !== state.slackIntegrationId ? '' : state.slackChannelValue, + }), + setPendingSlackChannelValue: (state, { channelValue }) => ({ + ...state, + slackChannelValue: channelValue, + }), + setPendingReportPromptGuidance: (state, { reportPromptGuidance }) => ({ + ...state, + reportPromptGuidance, + }), + setPendingTriggerThreshold: (state, { triggerThreshold }) => ({ + ...state, + triggerThreshold, + }), + }, + ], + selectedReportRun: [ + null as EvaluationReportRun | null, + { + selectReportRun: (_, { reportRun }) => reportRun, + }, + ], + }), + + loaders(({ props, values }) => ({ + reports: [ + [] as EvaluationReport[], + { + loadReports: async () => { + if (props.evaluationId === 'new') { + return [] + } + const response = await api.get( + `api/environments/${values.currentTeamId}/llm_analytics/evaluation_reports/?evaluation=${props.evaluationId}` + ) + return response.results || [] + }, + createReport: async (params: { + evaluationId: string + frequency: EvaluationReportFrequency + delivery_targets: EvaluationReportDeliveryTarget[] + report_prompt_guidance?: string + trigger_threshold?: number | null + }) => { + const body: Record = { + evaluation: params.evaluationId, + frequency: params.frequency, + start_date: new Date().toISOString(), + delivery_targets: params.delivery_targets, + report_prompt_guidance: params.report_prompt_guidance ?? '', + enabled: true, + } + if (params.frequency === 'every_n' && params.trigger_threshold != null) { + body.trigger_threshold = params.trigger_threshold + } + const report = await api.create( + `api/environments/${values.currentTeamId}/llm_analytics/evaluation_reports/`, + body + ) + return [...values.reports, report] + }, + updateReport: async ({ reportId, data }: { reportId: string; data: Partial }) => { + const updated = await api.update( + `api/environments/${values.currentTeamId}/llm_analytics/evaluation_reports/${reportId}/`, + data + ) + return values.reports.map((r) => (r.id === reportId ? updated : r)) + }, + deleteReport: async (reportId: string) => { + await api.update( + `api/environments/${values.currentTeamId}/llm_analytics/evaluation_reports/${reportId}/`, + { deleted: true } + ) + return values.reports.filter((r) => r.id !== reportId) + }, + }, + ], + reportRuns: [ + [] as EvaluationReportRun[], + { + loadReportRuns: async (reportId: string) => { + const response = await api.get( + `api/environments/${values.currentTeamId}/llm_analytics/evaluation_reports/${reportId}/runs/` + ) + return response || [] + }, + }, + ], + generateResult: [ + null as null, + { + generateReport: async (reportId: string) => { + await api.create( + `api/environments/${values.currentTeamId}/llm_analytics/evaluation_reports/${reportId}/generate/` + ) + return null + }, + }, + ], + })), + + selectors({ + isNewEvaluation: [(_, p) => [p.evaluationId], (evaluationId: string) => evaluationId === 'new'], + activeReport: [ + (s) => [s.reports], + (reports): EvaluationReport | null => { + return reports.find((r: EvaluationReport) => r.enabled && !r.deleted) || null + }, + ], + }), + + listeners(({ actions, values }) => ({ + loadReportsSuccess: ({ reports }: { reports: EvaluationReport[] }) => { + // Auto-load the run history for the active report so the Reports tab knows + // whether to render itself and can show data immediately. + const active = reports.find((r: EvaluationReport) => r.enabled && !r.deleted) + if (active) { + actions.loadReportRuns(active.id) + } + }, + generateReportSuccess: () => { + lemonToast.success('Report is being generated and will be delivered to your configured targets shortly.') + }, + generateReportFailure: () => { + lemonToast.error('Failed to trigger report generation. Please try again.') + }, + createReportSuccess: () => { + actions.loadReports() + }, + updateReportSuccess: () => { + actions.loadReports() + }, + createPendingReport: ({ evaluationId }) => { + const { pendingConfig } = values + if (!pendingConfig.enabled) { + return + } + const targets: EvaluationReportDeliveryTarget[] = [] + if (pendingConfig.emailValue.trim()) { + targets.push({ type: 'email', value: pendingConfig.emailValue.trim() }) + } + if (pendingConfig.slackIntegrationId && pendingConfig.slackChannelValue) { + targets.push({ + type: 'slack', + integration_id: pendingConfig.slackIntegrationId, + channel: pendingConfig.slackChannelValue, + }) + } + // The backend auto-creates a default report config on eval creation. + // If the user configured delivery targets or custom settings, update + // the auto-created report after creation via the existing reports list. + if (targets.length > 0 || pendingConfig.reportPromptGuidance.trim()) { + actions.createReport({ + evaluationId, + frequency: pendingConfig.frequency, + delivery_targets: targets, + report_prompt_guidance: pendingConfig.reportPromptGuidance, + trigger_threshold: pendingConfig.frequency === 'every_n' ? pendingConfig.triggerThreshold : null, + }) + } + }, + })), + + afterMount(({ actions, props }) => { + if (props.evaluationId !== 'new') { + actions.loadReports() + } + }), +]) diff --git a/products/llm_analytics/frontend/evaluations/llmEvaluationLogic.ts b/products/llm_analytics/frontend/evaluations/llmEvaluationLogic.ts index a8c47c3cbcea..a694ea4e3293 100644 --- a/products/llm_analytics/frontend/evaluations/llmEvaluationLogic.ts +++ b/products/llm_analytics/frontend/evaluations/llmEvaluationLogic.ts @@ -18,6 +18,7 @@ import { LLMProviderKey, llmProviderKeysLogic } from '../settings/llmProviderKey import { isUnhealthyProviderKeyState } from '../settings/providerKeyStateUtils' import { queryEvaluationRuns } from '../utils' import { EVALUATION_SUMMARY_MAX_RUNS } from './constants' +import { evaluationReportLogic } from './evaluationReportLogic' import type { llmEvaluationLogicType } from './llmEvaluationLogicType' import { EvaluationTemplateKey, defaultEvaluationTemplates } from './templates' import { @@ -499,6 +500,13 @@ export const llmEvaluationLogic = kea([ }) }, + saveEvaluationSuccess: ({ evaluation }) => { + if (props.evaluationId === 'new' && evaluation?.id) { + // Create the pending report if the user configured one during evaluation creation + evaluationReportLogic({ evaluationId: 'new' }).actions.createPendingReport(evaluation.id) + } + }, + saveEvaluation: async () => { try { const teamId = teamLogic.values.currentTeamId diff --git a/products/llm_analytics/frontend/evaluations/types.ts b/products/llm_analytics/frontend/evaluations/types.ts index 44b1e3c1f551..01a0c29f0a24 100644 --- a/products/llm_analytics/frontend/evaluations/types.ts +++ b/products/llm_analytics/frontend/evaluations/types.ts @@ -82,6 +82,86 @@ export interface HogTestResult { error: string | null } +export type EvaluationReportFrequency = 'hourly' | 'daily' | 'weekly' | 'every_n' + +export interface EvaluationReportDeliveryTarget { + type: 'email' | 'slack' + value?: string + integration_id?: number + channel?: string +} + +export interface EvaluationReport { + id: string + evaluation: string + frequency: EvaluationReportFrequency + byweekday: string[] | null + start_date: string + next_delivery_date: string | null + delivery_targets: EvaluationReportDeliveryTarget[] + max_sample_size: number + enabled: boolean + deleted: boolean + last_delivered_at: string | null + /** Optional per-report custom guidance appended to the agent's system prompt. */ + report_prompt_guidance: string + /** Number of new eval results that triggers a report (only for every_n frequency). */ + trigger_threshold: number | null + /** Minimum minutes between count-triggered reports. */ + cooldown_minutes: number + /** Maximum count-triggered report runs per calendar day (UTC). */ + daily_run_cap: number + created_by: number | null + created_at: string +} + +/** A titled markdown section of the report (v2: agent-chosen title). */ +export interface EvaluationReportSection { + title: string + content: string +} + +/** A trace reference cited by the agent to ground a specific finding. */ +export interface EvaluationReportCitation { + generation_id: string + trace_id: string + reason: string +} + +/** Structured metrics computed mechanically from ClickHouse (agent cannot fabricate). */ +export interface EvaluationReportMetrics { + total_runs: number + pass_count: number + fail_count: number + na_count: number + pass_rate: number + period_start: string + period_end: string + previous_total_runs: number | null + previous_pass_rate: number | null +} + +/** Top-level report content stored in EvaluationReportRun.content. */ +export interface EvaluationReportRunContent { + title: string + sections: EvaluationReportSection[] + citations: EvaluationReportCitation[] + metrics: EvaluationReportMetrics +} + +export interface EvaluationReportRun { + id: string + report: string + content: EvaluationReportRunContent + /** Legacy mirror of content.metrics — populated by the store activity for backwards compat. */ + metadata: EvaluationReportMetrics + period_start: string + period_end: string + delivery_status: 'pending' | 'delivered' | 'partial_failure' | 'failed' + delivery_errors: string[] + created_at: string +} + export type EvaluationSummaryFilter = 'all' | 'pass' | 'fail' | 'na' export interface EvaluationPattern {