{tabContents}
From 8fbf1e98b3db1308fdecd58ea53babc1b5f522b7 Mon Sep 17 00:00:00 2001
From: Alex Culea <195758113+alexculealt@users.noreply.github.com>
Date: Tue, 17 Mar 2026 18:00:25 +0000
Subject: [PATCH 3/3] Add sample tool to extractions
---
.../app/extractions/SampleModal.tsx | 531 ++++++++++++++++++
.../src/components/app/extractions/detail.tsx | 21 +-
client/src/utils.ts | 35 +-
common/types.ts | 9 +
common/utils.ts | 6 +
server/src/data/extractions.ts | 16 +-
server/src/data/extractionsSample.ts | 161 ++++++
server/src/data/schema.ts | 17 +-
.../llm/determinePresenceOfEntity.ts | 1 +
.../extraction/llm/exploreAdditionalPages.ts | 1 +
.../src/extraction/llm/extractEntityData.ts | 20 +-
server/src/extraction/resumeExtraction.ts | 2 -
server/src/openai.ts | 13 +-
server/src/routers/extractions.ts | 38 +-
14 files changed, 810 insertions(+), 61 deletions(-)
create mode 100644 client/src/components/app/extractions/SampleModal.tsx
create mode 100644 common/utils.ts
create mode 100644 server/src/data/extractionsSample.ts
diff --git a/client/src/components/app/extractions/SampleModal.tsx b/client/src/components/app/extractions/SampleModal.tsx
new file mode 100644
index 0000000..ab8d860
--- /dev/null
+++ b/client/src/components/app/extractions/SampleModal.tsx
@@ -0,0 +1,531 @@
+import { Button } from "@/components/ui/button";
+import { Checkbox } from "@/components/ui/checkbox";
+import {
+ Command,
+ CommandEmpty,
+ CommandGroup,
+ CommandInput,
+ CommandItem,
+ CommandList,
+} from "@/components/ui/command";
+import {
+ Dialog,
+ DialogContent,
+ DialogHeader,
+ DialogTitle,
+} from "@/components/ui/dialog";
+import { Input } from "@/components/ui/input";
+import {
+ Popover,
+ PopoverContent,
+ PopoverTrigger,
+} from "@/components/ui/popover";
+import {
+ Select,
+ SelectContent,
+ SelectItem,
+ SelectTrigger,
+ SelectValue,
+} from "@/components/ui/select";
+import {
+ Table,
+ TableBody,
+ TableCell,
+ TableHead,
+ TableHeader,
+ TableRow,
+} from "@/components/ui/table";
+import {
+ Tooltip,
+ TooltipContent,
+ TooltipProvider,
+ TooltipTrigger,
+} from "@/components/ui/tooltip";
+import { UIPageStatus } from "@common/types";
+import { concisePrintDate, resolveCrawlPageUrl, trpc } from "@/utils";
+import { ExtractionStatus, PageStatus } from "@/utils";
+import {
+ ChevronsUpDown,
+ ExternalLink as ExternalLinkIcon,
+ FileJson,
+ FileText as FileTextIcon,
+ HelpCircle,
+ ScrollText as ScrollTextIcon,
+} from "lucide-react";
+import { useCallback, useEffect, useMemo, useState } from "react";
+
+/**
+ * One entry offered by `MultiSelect`.
+ *
+ * The type parameter is declared here because `value` refers to `T`; it
+ * defaults to `string` so un-parameterised references keep resolving.
+ */
+interface MultiSelectOption<T = string> {
+  /** Value added to / removed from the selection when the entry is toggled. */
+  value: T;
+  /** Text rendered for the entry and used in the trigger summary. */
+  label: string;
+}
+
+/**
+ * Props accepted by `MultiSelect`.
+ *
+ * `value` is the current selection and `onChange` receives the complete next
+ * selection (not a delta). `searchPlaceholder` and `emptyMessage` fall back to
+ * defaults supplied by the component itself.
+ */
+interface MultiSelectProps<T = string> {
+  label?: string;
+  options: MultiSelectOption<T>[];
+  value: T[];
+  onChange: (value: T[]) => void;
+  searchPlaceholder?: string;
+  emptyMessage?: string;
+}
+
+/**
+ * Multi-select control driven entirely by its props: the selection lives in
+ * `value` and every change is reported through `onChange`.
+ *
+ * NOTE(review): the element tags inside `return (...)` are missing from this
+ * copy of the patch (only text and expression fragments remain), and `T` is
+ * referenced without a visible type-parameter list — restore both from the
+ * original commit before applying.
+ */
+function MultiSelect({
+ label,
+ options,
+ value,
+ onChange,
+ searchPlaceholder = "Search...",
+ emptyMessage = "No results found.",
+}: MultiSelectProps) {
+ const [open, setOpen] = useState(false);
+
+ // Summary text: "None" for an empty selection, "All" when the selection is
+ // as long as the option list, otherwise the selected labels joined by ", "
+ // (a value with no matching option is shown as-is).
+ const triggerLabel =
+ value.length === 0
+ ? "None"
+ : value.length === options.length
+ ? "All"
+ : value
+ .map((v) => options.find((o) => o.value === v)?.label ?? v)
+ .join(", ");
+
+ // Removes `itemValue` when it is already selected, appends it otherwise.
+ const toggle = (itemValue: T) => {
+ onChange(
+ value.includes(itemValue)
+ ? value.filter((v) => v !== itemValue)
+ : [...value, itemValue]
+ );
+ };
+
+ // Bulk helpers: select every option's value, or clear the selection.
+ const selectAll = () => onChange(options.map((o) => o.value));
+ const selectNone = () => onChange([]);
+
+ return (
+
+ {label && (
+
+ )}
+
+
+
+
+
+
+
+
+ {emptyMessage}
+
+
+ All
+
+
+ None
+
+ {options.map((opt) => (
+ toggle(opt.value)}
+ >
+
+ {opt.label}
+
+ ))}
+
+
+
+
+
+
+ );
+}
+
+/** Entries offered by the data-status filter; values match the `dataStatus` state union. */
+const DATA_STATUS_OPTIONS: Array<{
+  value: "present" | "absent";
+  label: string;
+}> = [
+  { label: "Present", value: "present" },
+  { label: "Absent", value: "absent" },
+];
+
+/**
+ * Orderings offered for the sample.
+ *
+ * `value` is typed with the same literal union that `SampleModal` uses for its
+ * `sortBy` state, so an entry that the state cannot hold is rejected at
+ * compile time instead of being widened to `string`.
+ */
+const SORT_OPTIONS: {
+  value: "random" | "most_expensive" | "most_data_items" | "least_data_items";
+  label: string;
+}[] = [
+  { value: "random", label: "Random" },
+  { value: "most_expensive", label: "Most expensive" },
+  { value: "most_data_items", label: "Most data items" },
+  { value: "least_data_items", label: "Least data items" },
+];
+
+/**
+ * Help text per page status, keyed by `PageStatus` members.
+ *
+ * `Record` needs both type arguments to compile. A string-keyed record is used
+ * so the map can be indexed with a page's plain `status` string.
+ * NOTE(review): tighten to `Record<PageStatus, string>` if every enum member
+ * is meant to have an entry — confirm against the enum definition.
+ */
+const STATUS_HELP: Record<string, string> = {
+  [PageStatus.WAITING]:
+    "Queued — The page hasn't been processed yet; it's waiting its turn.",
+  [PageStatus.IN_PROGRESS]:
+    "In progress — The system is currently working on this page.",
+  [PageStatus.DOWNLOADED]:
+    "Downloaded — The page content was successfully retrieved from the website and is awaiting extraction.",
+  [PageStatus.SUCCESS]:
+    "Success — Useful information was successfully pulled from this page.",
+  [PageStatus.EXTRACTED_NO_DATA]:
+    "No data found — either the data is not present in the page, the model couldn't find the expected information or the page was not correctly converted to the simplified content form.",
+  [PageStatus.ERROR]:
+    "Error — Something went wrong (e.g. the page couldn't load or timed out).",
+};
+
+/**
+ * Shape that the `samplePages` query result is cast to inside `SampleModal`.
+ * NOTE(review): that cast is unchecked, so this declaration has to be kept in
+ * step with what the server procedure actually returns.
+ */
+type SampledPage = {
+ id: number;
+ extractionId: number;
+ crawlStepId: number;
+ url: string;
+ status: string;
+ pageType: string | null;
+ // NOTE(review): typed as a string here — presumably a serialized date; confirm.
+ createdAt: string;
+ dataItemCount: number;
+ tokenSum: number;
+};
+
+/**
+ * Formats a count in thousands with at most one decimal, e.g. 1234 -> "1.2K".
+ *
+ * @param n - Count to format.
+ * @returns "0" for zero; otherwise the value in thousands followed by "K",
+ *   without a decimal part when the rounded value is a whole number.
+ */
+function formatThousands(n: number): string {
+  if (n === 0) return "0";
+  // Round to one decimal *before* deciding whether a decimal part is needed.
+  // Testing the unrounded value let inputs such as 1999 render as "2.0K"
+  // while 2000 rendered as "2K".
+  const k = Math.round(n / 100) / 10;
+  return Number.isInteger(k) ? `${k}K` : `${k.toFixed(1)}K`;
+}
+
+/** Props for the `SampleModal` dialog. */
+interface SampleModalProps {
+ /** Whether the modal is open; the sample query is only enabled while this is true. */
+ open: boolean;
+ onOpenChange: (open: boolean) => void;
+ /** Sent as `extractionId` in the sample query input. */
+ extractionId: number;
+ /** Compared against `ExtractionStatus.IN_PROGRESS` / `WAITING` inside the component. */
+ extractionStatus?: string;
+ /** Catalogue base URL for resolving relative crawl page URLs */
+ recipeUrl?: string;
+}
+
+/**
+ * Modal for sampling pages of an extraction.
+ *
+ * Filter controls edit "draft" state (`sampleSizePercent`, `dataStatus`,
+ * `statuses`, `sortBy`); the query only ever sees the snapshot stored in
+ * `appliedFilters`, which `onApplyFilter` takes from the drafts.
+ *
+ * NOTE(review): the markup inside `return (...)` is missing from this copy of
+ * the patch — restore it from the original commit before applying.
+ */
+export default function SampleModal({
+ open,
+ onOpenChange,
+ extractionId,
+ extractionStatus,
+ recipeUrl,
+}: SampleModalProps) {
+ // True when the extraction status is IN_PROGRESS or WAITING.
+ const isExtractionInProgress = [
+ ExtractionStatus.IN_PROGRESS,
+ ExtractionStatus.WAITING,
+ ].includes(extractionStatus as ExtractionStatus);
+ // Draft filter values; nothing is fetched from these directly.
+ const [sampleSizePercent, setSampleSizePercent] = useState(5);
+ const [dataStatus, setDataStatus] = useState<("present" | "absent")[]>([
+ "present",
+ ]);
+ // Starts with every value listed in UIPageStatus selected.
+ // NOTE(review): no explicit type argument is visible here although
+ // `appliedFilters.statuses` is `PageStatus[]` — confirm the inferred type.
+ const [statuses, setStatuses] = useState(() =>
+ UIPageStatus.map((o) => o.value)
+ );
+ const [sortBy, setSortBy] = useState<
+ "random" | "most_expensive" | "most_data_items" | "least_data_items"
+ >("random");
+
+ // Snapshot of the drafts taken by onApplyFilter; null until the first apply.
+ const [appliedFilters, setAppliedFilters] = useState<{
+ sampleSizePercent: number;
+ dataStatus: ("present" | "absent")[];
+ statuses: PageStatus[];
+ sortBy: "random" | "most_expensive" | "most_data_items" | "least_data_items";
+ applyKey: number;
+ } | null>(null);
+
+ // Closing the modal drops the snapshot, which also disables the query below
+ // until filters are applied again.
+ useEffect(() => {
+ if (!open) {
+ setAppliedFilters(null);
+ }
+ }, [open]);
+
+ // True when the drafts differ from the applied snapshot. Always false before
+ // the first apply. Arrays are compared on sorted copies, so selection order
+ // does not count as a change.
+ const filtersDirty = useMemo(() => {
+ if (!appliedFilters) return false;
+ const arrEq = (a: string[], b: string[]) =>
+ a.length === b.length && a.every((v, i) => v === b[i]);
+ return (
+ appliedFilters.sampleSizePercent !== sampleSizePercent ||
+ !arrEq([...appliedFilters.dataStatus].sort(), [...dataStatus].sort()) ||
+ !arrEq([...appliedFilters.statuses].sort(), [...statuses].sort()) ||
+ appliedFilters.sortBy !== sortBy
+ );
+ }, [appliedFilters, sampleSizePercent, dataStatus, statuses, sortBy]);
+
+ // The `??` fallbacks only fill the input shape while `appliedFilters` is
+ // null; in that state `enabled` is false as well.
+ const sampleQuery = trpc.extractions.samplePages.useQuery(
+ {
+ extractionId,
+ sampleSizePercent: appliedFilters?.sampleSizePercent ?? 5,
+ dataStatus: appliedFilters?.dataStatus ?? ["present"],
+ statuses: appliedFilters?.statuses ?? [],
+ sortBy: appliedFilters?.sortBy ?? "random",
+ applyKey: appliedFilters?.applyKey ?? 0,
+ },
+ {
+ enabled: appliedFilters !== null && open,
+ }
+ );
+
+ // Unchecked cast of the query result; an empty list while there is no data.
+ const sampledPages = (sampleQuery.data ?? []) as SampledPage[];
+
+ // Copies the drafts into the snapshot and bumps `applyKey` on every call, so
+ // the query input changes even when the filters themselves are unchanged —
+ // presumably to draw a fresh sample on re-apply; confirm.
+ const onApplyFilter = useCallback(() => {
+ setAppliedFilters((prev) => ({
+ sampleSizePercent,
+ dataStatus: [...dataStatus],
+ statuses: [...statuses],
+ sortBy,
+ applyKey: (prev?.applyKey ?? 0) + 1,
+ }));
+ }, [sampleSizePercent, dataStatus, statuses, sortBy]);
+
+ return (
+
+ );
+}
diff --git a/client/src/components/app/extractions/detail.tsx b/client/src/components/app/extractions/detail.tsx
index eef4c9b..124318c 100644
--- a/client/src/components/app/extractions/detail.tsx
+++ b/client/src/components/app/extractions/detail.tsx
@@ -38,12 +38,13 @@ import {
resolveCrawlPageUrl,
trpc,
} from "@/utils";
-import { CookingPot, LibraryBig, List } from "lucide-react";
+import { CookingPot, LibraryBig, List, Pipette } from "lucide-react";
import { useState } from "react";
import { Bar, BarChart, XAxis, YAxis } from "recharts";
import { Link, useLocation, useParams } from "wouter";
import { displayRecipeDetails } from "../recipes/util";
import AuditLogModal from "./AuditLogModal";
+import SampleModal from "./SampleModal";
import { displayStepType } from "./utils";
function displayStepParent(steps: CrawlStep[], parentId: number) {
@@ -139,6 +140,7 @@ export default function ExtractionDetail() {
const [lockedCancel, setLockedCancel] = useState(true);
const [lockedDelete, setLockDelete] = useState(true);
const [auditLogModalOpen, setAuditLogModalOpen] = useState(false);
+ const [sampleModalOpen, setSampleModalOpen] = useState(false);
const { toast } = useToast();
const [, navigate] = useLocation();
const query = trpc.extractions.detail.useQuery(
@@ -898,8 +900,16 @@ export default function ExtractionDetail() {
) : null}