From 8cde3cadc60357d80878bbfa52f7111578debdb8 Mon Sep 17 00:00:00 2001
From: Alex Culea <195758113+alexculealt@users.noreply.github.com>
Date: Mon, 16 Mar 2026 12:34:18 +0000
Subject: [PATCH 1/3] Fix simplified content breaking page UI due to line
 length


From 5117204190cf1520c0ab0eeb18b6f60b51457195 Mon Sep 17 00:00:00 2001
From: Alex Culea <195758113+alexculealt@users.noreply.github.com>
Date: Mon, 16 Mar 2026 13:57:16 +0000
Subject: [PATCH 2/3] Add URL-based state for the tab navigation of the crawl
 page detail screen

---
 .../src/components/app/extractions/page.tsx   | 53 ++++++++++++++++---
 1 file changed, 45 insertions(+), 8 deletions(-)
diff --git a/client/src/components/app/extractions/page.tsx b/client/src/components/app/extractions/page.tsx
index 6ef03b9..7368cf9 100644
--- a/client/src/components/app/extractions/page.tsx
+++ b/client/src/components/app/extractions/page.tsx
@@ -18,12 +18,26 @@ import {
 import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
 import { concisePrintDate, prettyPrintDate, resolveCrawlPageUrl, trpc } from "@/utils";
 import { ExternalLink } from "lucide-react";
-import { useState } from "react";
-import { useParams } from "wouter";
+import { useEffect, useState } from "react";
+import { useLocation, useParams, useSearch } from "wouter";
 import { base64Img } from "./utils";
 
+const DEFAULT_TAB = "data"; // tab the URL is rewritten to when tabName is missing or not in VALID_TAB_VALUES
+
+const VALID_TAB_VALUES = [ // tabName query-string values accepted by CrawlPageDetail
+  "data",
+  "raw_content",
+  "screenshot",
+  "simplified_content",
+  "operation_logs",
+];
+
 export default function CrawlPageDetail() {
   const { extractionId, stepId, crawlPageId } = useParams();
+  const [, navigate] = useLocation();
+  const search = useSearch();
+  const basePath = "~" + new URL(window.location.href).pathname;
+
   const crawlPageQuery = trpc.extractions.crawlPageDetail.useQuery(
     { crawlPageId: parseInt(crawlPageId || "") },
     { enabled: !!crawlPageId }
@@ -40,7 +54,22 @@ export default function CrawlPageDetail() {
         (typeof simulateExtractionQuery)["mutateAsync"]
       > | null>
     >(null);
-  if (!crawlPageQuery.data) {
+
+  const item = crawlPageQuery.data;
+
+  // Normalise the URL: when tabName is missing or unknown, rewrite it to the
+  // default tab, passing `replace: true` to navigate.
+  useEffect(() => {
+    const params = new URLSearchParams(search || "");
+    const requestedTab = params.get("tabName");
+    if (requestedTab && VALID_TAB_VALUES.includes(requestedTab)) {
+      return;
+    }
+    params.set("tabName", DEFAULT_TAB);
+    navigate(`${basePath}?${params.toString()}`, { replace: true });
+  }, [basePath, navigate, search]);
+
+  if (!item) {
     return null;
   }
 
@@ -53,8 +82,6 @@ export default function CrawlPageDetail() {
     }
   };
 
-  const item = crawlPageQuery.data;
-
   const breadCrumbs = [
     { label: "Extractions", href: "/" },
     { label: `Extraction #${extractionId}`, href: `/${extractionId}` },
@@ -123,8 +150,6 @@ export default function CrawlPageDetail() {
     </TabsContent>,
   ];
 
-  const defaultTab = "data";
-
   if (item.markdownContent) {
     tabTriggers.push(
       <TabsTrigger key="simplified_content" value="simplified_content">Simplified Content</TabsTrigger>
@@ -191,6 +216,18 @@ export default function CrawlPageDetail() {
     tabContents.splice(1, 0, <TabsContent key="screenshot" value="screenshot">{screenshot}</TabsContent>);
   }
 
+  const tabNameFromUrl = new URLSearchParams(search || "").get("tabName");
+  if (tabNameFromUrl && !VALID_TAB_VALUES.includes(tabNameFromUrl)) return null; // the redirect effect rewrites the URL
+  // A valid tab may not be rendered for this page (e.g. no simplified content): show the default instead of an empty panel.
+  const hasTab = tabTriggers.some((t) => t.props.value === tabNameFromUrl);
+  const currentTab = (hasTab && tabNameFromUrl) || DEFAULT_TAB;
+
+  const onTabChange = (value: string) => {
+    const sp = new URLSearchParams(search || "");
+    sp.set("tabName", value);
+    navigate(`${basePath}?${sp.toString()}`);
+  };
+
   const formattedSimulatedData = simulatedExtractedData?.data
     ? JSON.stringify(simulatedExtractedData?.data, null, 2)
     : null;
@@ -259,7 +296,7 @@ export default function CrawlPageDetail() {
         )}
       </div>
 
-      <Tabs defaultValue={defaultTab}>
+      <Tabs value={currentTab} onValueChange={onTabChange}>
         <TabsList className="w-full mb-4">{tabTriggers}</TabsList>
         <div className="border border-dashed p-4 text-xs overflow-auto">
           {tabContents}

From 8fbf1e98b3db1308fdecd58ea53babc1b5f522b7 Mon Sep 17 00:00:00 2001
From: Alex Culea <195758113+alexculealt@users.noreply.github.com>
Date: Tue, 17 Mar 2026 18:00:25 +0000
Subject: [PATCH 3/3] Add sample tool to extractions

---
 .../app/extractions/SampleModal.tsx           | 531 ++++++++++++++++++
 .../src/components/app/extractions/detail.tsx |  21 +-
 client/src/utils.ts                           |  35 +-
 common/types.ts                               |   9 +
 common/utils.ts                               |   6 +
 server/src/data/extractions.ts                |  16 +-
 server/src/data/extractionsSample.ts          | 161 ++++++
 server/src/data/schema.ts                     |  17 +-
 .../llm/determinePresenceOfEntity.ts          |   1 +
 .../extraction/llm/exploreAdditionalPages.ts  |   1 +
 .../src/extraction/llm/extractEntityData.ts   |  20 +-
 server/src/extraction/resumeExtraction.ts     |   2 -
 server/src/openai.ts                          |  13 +-
 server/src/routers/extractions.ts             |  38 +-
 14 files changed, 810 insertions(+), 61 deletions(-)
 create mode 100644 client/src/components/app/extractions/SampleModal.tsx
 create mode 100644 common/utils.ts
 create mode 100644 server/src/data/extractionsSample.ts

diff --git a/client/src/components/app/extractions/SampleModal.tsx b/client/src/components/app/extractions/SampleModal.tsx
new file mode 100644
index 0000000..ab8d860
--- /dev/null
+++ b/client/src/components/app/extractions/SampleModal.tsx
@@ -0,0 +1,531 @@
+import { Button } from "@/components/ui/button";
+import { Checkbox } from "@/components/ui/checkbox";
+import {
+  Command,
+  CommandEmpty,
+  CommandGroup,
+  CommandInput,
+  CommandItem,
+  CommandList,
+} from "@/components/ui/command";
+import {
+  Dialog,
+  DialogContent,
+  DialogHeader,
+  DialogTitle,
+} from "@/components/ui/dialog";
+import { Input } from "@/components/ui/input";
+import {
+  Popover,
+  PopoverContent,
+  PopoverTrigger,
+} from "@/components/ui/popover";
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from "@/components/ui/select";
+import {
+  Table,
+  TableBody,
+  TableCell,
+  TableHead,
+  TableHeader,
+  TableRow,
+} from "@/components/ui/table";
+import {
+  Tooltip,
+  TooltipContent,
+  TooltipProvider,
+  TooltipTrigger,
+} from "@/components/ui/tooltip";
+import { UIPageStatus } from "@common/types";
+import { concisePrintDate, resolveCrawlPageUrl, trpc } from "@/utils";
+import { ExtractionStatus, PageStatus } from "@/utils";
+import {
+  ChevronsUpDown,
+  ExternalLink as ExternalLinkIcon,
+  FileJson,
+  FileText as FileTextIcon,
+  HelpCircle,
+  ScrollText as ScrollTextIcon,
+} from "lucide-react";
+import { useCallback, useEffect, useMemo, useState } from "react";
+
+interface MultiSelectOption<T> {
+  value: T;
+  label: string;
+}
+
+interface MultiSelectProps<T> {
+  label?: string;
+  options: MultiSelectOption<T>[];
+  value: T[];
+  onChange: (value: T[]) => void;
+  searchPlaceholder?: string;
+  emptyMessage?: string;
+}
+
+function MultiSelect<T extends string>({ // popover combobox for picking several options, with "All" / "None" shortcuts
+  label,
+  options,
+  value,
+  onChange,
+  searchPlaceholder = "Search...",
+  emptyMessage = "No results found.",
+}: MultiSelectProps<T>) {
+  const [open, setOpen] = useState(false);
+
+  const triggerLabel = // button text: "None", "All", or the selected labels joined by ", "
+    value.length === 0
+      ? "None"
+      : value.length === options.length
+        ? "All"
+        : value
+            .map((v) => options.find((o) => o.value === v)?.label ?? v) // falls back to the raw value when no option matches
+            .join(", ");
+
+  const toggle = (itemValue: T) => { // removes the value if selected, appends it otherwise
+    onChange(
+      value.includes(itemValue)
+        ? value.filter((v) => v !== itemValue)
+        : [...value, itemValue]
+    );
+  };
+
+  const selectAll = () => onChange(options.map((o) => o.value));
+  const selectNone = () => onChange([]);
+
+  return (
+    <div>
+      {label && (
+        <label className="text-sm font-medium mb-1 block">{label}</label>
+      )}
+      <Popover open={open} onOpenChange={setOpen}>
+        <PopoverTrigger asChild>
+          <Button
+            variant="outline"
+            role="combobox"
+            aria-expanded={open}
+            className="w-full justify-between font-normal"
+          >
+            <span className="truncate">{triggerLabel}</span>
+            <ChevronsUpDown className="ml-2 h-4 w-4 shrink-0 opacity-50" />
+          </Button>
+        </PopoverTrigger>
+        <PopoverContent className="w-[200px] p-0" align="start">
+          <Command>
+            <CommandInput placeholder={searchPlaceholder} />
+            <CommandList>
+              <CommandEmpty>{emptyMessage}</CommandEmpty>
+              <CommandGroup>
+                <CommandItem
+                  value="all"
+                  onSelect={selectAll}
+                  className="justify-center font-medium"
+                >
+                  All
+                </CommandItem>
+                <CommandItem
+                  value="none"
+                  onSelect={selectNone}
+                  className="justify-center font-medium"
+                >
+                  None
+                </CommandItem>
+                {options.map((opt) => (
+                  <CommandItem
+                    key={opt.value}
+                    value={opt.label}
+                    onSelect={() => toggle(opt.value)}
+                  >
+                    <Checkbox
+                      checked={value.includes(opt.value)}
+                      className="mr-2"
+                    />
+                    {opt.label}
+                  </CommandItem>
+                ))}
+              </CommandGroup>
+            </CommandList>
+          </Command>
+        </PopoverContent>
+      </Popover>
+    </div>
+  );
+}
+
+const DATA_STATUS_OPTIONS: { value: "present" | "absent"; label: string }[] = [
+  { value: "present", label: "Present" },
+  { value: "absent", label: "Absent" },
+];
+
+const SORT_OPTIONS = [
+  { value: "random", label: "Random" },
+  { value: "most_expensive", label: "Most expensive" },
+  { value: "most_data_items", label: "Most data items" },
+  { value: "least_data_items", label: "Least data items" },
+];
+
+const STATUS_HELP: Record<string, string> = {
+  [PageStatus.WAITING]:
+    "Queued — The page hasn't been processed yet; it's waiting its turn.",
+  [PageStatus.IN_PROGRESS]:
+    "In progress — The system is currently working on this page.",
+  [PageStatus.DOWNLOADED]:
+    "Downloaded — The page content was successfully retrieved from the website and is awaiting extraction.",
+  [PageStatus.SUCCESS]:
+    "Success — Useful information was successfully pulled from this page.",
+  [PageStatus.EXTRACTED_NO_DATA]:
+    "No data found — either the data is not present in the page, the model couldn't find the expected information or the page was not correctly converted to the simplified content form.",
+  [PageStatus.ERROR]:
+    "Error — Something went wrong (e.g. the page couldn't load or timed out).",
+};
+
+type SampledPage = {
+  id: number;
+  extractionId: number;
+  crawlStepId: number;
+  url: string;
+  status: string;
+  pageType: string | null;
+  createdAt: string;
+  dataItemCount: number;
+  tokenSum: number;
+};
+
+/** Formats a count in thousands with at most one decimal: 1500 -> "1.5K", 2000 -> "2K"; 0 stays "0". */
+function formatThousands(n: number): string {
+  const k = Math.round(n / 100) / 10; // round before formatting so 1999 gives "2K", not "2.0K"
+  return n === 0 ? "0" : `${k}K`;
+}
+
+interface SampleModalProps {
+  open: boolean;
+  onOpenChange: (open: boolean) => void;
+  extractionId: number;
+  extractionStatus?: string;
+  /** Catalogue base URL for resolving relative crawl page URLs */
+  recipeUrl?: string;
+}
+
+/** Modal that samples an extraction's crawl pages; nothing is fetched until "Apply filter" is clicked. */
+export default function SampleModal({
+  open,
+  onOpenChange,
+  extractionId,
+  extractionStatus,
+  recipeUrl,
+}: SampleModalProps) {
+  const isExtractionInProgress = [
+    ExtractionStatus.IN_PROGRESS,
+    ExtractionStatus.WAITING,
+  ].includes(extractionStatus as ExtractionStatus);
+  // Draft filter values edited in the form; the query only reads the applied snapshot below.
+  const [sampleSizePercent, setSampleSizePercent] = useState(5);
+  const [dataStatus, setDataStatus] = useState<("present" | "absent")[]>(["present"]);
+  const [statuses, setStatuses] = useState<PageStatus[]>(() =>
+    UIPageStatus.map((o) => o.value)
+  );
+  const [sortBy, setSortBy] = useState<
+    "random" | "most_expensive" | "most_data_items" | "least_data_items"
+  >("random");
+
+  const [appliedFilters, setAppliedFilters] = useState<{
+    sampleSizePercent: number;
+    dataStatus: ("present" | "absent")[];
+    statuses: PageStatus[];
+    sortBy: "random" | "most_expensive" | "most_data_items" | "least_data_items";
+    applyKey: number;
+  } | null>(null);
+
+  // Closing the dialog discards the applied snapshot, so reopening starts from
+  // the "Click Apply Filter" placeholder instead of the previous results.
+  useEffect(() => {
+    if (!open) setAppliedFilters(null);
+  }, [open]);
+
+  const filtersDirty = useMemo(() => { // true when the form differs from the applied snapshot
+    if (!appliedFilters) return false;
+    const arrEq = (a: string[], b: string[]) =>
+      a.length === b.length && a.every((v, i) => v === b[i]);
+    return (
+      appliedFilters.sampleSizePercent !== sampleSizePercent ||
+      !arrEq([...appliedFilters.dataStatus].sort(), [...dataStatus].sort()) ||
+      !arrEq([...appliedFilters.statuses].sort(), [...statuses].sort()) ||
+      appliedFilters.sortBy !== sortBy
+    );
+  }, [appliedFilters, sampleSizePercent, dataStatus, statuses, sortBy]);
+
+  const sampleQuery = trpc.extractions.samplePages.useQuery(
+    {
+      extractionId,
+      sampleSizePercent: appliedFilters?.sampleSizePercent ?? 5,
+      dataStatus: appliedFilters?.dataStatus ?? ["present"],
+      statuses: appliedFilters?.statuses ?? [],
+      sortBy: appliedFilters?.sortBy ?? "random",
+      applyKey: appliedFilters?.applyKey ?? 0, // bumped on every Apply click, so identical filters still form a new input
+    },
+    {
+      enabled: appliedFilters !== null && open,
+    }
+  );
+
+  const sampledPages = (sampleQuery.data ?? []) as SampledPage[];
+
+  const onApplyFilter = useCallback(() => {
+    setAppliedFilters((prev) => ({
+      sampleSizePercent,
+      dataStatus: [...dataStatus],
+      statuses: [...statuses],
+      sortBy,
+      applyKey: (prev?.applyKey ?? 0) + 1,
+    }));
+  }, [sampleSizePercent, dataStatus, statuses, sortBy]);
+
+  return (
+    <Dialog open={open} onOpenChange={onOpenChange}>
+      <DialogContent
+        className="sm:max-w-[66vw] w-[66vw] max-h-[66vh] h-[66vh] flex flex-col p-0 gap-0"
+        onPointerDownOutside={(e) => e.preventDefault()}
+      >
+        <DialogHeader className="px-6 pt-6 pb-4 shrink-0">
+          <DialogTitle>Sample extraction items</DialogTitle>
+          {appliedFilters && sampleQuery.data && (
+            <p className="text-sm text-muted-foreground mt-1">
+              {sampledPages.length} item{sampledPages.length !== 1 ? "s" : ""}
+            </p>
+          )}
+        </DialogHeader>
+
+        {isExtractionInProgress && (
+          <div className="mx-6 mb-4 px-4 py-3 rounded-md bg-amber-50 dark:bg-amber-950/30 border border-amber-200 dark:border-amber-800 text-amber-800 dark:text-amber-200 text-sm shrink-0">
+            Extraction is in progress. Data will change.
+          </div>
+        )}
+
+        <div className="flex flex-col flex-1 min-h-0 px-6 pb-6">
+          {/* Filter section */}
+          <div className="grid grid-cols-2 md:grid-cols-5 gap-4 mb-4 shrink-0">
+            <div>
+              <label className="text-sm font-medium mb-1 block">
+                Sample size (%)
+              </label>
+              <Input
+                type="number"
+                min={0}
+                max={100}
+                value={sampleSizePercent}
+                onChange={(e) =>
+                  setSampleSizePercent(
+                    Math.min(100, Math.max(0, parseInt(e.target.value, 10) || 0))
+                  )
+                }
+              />
+            </div>
+            <MultiSelect
+              label="Data status"
+              options={DATA_STATUS_OPTIONS}
+              value={dataStatus}
+              onChange={setDataStatus}
+              searchPlaceholder="Search..."
+              emptyMessage="No option found."
+            />
+            <div>
+              <label className="text-sm font-medium mb-1 block">
+                <span className="inline-flex items-center gap-1">
+                  Status
+                  <TooltipProvider>
+                    <Tooltip delayDuration={200}>
+                      <TooltipTrigger asChild>
+                        <HelpCircle className="h-3.5 w-3.5 text-muted-foreground cursor-help shrink-0" />
+                      </TooltipTrigger>
+                      <TooltipContent
+                        side="top"
+                        align="start"
+                        className="max-w-[280px] space-y-1.5 p-3"
+                      >
+                        {UIPageStatus.map(({ value, label }) => (
+                          <p key={value} className="text-xs leading-snug">
+                            <span className="font-medium">{label}:</span>{" "}
+                            {STATUS_HELP[value]}
+                          </p>
+                        ))}
+                      </TooltipContent>
+                    </Tooltip>
+                  </TooltipProvider>
+                </span>
+              </label>
+              <MultiSelect
+                options={UIPageStatus}
+                value={statuses}
+                onChange={setStatuses}
+                searchPlaceholder="Search status..."
+                emptyMessage="No status found."
+              />
+            </div>
+            <div>
+              <label className="text-sm font-medium mb-1 block">Sort by</label>
+              <Select
+                value={sortBy}
+                onValueChange={(
+                  v:
+                    | "random"
+                    | "most_expensive"
+                    | "most_data_items"
+                    | "least_data_items"
+                ) => setSortBy(v)}
+              >
+                <SelectTrigger>
+                  <SelectValue />
+                </SelectTrigger>
+                <SelectContent>
+                  {SORT_OPTIONS.map((opt) => (
+                    <SelectItem key={opt.value} value={opt.value}>
+                      {opt.label}
+                    </SelectItem>
+                  ))}
+                </SelectContent>
+              </Select>
+            </div>
+            <div className="flex items-end col-span-2 md:col-span-1">
+              <Button
+                onClick={onApplyFilter}
+                disabled={sampleQuery.isFetching}
+              >
+                {sampleQuery.isFetching ? "Applying..." : "Apply filter"}
+              </Button>
+            </div>
+          </div>
+
+          {filtersDirty && (
+            <div className="mb-4 px-4 py-3 rounded-md bg-muted border text-muted-foreground text-sm shrink-0">
+              Filters have changed. Click Apply filter to update results.
+            </div>
+          )}
+
+          {/* Table section */}
+          <div className="flex-1 min-h-0 border rounded-md overflow-hidden">
+            {!appliedFilters ? (
+              <div className="h-full flex items-center justify-center text-muted-foreground">
+                Click Apply Filter
+              </div>
+            ) : sampleQuery.isLoading ? (
+              <div className="h-full flex items-center justify-center text-muted-foreground">
+                Loading...
+              </div>
+            ) : sampledPages.length === 0 ? (
+              <div className="h-full flex items-center justify-center text-muted-foreground">
+                {sampleQuery.isError ? "Failed to load the sample. Click Apply filter to retry." : "No pages match the filter"}
+              </div>
+            ) : (
+              <div className="h-full overflow-auto">
+                <Table>
+                  <TableHeader>
+                    <TableRow className="text-xs">
+                      <TableHead>ID</TableHead>
+                      <TableHead>Status</TableHead>
+                      <TableHead>Created</TableHead>
+                      <TableHead className="max-w-[200px]">URL</TableHead>
+                      <TableHead className="text-right">Data items</TableHead>
+                      <TableHead className="text-right">Tokens used</TableHead>
+                      <TableHead className="text-right w-[140px]">
+                        Actions
+                      </TableHead>
+                    </TableRow>
+                  </TableHeader>
+                  <TableBody className="text-xs">
+                    {sampledPages.map((page) => (
+                      <TableRow key={page.id}>
+                        <TableCell>{page.id}</TableCell>
+                        <TableCell>{page.status}</TableCell>
+                        <TableCell>{concisePrintDate(page.createdAt)}</TableCell>
+                        <TableCell className="max-w-[200px] truncate" title={page.url}>
+                          {page.url}
+                        </TableCell>
+                        <TableCell className="text-right">
+                          {page.dataItemCount ?? 0}
+                        </TableCell>
+                        <TableCell className="text-right">
+                          {formatThousands(page.tokenSum)}
+                        </TableCell>
+                        <TableCell className="text-right">
+                          <div className="flex gap-1 justify-end">
+                            <Button
+                              variant="ghost"
+                              size="icon"
+                              className="h-7 w-7"
+                              title="Open catalogue URL in new tab"
+                              onClick={() =>
+                                window.open(
+                                  recipeUrl
+                                    ? resolveCrawlPageUrl(page.url, recipeUrl)
+                                    : page.url,
+                                  "_blank",
+                                  "noopener,noreferrer"
+                                )
+                              }
+                            >
+                              <ExternalLinkIcon className="h-3.5 w-3.5" />
+                            </Button>
+                            <Button
+                              variant="ghost"
+                              size="icon"
+                              className="h-7 w-7"
+                              title="Show extracted data"
+                              asChild
+                            >
+                              <a
+                                href={`/extractions/${extractionId}/steps/${page.crawlStepId}/items/${page.id}?tabName=data`}
+                                target="_blank"
+                                rel="noreferrer"
+                              >
+                                <FileJson className="h-3.5 w-3.5" />
+                              </a>
+                            </Button>
+                            <Button
+                              variant="ghost"
+                              size="icon"
+                              className="h-7 w-7"
+                              title="Show simplified content"
+                              asChild
+                            >
+                              <a
+                                href={`/extractions/${extractionId}/steps/${page.crawlStepId}/items/${page.id}?tabName=simplified_content`}
+                                target="_blank"
+                                rel="noreferrer"
+                              >
+                                <FileTextIcon className="h-3.5 w-3.5" />
+                              </a>
+                            </Button>
+                            <Button
+                              variant="ghost"
+                              size="icon"
+                              className="h-7 w-7"
+                              title="Show operation logs"
+                              asChild
+                            >
+                              <a
+                                href={`/extractions/${extractionId}/steps/${page.crawlStepId}/items/${page.id}?tabName=operation_logs`}
+                                target="_blank"
+                                rel="noreferrer"
+                              >
+                                <ScrollTextIcon className="h-3.5 w-3.5" />
+                              </a>
+                            </Button>
+                          </div>
+                        </TableCell>
+                      </TableRow>
+                    ))}
+                  </TableBody>
+                </Table>
+              </div>
+            )}
+          </div>
+        </div>
+      </DialogContent>
+    </Dialog>
+  );
+}
diff --git a/client/src/components/app/extractions/detail.tsx b/client/src/components/app/extractions/detail.tsx
index eef4c9b..124318c 100644
--- a/client/src/components/app/extractions/detail.tsx
+++ b/client/src/components/app/extractions/detail.tsx
@@ -38,12 +38,13 @@ import {
   resolveCrawlPageUrl,
   trpc,
 } from "@/utils";
-import { CookingPot, LibraryBig, List } from "lucide-react";
+import { CookingPot, LibraryBig, List, Pipette } from "lucide-react";
 import { useState } from "react";
 import { Bar, BarChart, XAxis, YAxis } from "recharts";
 import { Link, useLocation, useParams } from "wouter";
 import { displayRecipeDetails } from "../recipes/util";
 import AuditLogModal from "./AuditLogModal";
+import SampleModal from "./SampleModal";
 import { displayStepType } from "./utils";
 
 function displayStepParent(steps: CrawlStep[], parentId: number) {
@@ -139,6 +140,7 @@ export default function ExtractionDetail() {
   const [lockedCancel, setLockedCancel] = useState(true);
   const [lockedDelete, setLockDelete] = useState(true);
   const [auditLogModalOpen, setAuditLogModalOpen] = useState(false);
+  const [sampleModalOpen, setSampleModalOpen] = useState(false);
   const { toast } = useToast();
   const [, navigate] = useLocation();
   const query = trpc.extractions.detail.useQuery(
@@ -898,8 +900,16 @@ export default function ExtractionDetail() {
           ) : null}
         </div>
         <Card className="mt-4">
-          <CardHeader>
+          <CardHeader className="flex flex-row items-center justify-between space-y-0">
             <CardTitle>Extraction Steps</CardTitle>
+            <Button
+              variant="outline"
+              size="sm"
+              onClick={() => setSampleModalOpen(true)}
+            >
+              <Pipette className="w-3.5 h-3.5 mr-2" />
+              Sample
+            </Button>
           </CardHeader>
           <CardContent>
             <Tabs defaultValue="table">
@@ -987,6 +997,13 @@ export default function ExtractionDetail() {
         open={auditLogModalOpen}
         onOpenChange={setAuditLogModalOpen}
       />
+      <SampleModal
+        extractionId={extractionIdNum}
+        extractionStatus={extraction.status}
+        open={sampleModalOpen}
+        onOpenChange={setSampleModalOpen}
+        recipeUrl={extraction.recipe?.url}
+      />
     </>
   );
 }
diff --git a/client/src/utils.ts b/client/src/utils.ts
index 39fe3f2..a51160f 100644
--- a/client/src/utils.ts
+++ b/client/src/utils.ts
@@ -30,35 +30,12 @@ export type CrawlPage = ItemType<
 type DatasetItemsResponse = Exclude<RouterOutput["datasets"]["items"], null>;
 export type DatasetItem = ItemType<DatasetItemsResponse["items"]["results"]>;
 
-export enum ExtractionStatus {
-  WAITING = "WAITING",
-  IN_PROGRESS = "IN_PROGRESS",
-  COMPLETE = "COMPLETE",
-  STALE = "STALE",
-  CANCELLED = "CANCELLED",
-}
-
-export enum PageStatus {
-  WAITING = "WAITING",
-  IN_PROGRESS = "IN_PROGRESS",
-  DOWNLOADED = "DOWNLOADED",
-  SUCCESS = "SUCCESS",
-  EXTRACTED_NO_DATA = "EXTRACTED_NO_DATA",
-  ERROR = "ERROR",
-}
-
-export enum RecipeDetectionStatus {
-  WAITING = "WAITING",
-  IN_PROGRESS = "IN_PROGRESS",
-  SUCCESS = "SUCCESS",
-  ERROR = "ERROR",
-}
-
-export enum Step {
-  FETCH_ROOT = "FETCH_ROOT",
-  FETCH_PAGINATED = "FETCH_PAGINATED",
-  FETCH_LINKS = "FETCH_LINKS",
-}
+export {
+  ExtractionStatus,
+  PageStatus,
+  RecipeDetectionStatus,
+  Step,
+} from "../../common/types";
 
 export function cn(...inputs: ClassValue[]) {
   return twMerge(clsx(inputs));
diff --git a/common/types.ts b/common/types.ts
index 12182b8..9df6bfa 100644
--- a/common/types.ts
+++ b/common/types.ts
@@ -1,3 +1,5 @@
+import { toTitleCase } from "./utils";
+
 export enum CatalogueType {
   COURSES = "COURSES",
   LEARNING_PROGRAMS = "LEARNING_PROGRAMS",
@@ -112,6 +114,13 @@ export enum PageStatus {
   ERROR = "ERROR",
 }
 
+/** One `{ value, label }` option per `PageStatus`; the label is the enum value title-cased with "_" as spaces. */
+export const UIPageStatus: { value: PageStatus; label: string }[] = Object.values(
+  PageStatus
+).map((status) => {
+  return { value: status, label: toTitleCase(`${status}`.replace(/_/g, " ")) };
+});
+
 export enum RecipeDetectionStatus {
   WAITING = "WAITING",
   IN_PROGRESS = "IN_PROGRESS",
diff --git a/common/utils.ts b/common/utils.ts
new file mode 100644
index 0000000..9994154
--- /dev/null
+++ b/common/utils.ts
@@ -0,0 +1,6 @@
+/** Upper-cases the first character of every space-separated word and lower-cases the rest. */
+export function toTitleCase(str: string): string {
+  const capitalize = (word: string) =>
+    word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
+  return str.split(" ").map(capitalize).join(" ");
+}
diff --git a/server/src/data/extractions.ts b/server/src/data/extractions.ts
index fb79877..d15fd50 100644
--- a/server/src/data/extractions.ts
+++ b/server/src/data/extractions.ts
@@ -831,8 +831,7 @@ export async function createModelApiCallLog(
   callSite: string,
   inputTokenCount: number,
   outputTokenCount: number,
-  datasetId?: number,
-  crawlPageId?: number
+  options?: { datasetId?: number; crawlPageId?: number }
 ) {
   const result = await db
     .insert(modelApiCalls)
@@ -843,8 +842,8 @@ export async function createModelApiCallLog(
       callSite,
       input_token_count: inputTokenCount,
       output_token_count: outputTokenCount,
-      datasetId,
-      crawlPageId,
+      datasetId: options?.datasetId,
+      crawlPageId: options?.crawlPageId,
     })
     .returning();
   return result[0];
@@ -923,7 +922,7 @@ export async function findFailedAndNoDataPageIds(crawlStepId: number) {
       )
     )
     .groupBy(crawlPages.id)
-    .having(sql`count(${dataItems.id}) = 0`);
+    .having(eq(count(dataItems.id), 0));
 
   return [...new Set(failedIds.concat(noDataIds).map((p) => p.id))];
 }
@@ -931,3 +930,10 @@ export async function findFailedAndNoDataPageIds(crawlStepId: number) {
 export async function destroyExtraction(id: number) {
   return db.delete(extractions).where(eq(extractions.id, id));
 }
+
+export {
+  findSampledPagesForExtraction,
+  type SamplePagesOptions,
+  type SampleSortOption,
+  type SampledPageRow,
+} from "./extractionsSample";
diff --git a/server/src/data/extractionsSample.ts b/server/src/data/extractionsSample.ts
new file mode 100644
index 0000000..c50d2c4
--- /dev/null
+++ b/server/src/data/extractionsSample.ts
@@ -0,0 +1,161 @@
+/**
+ * Sample pages query for extractions. Uses raw SQL to ensure correct column
+ * aliases (camelCase), data status filtering, and minimal data selection.
+ * All filtering, aggregation, and sampling is done in the database.
+ */
+import { sql } from "drizzle-orm";
+import { PageStatus } from "../../../common/types";
+import db from ".";
+
+/** Supported orderings for sampled pages; `SampleSortOption` is derived from this list. */
+const VALID_SORT = [
+  "random",
+  "most_expensive",
+  "most_data_items",
+  "least_data_items",
+] as const;
+
+/** How sampled pages are ranked before the sample-size cut-off is applied. */
+export type SampleSortOption = (typeof VALID_SORT)[number];
+
+/** Every value of the `PageStatus` enum, used to discard unknown statuses. */
+const VALID_STATUSES = new Set<string>(Object.values(PageStatus));
+
+export interface SamplePagesOptions {
+  extractionId: number; // extraction whose crawl pages are sampled
+  sampleSizePercent: number; // percentage of matching pages to keep, rounded up
+  dataStatus: ("present" | "absent")[]; // both or neither listed = no data filter
+  statuses: PageStatus[]; // page statuses to include; an empty list yields no rows
+  sortBy: SampleSortOption; // ranking applied before the sample cut-off
+}
+
+export interface SampledPageRow {
+  id: number; // crawl_pages.id
+  extractionId: number; // crawl_pages.extraction_id
+  crawlStepId: number; // crawl_pages.crawl_step_id
+  url: string;
+  status: string;
+  createdAt: Date; // crawl_pages.created_at; NOTE(review): assumes the driver returns a Date — confirm
+  dataItemCount: number; // data_items rows for this page in the extraction's latest dataset
+  tokenSum: number; // input + output tokens summed over the page's model_api_calls rows
+}
+
+/**
+ * Fetches a sampled subset of crawl pages for an extraction.
+ *
+ * Statuses outside `PageStatus` are dropped (none left returns `[]`) and an
+ * unrecognised `sortBy` falls back to "random". Listing both or neither data
+ * status disables that filter. At most CEIL(matching rows * percent / 100)
+ * rows are returned; `sampleSizePercent` is not range-checked in this function.
+ */
+export async function findSampledPagesForExtraction(
+  opts: SamplePagesOptions
+): Promise<SampledPageRow[]> {
+  const statuses = (opts.statuses as string[]).filter((s) => VALID_STATUSES.has(s));
+  if (statuses.length === 0) {
+    return []; // also avoids building an empty `IN ()` list below
+  }
+
+  const sortBy = VALID_SORT.includes(opts.sortBy) ? opts.sortBy : "random";
+
+  const hasPresent = opts.dataStatus.includes("present");
+  const hasAbsent = opts.dataStatus.includes("absent");
+  const dataStatusFilter =
+    (hasPresent && hasAbsent) || (!hasPresent && !hasAbsent)
+      ? undefined
+      : hasPresent && !hasAbsent
+        ? "present"
+        : "absent";
+
+  const noDataStatusFilter = dataStatusFilter === undefined; // these three flags are interpolated into the query
+  const filterPresent = dataStatusFilter === "present";
+  const filterAbsent = dataStatusFilter === "absent";
+
+  const statusInClause = sql.join(
+    statuses.map((s) => sql`${s}`),
+    sql`, `
+  );
+
+  /*
+  SQL injection safety:
+  Every ${...} below is interpolated through Drizzle's `sql` tag, never by string concatenation.
+  From the docs (https://orm.drizzle.team/docs/sql):
+    Additionally, any dynamic parameters such as ${id} will be mapped to the $1 placeholder,
+    and the corresponding values will be moved to an array of values that are passed separately to the database.
+    This approach effectively prevents any potential SQL Injection vulnerabilities.
+  */
+  const result = await db.execute(sql`
+    WITH latest_dataset AS (
+      SELECT id FROM datasets
+      WHERE extraction_id = ${opts.extractionId}
+      ORDER BY created_at DESC
+      LIMIT 1
+    ),
+    base AS (
+      SELECT
+        cp.id,
+        cp.extraction_id AS "extractionId",
+        cp.crawl_step_id AS "crawlStepId",
+        cp.url,
+        cp.status,
+        cp.created_at AS "createdAt",
+        COALESCE((
+          SELECT COUNT(*)::integer
+          FROM data_items di
+          WHERE di.crawl_page_id = cp.id
+            AND di.dataset_id = (SELECT id FROM latest_dataset)
+        ), 0) AS "dataItemCount",
+        COALESCE((
+          SELECT SUM(mac.input_token_count + mac.output_token_count)::integer
+          FROM model_api_calls mac
+          WHERE mac.crawl_page_id = cp.id
+        ), 0) AS "tokenSum"
+      FROM crawl_pages cp
+      WHERE cp.extraction_id = ${opts.extractionId}
+        AND cp.status IN (${statusInClause})
+        AND (
+          ${noDataStatusFilter}
+          OR (${filterPresent} AND COALESCE((
+            SELECT COUNT(*)::integer FROM data_items di
+            WHERE di.crawl_page_id = cp.id
+              AND di.dataset_id = (SELECT id FROM latest_dataset)
+          ), 0) > 0)
+          OR (${filterAbsent} AND COALESCE((
+            SELECT COUNT(*)::integer FROM data_items di
+            WHERE di.crawl_page_id = cp.id
+              AND di.dataset_id = (SELECT id FROM latest_dataset)
+          ), 0) = 0)
+        )
+    ),
+    with_total AS (
+      SELECT *, COUNT(*) OVER () AS total FROM base
+    ),
+    numbered AS (
+      SELECT *,
+        ROW_NUMBER() OVER (
+          ORDER BY
+            CASE WHEN ${sortBy} = 'random' THEN random() END,
+            CASE WHEN ${sortBy} = 'most_expensive' THEN "tokenSum" END DESC NULLS LAST,
+            CASE WHEN ${sortBy} = 'most_data_items' THEN "dataItemCount" END DESC NULLS LAST,
+            CASE WHEN ${sortBy} = 'least_data_items' THEN "dataItemCount" END ASC NULLS LAST
+        ) AS rn
+      FROM with_total
+    ),
+    limited AS (
+      SELECT id, "extractionId", "crawlStepId", url, status, "createdAt",
+             "dataItemCount", "tokenSum"
+      FROM numbered
+      WHERE total = 0 OR rn <= GREATEST(0, CEIL(total * ${opts.sampleSizePercent}::float / 100)::integer)
+    )
+    SELECT id, "extractionId", "crawlStepId", url, status, "createdAt",
+           "dataItemCount", "tokenSum"
+    FROM limited
+    ORDER BY
+      CASE WHEN ${sortBy} = 'random' THEN random() END,
+      CASE WHEN ${sortBy} = 'most_expensive' THEN "tokenSum" END DESC NULLS LAST,
+      CASE WHEN ${sortBy} = 'most_data_items' THEN "dataItemCount" END DESC NULLS LAST,
+      CASE WHEN ${sortBy} = 'least_data_items' THEN "dataItemCount" END ASC NULLS LAST
+  `);
+
+  return (result.rows ?? []) as unknown as SampledPageRow[]; // rows are cast, not validated
+}
diff --git a/server/src/data/schema.ts b/server/src/data/schema.ts
index 480a4c2..accbe15 100644
--- a/server/src/data/schema.ts
+++ b/server/src/data/schema.ts
@@ -332,6 +332,9 @@ const modelApiCalls = pgTable(
     extractionId: integer("extraction_id").references(() => extractions.id, {
       onDelete: "cascade",
     }),
+    crawlPageId: integer("crawl_page_id").references(() => crawlPages.id, {
+      onDelete: "cascade",
+    }),
     provider: providerEnum("provider").notNull(),
     model: providerModelEnum("model").notNull(),
     callSite: text("call_site").notNull(),
@@ -341,14 +344,11 @@ const modelApiCalls = pgTable(
     datasetId: integer("dataset_id").references(() => datasets.id, {
       onDelete: "cascade"
     }),
-    crawlPageId: integer("crawl_page_id").references(() => crawlPages.id, {
-      onDelete: "cascade",
-    }),
   },
   (t) => ({
     extractionIdx: index("model_api_calls_extraction_idx").on(t.extractionId),
-    datasetIdx: index("model_api_calls_datasaet_idx").on(t.datasetId),
     crawlPageIdx: index("model_api_calls_crawl_page_idx").on(t.crawlPageId),
+    datasetIdx: index("model_api_calls_datasaet_idx").on(t.datasetId),
   })
 );
 
@@ -357,14 +357,14 @@ const modelApiCallsRelations = relations(modelApiCalls, ({ one }) => ({
     fields: [modelApiCalls.extractionId],
     references: [extractions.id],
   }),
-  dataset: one(datasets, {
-    fields: [modelApiCalls.datasetId],
-    references: [datasets.id]
-  }),
   crawlPage: one(crawlPages, {
     fields: [modelApiCalls.crawlPageId],
     references: [crawlPages.id],
   }),
+  dataset: one(datasets, {
+    fields: [modelApiCalls.datasetId],
+    references: [datasets.id]
+  }),
 }));
 
 const extractionLogs = pgTable(
@@ -480,6 +480,7 @@ const crawlPageRelations = relations(crawlPages, ({ one, many }) => ({
   }),
   dataItems: many(dataItems),
   extractionLogs: many(extractionLogs),
+  modelApiCalls: many(modelApiCalls),
 }));
 
 const datasets = pgTable(
diff --git a/server/src/extraction/llm/determinePresenceOfEntity.ts b/server/src/extraction/llm/determinePresenceOfEntity.ts
index f8c528d..6f5e08d 100644
--- a/server/src/extraction/llm/determinePresenceOfEntity.ts
+++ b/server/src/extraction/llm/determinePresenceOfEntity.ts
@@ -84,6 +84,7 @@ ${MD_END}
     logApiCall: options?.logApiCalls
       ? {
           extractionId: options.logApiCalls.extractionId,
+          datasetId: options.logApiCalls.datasetId,
           crawlPageId: options.logApiCalls.crawlPageId,
           callSite: "determinePresenceOfEntity",
         }
diff --git a/server/src/extraction/llm/exploreAdditionalPages.ts b/server/src/extraction/llm/exploreAdditionalPages.ts
index dbf3725..81a1619 100644
--- a/server/src/extraction/llm/exploreAdditionalPages.ts
+++ b/server/src/extraction/llm/exploreAdditionalPages.ts
@@ -79,6 +79,7 @@ ${MD_END}
     logApiCall: options?.logApiCalls
       ? {
           extractionId: options.logApiCalls.extractionId,
+          datasetId: options.logApiCalls.datasetId,
           crawlPageId: options.logApiCalls.crawlPageId,
           callSite: "exploreAdditionalPages",
         }
diff --git a/server/src/extraction/llm/extractEntityData.ts b/server/src/extraction/llm/extractEntityData.ts
index e70439a..9a9ade7 100644
--- a/server/src/extraction/llm/extractEntityData.ts
+++ b/server/src/extraction/llm/extractEntityData.ts
@@ -276,11 +276,11 @@ ${basePrompt}
       model,
       logApiCall: options?.logApiCalls
         ? {
-            extractionId: options.logApiCalls.extractionId,
-            datasetId: options.logApiCalls.datasetId,
-            crawlPageId: options.logApiCalls.crawlPageId,
-            callSite: "extractEntityData",
-          }
+          extractionId: options.logApiCalls.extractionId,
+          datasetId: options.logApiCalls.datasetId,
+          crawlPageId: options.logApiCalls.crawlPageId,
+          callSite: "extractEntityData",
+        }
         : undefined,
     });
 
@@ -311,11 +311,11 @@ ${basePrompt}
       requiredParameters: ["items"],
       logApiCall: options?.logApiCalls
         ? {
-            extractionId: options.logApiCalls.extractionId,
-            datasetId: options.logApiCalls.datasetId,
-            crawlPageId: options.logApiCalls.crawlPageId,
-            callSite: "extractEntityData",
-          }
+          extractionId: options.logApiCalls.extractionId,
+          datasetId: options.logApiCalls.datasetId,
+          crawlPageId: options.logApiCalls.crawlPageId,
+          callSite: "extractEntityData",
+        }
         : undefined,
     };
 
diff --git a/server/src/extraction/resumeExtraction.ts b/server/src/extraction/resumeExtraction.ts
index 556f3fe..cb7c2f4 100644
--- a/server/src/extraction/resumeExtraction.ts
+++ b/server/src/extraction/resumeExtraction.ts
@@ -3,7 +3,6 @@ import { findLatestDataset } from "../data/datasets";
 import {
   createExtractionAuditLog,
   createExtractionLog,
-  findApiExtractionRootPage,
   findExtractionById,
   findInProgressPagesWithoutJobs,
   findPagesNeedingExtractData,
@@ -16,7 +15,6 @@ import {
   getPageIdsWithExistingJobs,
   Queues,
   REPEAT_UPDATE_COMPLETION_EVERY_MS,
-  submitJob,
   submitJobs,
   submitRepeatableJob,
 } from "../workers";
diff --git a/server/src/openai.ts b/server/src/openai.ts
index f79ae86..7b60072 100644
--- a/server/src/openai.ts
+++ b/server/src/openai.ts
@@ -99,6 +99,7 @@ export async function simpleToolCompletion<
   logApiCall?: {
     callSite: string;
     extractionId: number;
+    datasetId?: number;
     crawlPageId?: number;
   };
 }): Promise<{
@@ -195,8 +196,10 @@ export async function simpleToolCompletion<
         options.logApiCall.callSite,
         inputTokenCount,
         outputTokenCount,
-        undefined,
-        options.logApiCall.crawlPageId
+        {
+          datasetId: options.logApiCall.datasetId,
+          crawlPageId: options.logApiCall.crawlPageId,
+        }
       );
     }
 
@@ -326,8 +329,10 @@ export async function structuredCompletion<
         options.logApiCall.callSite,
         inputTokenCount,
         outputTokenCount,
-        options.logApiCall.datasetId,
-        options.logApiCall.crawlPageId
+        {
+          datasetId: options.logApiCall.datasetId,
+          crawlPageId: options.logApiCall.crawlPageId,
+        }
       );
     }
 
diff --git a/server/src/routers/extractions.ts b/server/src/routers/extractions.ts
index 0942f2c..8dd1f7e 100644
--- a/server/src/routers/extractions.ts
+++ b/server/src/routers/extractions.ts
@@ -3,6 +3,7 @@ import { publicProcedure, router } from ".";
 import {
   CatalogueType,
   ExtractionStatus,
+  PageStatus,
   ProviderModel,
 } from "../../../common/types";
 import { AppError, AppErrors } from "../appErrors";
@@ -24,7 +25,9 @@ import {
   findPage,
   findPageForJob,
   findPagesPaginated,
+  findSampledPagesForExtraction,
   findStep,
+  getApiCallSummary,
   getExtractionCount,
   getLogCount,
   getPageCount,
@@ -130,17 +133,30 @@ export const extractionsRouter = router({
       })
     )
     .query(async (opts) => {
-      let result = await findExtractionForDetailPage(opts.input.id);
+      const result = await findExtractionForDetailPage(opts.input.id);
       if (!result) {
         throw new AppError("Extraction not found", AppErrors.NOT_FOUND);
       }
       const datasets = await findExtractionDatasets(opts.input.id);
       const lastAuditLog = await findLastAuditLogEntry(opts.input.id);
+
+      const apiSummary = await getApiCallSummary(opts.input.id);
+      const totalInputTokens = apiSummary.reduce(
+        (sum, s) => sum + Number(s.totalInputTokens ?? 0),
+        0
+      );
+      const totalOutputTokens = apiSummary.reduce(
+        (sum, s) => sum + Number(s.totalOutputTokens ?? 0),
+        0
+      );
+
       return {
         ...result,
         datasets,
         latestDataset: datasets[0],
         lastAuditLog,
+        totalInputTokens,
+        totalOutputTokens,
       };
     }),
   destroy: publicProcedure
@@ -308,6 +324,26 @@ export const extractionsRouter = router({
     .query(async (opts) => {
       return findLogsByCrawlPageId(opts.input.crawlPageId);
     }),
+  // Returns a sample of an extraction's crawl pages, narrowed by page status and
+  // data presence, ordered by `sortBy`, and capped at `sampleSizePercent` percent.
+  samplePages: publicProcedure
+    .input(
+      z.object({
+        extractionId: z.number().int().positive(),
+        sampleSizePercent: z.number().min(0).max(100),
+        dataStatus: z.array(z.enum(["present", "absent"])),
+        statuses: z.array(z.nativeEnum(PageStatus)),
+        sortBy: z.enum(["random", "most_expensive", "most_data_items", "least_data_items"]),
+        // Part of the input contract but never passed on to the data layer.
+        applyKey: z.number().optional(),
+      })
+    )
+    .query(async ({ input }) => {
+      // Forward only the sampler's own options; `applyKey` is not forwarded.
+      const { extractionId, sampleSizePercent, dataStatus, statuses, sortBy } = input;
+      const sampleOptions = { extractionId, sampleSizePercent, dataStatus, statuses, sortBy };
+      return findSampledPagesForExtraction(sampleOptions);
+    }),
   simulateDataExtraction: publicProcedure
     .input(
       z.object({