From 8cde3cadc60357d80878bbfa52f7111578debdb8 Mon Sep 17 00:00:00 2001 From: Alex Culea <195758113+alexculealt@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:34:18 +0000 Subject: [PATCH 1/3] Fix simplified content breaking page UI due to line length From 5117204190cf1520c0ab0eeb18b6f60b51457195 Mon Sep 17 00:00:00 2001 From: Alex Culea <195758113+alexculealt@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:57:16 +0000 Subject: [PATCH 2/3] Add URL based state for the tab navigation of the crawl page detail screen --- .../src/components/app/extractions/page.tsx | 53 ++++++++++++++++--- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/client/src/components/app/extractions/page.tsx b/client/src/components/app/extractions/page.tsx index 6ef03b9..7368cf9 100644 --- a/client/src/components/app/extractions/page.tsx +++ b/client/src/components/app/extractions/page.tsx @@ -18,12 +18,26 @@ import { import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; import { concisePrintDate, prettyPrintDate, resolveCrawlPageUrl, trpc } from "@/utils"; import { ExternalLink } from "lucide-react"; -import { useState } from "react"; -import { useParams } from "wouter"; +import { useEffect, useState } from "react"; +import { useLocation, useParams, useSearch } from "wouter"; import { base64Img } from "./utils"; +const DEFAULT_TAB = "data"; + +const VALID_TAB_VALUES = [ + "data", + "raw_content", + "screenshot", + "simplified_content", + "operation_logs", +]; + export default function CrawlPageDetail() { const { extractionId, stepId, crawlPageId } = useParams(); + const [, navigate] = useLocation(); + const search = useSearch(); + const basePath = "~" + new URL(window.location.href).pathname; + const crawlPageQuery = trpc.extractions.crawlPageDetail.useQuery( { crawlPageId: parseInt(crawlPageId || "") }, { enabled: !!crawlPageId } @@ -40,7 +54,22 @@ export default function CrawlPageDetail() { (typeof simulateExtractionQuery)["mutateAsync"] > | null> >(null); - if (!crawlPageQuery.data) { + + const item = crawlPageQuery.data; + + useEffect(() => { + const sp = new URLSearchParams(search || ""); + const tabName = sp.get("tabName"); + const shouldRedirect = + !tabName || !VALID_TAB_VALUES.includes(tabName || ""); + + if (shouldRedirect) { + sp.set("tabName", DEFAULT_TAB); + navigate(`${basePath}?${sp.toString()}`, { replace: true }); + } + }, [basePath, navigate, search]); + + if (!item) { return null; } @@ -53,8 +82,6 @@ export default function CrawlPageDetail() { } }; - const item = crawlPageQuery.data; - const breadCrumbs = [ { label: "Extractions", href: "/" }, { label: `Extraction #${extractionId}`, href: `/${extractionId}` }, @@ -123,8 +150,6 @@ export default function CrawlPageDetail() { , ]; - const defaultTab = "data"; - if (item.markdownContent) { tabTriggers.push( Simplified Content @@ -191,6 +216,18 @@ export default function CrawlPageDetail() { tabContents.splice(1, 0, {screenshot}); } + const tabNameFromUrl = new URLSearchParams(search || "").get("tabName"); + if (tabNameFromUrl && !VALID_TAB_VALUES.includes(tabNameFromUrl)) { + return null; + } + const currentTab = tabNameFromUrl ?? DEFAULT_TAB; + + const onTabChange = (value: string) => { + const sp = new URLSearchParams(search || ""); + sp.set("tabName", value); + navigate(`${basePath}?${sp.toString()}`); + }; + const formattedSimulatedData = simulatedExtractedData?.data ? JSON.stringify(simulatedExtractedData?.data, null, 2) : null; @@ -259,7 +296,7 @@ export default function CrawlPageDetail() { )} - + {tabTriggers}
{tabContents} From 8fbf1e98b3db1308fdecd58ea53babc1b5f522b7 Mon Sep 17 00:00:00 2001 From: Alex Culea <195758113+alexculealt@users.noreply.github.com> Date: Tue, 17 Mar 2026 18:00:25 +0000 Subject: [PATCH 3/3] Add sample tool to extractions --- .../app/extractions/SampleModal.tsx | 531 ++++++++++++++++++ .../src/components/app/extractions/detail.tsx | 21 +- client/src/utils.ts | 35 +- common/types.ts | 9 + common/utils.ts | 6 + server/src/data/extractions.ts | 16 +- server/src/data/extractionsSample.ts | 161 ++++++ server/src/data/schema.ts | 17 +- .../llm/determinePresenceOfEntity.ts | 1 + .../extraction/llm/exploreAdditionalPages.ts | 1 + .../src/extraction/llm/extractEntityData.ts | 20 +- server/src/extraction/resumeExtraction.ts | 2 - server/src/openai.ts | 13 +- server/src/routers/extractions.ts | 38 +- 14 files changed, 810 insertions(+), 61 deletions(-) create mode 100644 client/src/components/app/extractions/SampleModal.tsx create mode 100644 common/utils.ts create mode 100644 server/src/data/extractionsSample.ts diff --git a/client/src/components/app/extractions/SampleModal.tsx b/client/src/components/app/extractions/SampleModal.tsx new file mode 100644 index 0000000..ab8d860 --- /dev/null +++ b/client/src/components/app/extractions/SampleModal.tsx @@ -0,0 +1,531 @@ +import { Button } from "@/components/ui/button"; +import { Checkbox } from "@/components/ui/checkbox"; +import { + Command, + CommandEmpty, + CommandGroup, + CommandInput, + CommandItem, + CommandList, +} from "@/components/ui/command"; +import { + Dialog, + DialogContent, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog"; +import { Input } from "@/components/ui/input"; +import { + Popover, + PopoverContent, + PopoverTrigger, +} from "@/components/ui/popover"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, +} from "@/components/ui/table"; +import { + Tooltip, + TooltipContent, + TooltipProvider, + TooltipTrigger, +} from "@/components/ui/tooltip"; +import { UIPageStatus } from "@common/types"; +import { concisePrintDate, resolveCrawlPageUrl, trpc } from "@/utils"; +import { ExtractionStatus, PageStatus } from "@/utils"; +import { + ChevronsUpDown, + ExternalLink as ExternalLinkIcon, + FileJson, + FileText as FileTextIcon, + HelpCircle, + ScrollText as ScrollTextIcon, +} from "lucide-react"; +import { useCallback, useEffect, useMemo, useState } from "react"; + +interface MultiSelectOption { + value: T; + label: string; +} + +interface MultiSelectProps { + label?: string; + options: MultiSelectOption[]; + value: T[]; + onChange: (value: T[]) => void; + searchPlaceholder?: string; + emptyMessage?: string; +} + +function MultiSelect({ + label, + options, + value, + onChange, + searchPlaceholder = "Search...", + emptyMessage = "No results found.", +}: MultiSelectProps) { + const [open, setOpen] = useState(false); + + const triggerLabel = + value.length === 0 + ? "None" + : value.length === options.length + ? "All" + : value + .map((v) => options.find((o) => o.value === v)?.label ?? v) + .join(", "); + + const toggle = (itemValue: T) => { + onChange( + value.includes(itemValue) + ? value.filter((v) => v !== itemValue) + : [...value, itemValue] + ); + }; + + const selectAll = () => onChange(options.map((o) => o.value)); + const selectNone = () => onChange([]); + + return ( +
+ {label && ( + + )} + + + + + + + + + {emptyMessage} + + + All + + + None + + {options.map((opt) => ( + toggle(opt.value)} + > + + {opt.label} + + ))} + + + + + +
+ ); +} + +const DATA_STATUS_OPTIONS: { value: "present" | "absent"; label: string }[] = [ + { value: "present", label: "Present" }, + { value: "absent", label: "Absent" }, +]; + +const SORT_OPTIONS = [ + { value: "random", label: "Random" }, + { value: "most_expensive", label: "Most expensive" }, + { value: "most_data_items", label: "Most data items" }, + { value: "least_data_items", label: "Least data items" }, +]; + +const STATUS_HELP: Record = { + [PageStatus.WAITING]: + "Queued — The page hasn't been processed yet; it's waiting its turn.", + [PageStatus.IN_PROGRESS]: + "In progress — The system is currently working on this page.", + [PageStatus.DOWNLOADED]: + "Downloaded — The page content was successfully retrieved from the website and is awaiting extraction.", + [PageStatus.SUCCESS]: + "Success — Useful information was successfully pulled from this page.", + [PageStatus.EXTRACTED_NO_DATA]: + "No data found — either the data is not present in the page, the model couldn't find the expected information or the page was not correctly converted to the simplified content form.", + [PageStatus.ERROR]: + "Error — Something went wrong (e.g. the page couldn't load or timed out).", +}; + +type SampledPage = { + id: number; + extractionId: number; + crawlStepId: number; + url: string; + status: string; + pageType: string | null; + createdAt: string; + dataItemCount: number; + tokenSum: number; +}; + +function formatThousands(n: number): string { + if (n === 0) return "0"; + const k = n / 1_000; + return k % 1 === 0 ? `${k}K` : `${k.toFixed(1)}K`; +} + +interface SampleModalProps { + open: boolean; + onOpenChange: (open: boolean) => void; + extractionId: number; + extractionStatus?: string; + /** Catalogue base URL for resolving relative crawl page URLs */ + recipeUrl?: string; +} + +export default function SampleModal({ + open, + onOpenChange, + extractionId, + extractionStatus, + recipeUrl, +}: SampleModalProps) { + const isExtractionInProgress = [ + ExtractionStatus.IN_PROGRESS, + ExtractionStatus.WAITING, + ].includes(extractionStatus as ExtractionStatus); + const [sampleSizePercent, setSampleSizePercent] = useState(5); + const [dataStatus, setDataStatus] = useState<("present" | "absent")[]>([ + "present", + ]); + const [statuses, setStatuses] = useState(() => + UIPageStatus.map((o) => o.value) + ); + const [sortBy, setSortBy] = useState< + "random" | "most_expensive" | "most_data_items" | "least_data_items" + >("random"); + + const [appliedFilters, setAppliedFilters] = useState<{ + sampleSizePercent: number; + dataStatus: ("present" | "absent")[]; + statuses: PageStatus[]; + sortBy: "random" | "most_expensive" | "most_data_items" | "least_data_items"; + applyKey: number; + } | null>(null); + + useEffect(() => { + if (!open) { + setAppliedFilters(null); + } + }, [open]); + + const filtersDirty = useMemo(() => { + if (!appliedFilters) return false; + const arrEq = (a: string[], b: string[]) => + a.length === b.length && a.every((v, i) => v === b[i]); + return ( + appliedFilters.sampleSizePercent !== sampleSizePercent || + !arrEq([...appliedFilters.dataStatus].sort(), [...dataStatus].sort()) || + !arrEq([...appliedFilters.statuses].sort(), [...statuses].sort()) || + appliedFilters.sortBy !== sortBy + ); + }, [appliedFilters, sampleSizePercent, dataStatus, statuses, sortBy]); + + const sampleQuery = trpc.extractions.samplePages.useQuery( + { + extractionId, + sampleSizePercent: appliedFilters?.sampleSizePercent ?? 5, + dataStatus: appliedFilters?.dataStatus ?? ["present"], + statuses: appliedFilters?.statuses ?? [], + sortBy: appliedFilters?.sortBy ?? "random", + applyKey: appliedFilters?.applyKey ?? 0, + }, + { + enabled: appliedFilters !== null && open, + } + ); + + const sampledPages = (sampleQuery.data ?? []) as SampledPage[]; + + const onApplyFilter = useCallback(() => { + setAppliedFilters((prev) => ({ + sampleSizePercent, + dataStatus: [...dataStatus], + statuses: [...statuses], + sortBy, + applyKey: (prev?.applyKey ?? 0) + 1, + })); + }, [sampleSizePercent, dataStatus, statuses, sortBy]); + + return ( + + e.preventDefault()} + > + + Sample extraction items + {appliedFilters && !sampleQuery.isLoading && ( +

+ {sampledPages.length} item{sampledPages.length !== 1 ? "s" : ""} +

+ )} +
+ + {isExtractionInProgress && ( +
+ Extraction is in progress. Data will change. +
+ )} + +
+ {/* Filter section */} +
+
+ + + setSampleSizePercent( + Math.min(100, Math.max(0, parseInt(e.target.value) || 0)) + ) + } + /> +
+ +
+ + +
+
+ + +
+
+ +
+
+ + {filtersDirty && ( +
+ Filters have changed. Click Apply filter to update results. +
+ )} + + {/* Table section */} +
+ {!appliedFilters ? ( +
+ Click Apply Filter +
+ ) : sampleQuery.isLoading ? ( +
+ Loading... +
+ ) : sampledPages.length === 0 ? ( +
+ No pages match the filter +
+ ) : ( +
+ + + + ID + Status + Created + URL + Data items + Tokens used + + Actions + + + + + {sampledPages.map((page) => ( + + {page.id} + {page.status} + {concisePrintDate(page.createdAt)} + + {page.url} + + + {page.dataItemCount ?? 0} + + + {formatThousands(page.tokenSum)} + + +
+ + + + +
+
+
+ ))} +
+
+
+ )} +
+
+
+
+ ); +} diff --git a/client/src/components/app/extractions/detail.tsx b/client/src/components/app/extractions/detail.tsx index eef4c9b..124318c 100644 --- a/client/src/components/app/extractions/detail.tsx +++ b/client/src/components/app/extractions/detail.tsx @@ -38,12 +38,13 @@ import { resolveCrawlPageUrl, trpc, } from "@/utils"; -import { CookingPot, LibraryBig, List } from "lucide-react"; +import { CookingPot, LibraryBig, List, Pipette } from "lucide-react"; import { useState } from "react"; import { Bar, BarChart, XAxis, YAxis } from "recharts"; import { Link, useLocation, useParams } from "wouter"; import { displayRecipeDetails } from "../recipes/util"; import AuditLogModal from "./AuditLogModal"; +import SampleModal from "./SampleModal"; import { displayStepType } from "./utils"; function displayStepParent(steps: CrawlStep[], parentId: number) { @@ -139,6 +140,7 @@ export default function ExtractionDetail() { const [lockedCancel, setLockedCancel] = useState(true); const [lockedDelete, setLockDelete] = useState(true); const [auditLogModalOpen, setAuditLogModalOpen] = useState(false); + const [sampleModalOpen, setSampleModalOpen] = useState(false); const { toast } = useToast(); const [, navigate] = useLocation(); const query = trpc.extractions.detail.useQuery( @@ -898,8 +900,16 @@ export default function ExtractionDetail() { ) : null}
- + Extraction Steps + @@ -987,6 +997,13 @@ export default function ExtractionDetail() { open={auditLogModalOpen} onOpenChange={setAuditLogModalOpen} /> + ); } diff --git a/client/src/utils.ts b/client/src/utils.ts index 39fe3f2..a51160f 100644 --- a/client/src/utils.ts +++ b/client/src/utils.ts @@ -30,35 +30,12 @@ export type CrawlPage = ItemType< type DatasetItemsResponse = Exclude; export type DatasetItem = ItemType; -export enum ExtractionStatus { - WAITING = "WAITING", - IN_PROGRESS = "IN_PROGRESS", - COMPLETE = "COMPLETE", - STALE = "STALE", - CANCELLED = "CANCELLED", -} - -export enum PageStatus { - WAITING = "WAITING", - IN_PROGRESS = "IN_PROGRESS", - DOWNLOADED = "DOWNLOADED", - SUCCESS = "SUCCESS", - EXTRACTED_NO_DATA = "EXTRACTED_NO_DATA", - ERROR = "ERROR", -} - -export enum RecipeDetectionStatus { - WAITING = "WAITING", - IN_PROGRESS = "IN_PROGRESS", - SUCCESS = "SUCCESS", - ERROR = "ERROR", -} - -export enum Step { - FETCH_ROOT = "FETCH_ROOT", - FETCH_PAGINATED = "FETCH_PAGINATED", - FETCH_LINKS = "FETCH_LINKS", -} +export { + ExtractionStatus, + PageStatus, + RecipeDetectionStatus, + Step, +} from "../../common/types"; export function cn(...inputs: ClassValue[]) { return twMerge(clsx(inputs)); diff --git a/common/types.ts b/common/types.ts index 12182b8..9df6bfa 100644 --- a/common/types.ts +++ b/common/types.ts @@ -1,3 +1,5 @@ +import { toTitleCase } from "./utils"; + export enum CatalogueType { COURSES = "COURSES", LEARNING_PROGRAMS = "LEARNING_PROGRAMS", @@ -112,6 +114,13 @@ export enum PageStatus { ERROR = "ERROR", } +export const UIPageStatus: { value: PageStatus; label: string }[] = ( + Object.values(PageStatus) as PageStatus[] +).map((value) => ({ + value, + label: toTitleCase(String(value).replace(/_/g, " ")), +})); + export enum RecipeDetectionStatus { WAITING = "WAITING", IN_PROGRESS = "IN_PROGRESS", diff --git a/common/utils.ts b/common/utils.ts new file mode 100644 index 0000000..9994154 --- /dev/null +++ b/common/utils.ts @@ -0,0 +1,6 @@ +export function toTitleCase(str: string): string { + return str + .split(" ") + .map((word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()) + .join(" "); +} diff --git a/server/src/data/extractions.ts b/server/src/data/extractions.ts index fb79877..d15fd50 100644 --- a/server/src/data/extractions.ts +++ b/server/src/data/extractions.ts @@ -831,8 +831,7 @@ export async function createModelApiCallLog( callSite: string, inputTokenCount: number, outputTokenCount: number, - datasetId?: number, - crawlPageId?: number + options?: { datasetId?: number; crawlPageId?: number } ) { const result = await db .insert(modelApiCalls) @@ -843,8 +842,8 @@ export async function createModelApiCallLog( callSite, input_token_count: inputTokenCount, output_token_count: outputTokenCount, - datasetId, - crawlPageId, + datasetId: options?.datasetId, + crawlPageId: options?.crawlPageId, }) .returning(); return result[0]; @@ -923,7 +922,7 @@ export async function findFailedAndNoDataPageIds(crawlStepId: number) { ) ) .groupBy(crawlPages.id) - .having(sql`count(${dataItems.id}) = 0`); + .having(eq(count(dataItems.id), 0)); return [...new Set(failedIds.concat(noDataIds).map((p) => p.id))]; } @@ -931,3 +930,10 @@ export async function findFailedAndNoDataPageIds(crawlStepId: number) { export async function destroyExtraction(id: number) { return db.delete(extractions).where(eq(extractions.id, id)); } + +export { + findSampledPagesForExtraction, + type SamplePagesOptions, + type SampleSortOption, + type SampledPageRow, +} from "./extractionsSample"; diff --git a/server/src/data/extractionsSample.ts b/server/src/data/extractionsSample.ts new file mode 100644 index 0000000..c50d2c4 --- /dev/null +++ b/server/src/data/extractionsSample.ts @@ -0,0 +1,161 @@ +/** + * Sample pages query for extractions. Uses raw SQL to ensure correct column + * aliases (camelCase), data status filtering, and minimal data selection. + * All filtering, aggregation, and sampling is done in the database. + */ +import { sql } from "drizzle-orm"; +import { PageStatus } from "../../../common/types"; +import db from "."; + +export type SampleSortOption = + | "random" + | "most_expensive" + | "most_data_items" + | "least_data_items"; + +const VALID_STATUSES = new Set(Object.values(PageStatus)); +const VALID_SORT: SampleSortOption[] = [ + "random", + "most_expensive", + "most_data_items", + "least_data_items", +]; + +export interface SamplePagesOptions { + extractionId: number; + sampleSizePercent: number; + dataStatus: ("present" | "absent")[]; + statuses: PageStatus[]; + sortBy: SampleSortOption; +} + +export interface SampledPageRow { + id: number; + extractionId: number; + crawlStepId: number; + url: string; + status: string; + createdAt: Date; + dataItemCount: number; + tokenSum: number; +} + +/** + * Fetches a sampled subset of crawl pages for an extraction. + * + * ```markdown + * **SQL Injection safety**: Numerical parameters are validated using Zod, + * if calling this function from outside, make sure to validate the input. + * ``` + */ +export async function findSampledPagesForExtraction( + opts: SamplePagesOptions +): Promise { + const statuses = (opts.statuses as string[]).filter((s) => VALID_STATUSES.has(s)); + if (statuses.length === 0) { + return []; + } + + const sortBy = VALID_SORT.includes(opts.sortBy) ? opts.sortBy : "random"; + + const hasPresent = opts.dataStatus.includes("present"); + const hasAbsent = opts.dataStatus.includes("absent"); + const dataStatusFilter = + (hasPresent && hasAbsent) || (!hasPresent && !hasAbsent) + ? undefined + : hasPresent && !hasAbsent + ? "present" + : "absent"; + + const noDataStatusFilter = dataStatusFilter === undefined; + const filterPresent = dataStatusFilter === "present"; + const filterAbsent = dataStatusFilter === "absent"; + + const statusInClause = sql.join( + statuses.map((s) => sql`${s}`), + sql`, ` + ); + + /* + SQL Injection safety: + Interpolated parameters are safe to use with Drizzle `sql` function. + From docs (https://orm.drizzle.team/docs/sql): + Additionally, any dynamic parameters such as ${id} will be mapped to the $1 placeholder, + and the corresponding values will be moved to an array of values that are passed separately to the database. + This approach effectively prevents any potential SQL Injection vulnerabilities. + */ + const result = await db.execute(sql` + WITH latest_dataset AS ( + SELECT id FROM datasets + WHERE extraction_id = ${opts.extractionId} + ORDER BY created_at DESC + LIMIT 1 + ), + base AS ( + SELECT + cp.id, + cp.extraction_id AS "extractionId", + cp.crawl_step_id AS "crawlStepId", + cp.url, + cp.status, + cp.created_at AS "createdAt", + COALESCE(( + SELECT COUNT(*)::integer + FROM data_items di + WHERE di.crawl_page_id = cp.id + AND di.dataset_id = (SELECT id FROM latest_dataset) + ), 0) AS "dataItemCount", + COALESCE(( + SELECT SUM(mac.input_token_count + mac.output_token_count)::integer + FROM model_api_calls mac + WHERE mac.crawl_page_id = cp.id + ), 0) AS "tokenSum" + FROM crawl_pages cp + WHERE cp.extraction_id = ${opts.extractionId} + AND cp.status IN (${statusInClause}) + AND ( + ${noDataStatusFilter} + OR (${filterPresent} AND COALESCE(( + SELECT COUNT(*)::integer FROM data_items di + WHERE di.crawl_page_id = cp.id + AND di.dataset_id = (SELECT id FROM latest_dataset) + ), 0) > 0) + OR (${filterAbsent} AND COALESCE(( + SELECT COUNT(*)::integer FROM data_items di + WHERE di.crawl_page_id = cp.id + AND di.dataset_id = (SELECT id FROM latest_dataset) + ), 0) = 0) + ) + ), + with_total AS ( + SELECT *, COUNT(*) OVER () AS total FROM base + ), + numbered AS ( + SELECT *, + ROW_NUMBER() OVER ( + ORDER BY + CASE WHEN ${sortBy} = 'random' THEN random() END, + CASE WHEN ${sortBy} = 'most_expensive' THEN "tokenSum" END DESC NULLS LAST, + CASE WHEN ${sortBy} = 'most_data_items' THEN "dataItemCount" END DESC NULLS LAST, + CASE WHEN ${sortBy} = 'least_data_items' THEN "dataItemCount" END ASC NULLS LAST + ) AS rn + FROM with_total + ), + limited AS ( + SELECT id, "extractionId", "crawlStepId", url, status, "createdAt", + "dataItemCount", "tokenSum" + FROM numbered + WHERE total = 0 OR rn <= GREATEST(0, CEIL(total * ${opts.sampleSizePercent}::float / 100)::integer) + ) + SELECT id, "extractionId", "crawlStepId", url, status, "createdAt", + "dataItemCount", "tokenSum" + FROM limited + ORDER BY + CASE WHEN ${sortBy} = 'random' THEN random() END, + CASE WHEN ${sortBy} = 'most_expensive' THEN "tokenSum" END DESC NULLS LAST, + CASE WHEN ${sortBy} = 'most_data_items' THEN "dataItemCount" END DESC NULLS LAST, + CASE WHEN ${sortBy} = 'least_data_items' THEN "dataItemCount" END ASC NULLS LAST + `); + + return (result.rows ?? []) as unknown as SampledPageRow[]; +} diff --git a/server/src/data/schema.ts b/server/src/data/schema.ts index 480a4c2..accbe15 100644 --- a/server/src/data/schema.ts +++ b/server/src/data/schema.ts @@ -332,6 +332,9 @@ const modelApiCalls = pgTable( extractionId: integer("extraction_id").references(() => extractions.id, { onDelete: "cascade", }), + crawlPageId: integer("crawl_page_id").references(() => crawlPages.id, { + onDelete: "cascade", + }), provider: providerEnum("provider").notNull(), model: providerModelEnum("model").notNull(), callSite: text("call_site").notNull(), @@ -341,14 +344,11 @@ const modelApiCalls = pgTable( datasetId: integer("dataset_id").references(() => datasets.id, { onDelete: "cascade" }), - crawlPageId: integer("crawl_page_id").references(() => crawlPages.id, { - onDelete: "cascade", - }), }, (t) => ({ extractionIdx: index("model_api_calls_extraction_idx").on(t.extractionId), - datasetIdx: index("model_api_calls_datasaet_idx").on(t.datasetId), crawlPageIdx: index("model_api_calls_crawl_page_idx").on(t.crawlPageId), + datasetIdx: index("model_api_calls_datasaet_idx").on(t.datasetId), }) ); @@ -357,14 +357,14 @@ const modelApiCallsRelations = relations(modelApiCalls, ({ one }) => ({ fields: [modelApiCalls.extractionId], references: [extractions.id], }), - dataset: one(datasets, { - fields: [modelApiCalls.datasetId], - references: [datasets.id] - }), crawlPage: one(crawlPages, { fields: [modelApiCalls.crawlPageId], references: [crawlPages.id], }), + dataset: one(datasets, { + fields: [modelApiCalls.datasetId], + references: [datasets.id] + }), })); const extractionLogs = pgTable( @@ -480,6 +480,7 @@ const crawlPageRelations = relations(crawlPages, ({ one, many }) => ({ }), dataItems: many(dataItems), extractionLogs: many(extractionLogs), + modelApiCalls: many(modelApiCalls), })); const datasets = pgTable( diff --git a/server/src/extraction/llm/determinePresenceOfEntity.ts b/server/src/extraction/llm/determinePresenceOfEntity.ts index f8c528d..6f5e08d 100644 --- a/server/src/extraction/llm/determinePresenceOfEntity.ts +++ b/server/src/extraction/llm/determinePresenceOfEntity.ts @@ -84,6 +84,7 @@ ${MD_END} logApiCall: options?.logApiCalls ? { extractionId: options.logApiCalls.extractionId, + datasetId: options.logApiCalls.datasetId, crawlPageId: options.logApiCalls.crawlPageId, callSite: "determinePresenceOfEntity", } diff --git a/server/src/extraction/llm/exploreAdditionalPages.ts b/server/src/extraction/llm/exploreAdditionalPages.ts index dbf3725..81a1619 100644 --- a/server/src/extraction/llm/exploreAdditionalPages.ts +++ b/server/src/extraction/llm/exploreAdditionalPages.ts @@ -79,6 +79,7 @@ ${MD_END} logApiCall: options?.logApiCalls ? { extractionId: options.logApiCalls.extractionId, + datasetId: options.logApiCalls.datasetId, crawlPageId: options.logApiCalls.crawlPageId, callSite: "exploreAdditionalPages", } diff --git a/server/src/extraction/llm/extractEntityData.ts b/server/src/extraction/llm/extractEntityData.ts index e70439a..9a9ade7 100644 --- a/server/src/extraction/llm/extractEntityData.ts +++ b/server/src/extraction/llm/extractEntityData.ts @@ -276,11 +276,11 @@ ${basePrompt} model, logApiCall: options?.logApiCalls ? { - extractionId: options.logApiCalls.extractionId, - datasetId: options.logApiCalls.datasetId, - crawlPageId: options.logApiCalls.crawlPageId, - callSite: "extractEntityData", - } + extractionId: options.logApiCalls.extractionId, + datasetId: options.logApiCalls.datasetId, + crawlPageId: options.logApiCalls.crawlPageId, + callSite: "extractEntityData", + } : undefined, }); @@ -311,11 +311,11 @@ ${basePrompt} requiredParameters: ["items"], logApiCall: options?.logApiCalls ? { - extractionId: options.logApiCalls.extractionId, - datasetId: options.logApiCalls.datasetId, - crawlPageId: options.logApiCalls.crawlPageId, - callSite: "extractEntityData", - } + extractionId: options.logApiCalls.extractionId, + datasetId: options.logApiCalls.datasetId, + crawlPageId: options.logApiCalls.crawlPageId, + callSite: "extractEntityData", + } : undefined, }; diff --git a/server/src/extraction/resumeExtraction.ts b/server/src/extraction/resumeExtraction.ts index 556f3fe..cb7c2f4 100644 --- a/server/src/extraction/resumeExtraction.ts +++ b/server/src/extraction/resumeExtraction.ts @@ -3,7 +3,6 @@ import { findLatestDataset } from "../data/datasets"; import { createExtractionAuditLog, createExtractionLog, - findApiExtractionRootPage, findExtractionById, findInProgressPagesWithoutJobs, findPagesNeedingExtractData, @@ -16,7 +15,6 @@ import { getPageIdsWithExistingJobs, Queues, REPEAT_UPDATE_COMPLETION_EVERY_MS, - submitJob, submitJobs, submitRepeatableJob, } from "../workers"; diff --git a/server/src/openai.ts b/server/src/openai.ts index f79ae86..7b60072 100644 --- a/server/src/openai.ts +++ b/server/src/openai.ts @@ -99,6 +99,7 @@ export async function simpleToolCompletion< logApiCall?: { callSite: string; extractionId: number; + datasetId?: number; crawlPageId?: number; }; }): Promise<{ @@ -195,8 +196,10 @@ export async function simpleToolCompletion< options.logApiCall.callSite, inputTokenCount, outputTokenCount, - undefined, - options.logApiCall.crawlPageId + { + datasetId: options.logApiCall.datasetId, + crawlPageId: options.logApiCall.crawlPageId, + } ); } @@ -326,8 +329,10 @@ export async function structuredCompletion< options.logApiCall.callSite, inputTokenCount, outputTokenCount, - options.logApiCall.datasetId, - options.logApiCall.crawlPageId + { + datasetId: options.logApiCall.datasetId, + crawlPageId: options.logApiCall.crawlPageId, + } ); } diff --git a/server/src/routers/extractions.ts b/server/src/routers/extractions.ts index 0942f2c..8dd1f7e 100644 --- a/server/src/routers/extractions.ts +++ b/server/src/routers/extractions.ts @@ -3,6 +3,7 @@ import { publicProcedure, router } from "."; import { CatalogueType, ExtractionStatus, + PageStatus, ProviderModel, } from "../../../common/types"; import { AppError, AppErrors } from "../appErrors"; @@ -24,7 +25,9 @@ import { findPage, findPageForJob, findPagesPaginated, + findSampledPagesForExtraction, findStep, + getApiCallSummary, getExtractionCount, getLogCount, getPageCount, @@ -130,17 +133,30 @@ export const extractionsRouter = router({ }) ) .query(async (opts) => { - let result = await findExtractionForDetailPage(opts.input.id); + const result = await findExtractionForDetailPage(opts.input.id); if (!result) { throw new AppError("Extraction not found", AppErrors.NOT_FOUND); } const datasets = await findExtractionDatasets(opts.input.id); const lastAuditLog = await findLastAuditLogEntry(opts.input.id); + + const apiSummary = await getApiCallSummary(opts.input.id); + const totalInputTokens = apiSummary.reduce( + (sum, s) => sum + Number(s.totalInputTokens ?? 0), + 0 + ); + const totalOutputTokens = apiSummary.reduce( + (sum, s) => sum + Number(s.totalOutputTokens ?? 0), + 0 + ); + return { ...result, datasets, latestDataset: datasets[0], lastAuditLog, + totalInputTokens, + totalOutputTokens, }; }), destroy: publicProcedure @@ -308,6 +324,26 @@ export const extractionsRouter = router({ .query(async (opts) => { return findLogsByCrawlPageId(opts.input.crawlPageId); }), + samplePages: publicProcedure + .input( + z.object({ + extractionId: z.number().int().positive(), + sampleSizePercent: z.number().min(0).max(100), + dataStatus: z.array(z.enum(["present", "absent"])), + statuses: z.array(z.nativeEnum(PageStatus)), + sortBy: z.enum(["random", "most_expensive", "most_data_items", "least_data_items"]), + applyKey: z.number().optional(), + }) + ) + .query(async (opts) => { + return findSampledPagesForExtraction({ + extractionId: opts.input.extractionId, + sampleSizePercent: opts.input.sampleSizePercent, + dataStatus: opts.input.dataStatus, + statuses: opts.input.statuses, + sortBy: opts.input.sortBy, + }); + }), simulateDataExtraction: publicProcedure .input( z.object({