diff --git a/client/src/components/app/extractions/detail.tsx b/client/src/components/app/extractions/detail.tsx index 8eb1060..eef4c9b 100644 --- a/client/src/components/app/extractions/detail.tsx +++ b/client/src/components/app/extractions/detail.tsx @@ -35,6 +35,7 @@ import { RecipeDetectionStatus, concisePrintDate, prettyPrintDate, + resolveCrawlPageUrl, trpc, } from "@/utils"; import { CookingPot, LibraryBig, List } from "lucide-react"; @@ -711,7 +712,10 @@ export default function ExtractionDetail() { window.open(item.crawlPage.url, "_blank", "noopener,noreferrer")} + onClick={() => { + const url = item.crawlPage.url; + const baseUrl = item.crawlPage.extraction?.recipe?.url; + const resolved = baseUrl + ? resolveCrawlPageUrl(url, baseUrl) + : url; + window.open(resolved, "_blank", "noopener,noreferrer"); + }} > Open Page URL diff --git a/client/src/components/app/extractions/step.tsx b/client/src/components/app/extractions/step.tsx index 8cddcb4..72bbf89 100644 --- a/client/src/components/app/extractions/step.tsx +++ b/client/src/components/app/extractions/step.tsx @@ -10,7 +10,7 @@ import { TableHeader, TableRow, } from "@/components/ui/table"; -import { concisePrintDate, trpc } from "@/utils"; +import { concisePrintDate, resolveCrawlPageUrl, trpc } from "@/utils"; import { Link, useParams } from "wouter"; import { PageType } from "../../../../../common/types"; import usePagination from "../usePagination"; @@ -85,7 +85,14 @@ export default function CrawlStepDetail() { {concisePrintDate(item.createdAt)} - + {item.url} diff --git a/client/src/utils.ts b/client/src/utils.ts index 3853158..39fe3f2 100644 --- a/client/src/utils.ts +++ b/client/src/utils.ts @@ -108,6 +108,19 @@ export function formatCatalogueType(catalogueType: string): string { return typeMap[catalogueType] || catalogueType; } +/** + * Resolves a crawl page URL to an absolute URL by constructing new URL(url, baseUrl) and returning its href. Relative URLs (e.g. 
/courses/math) + * are resolved against the catalogue base URL so they open on the extracted + * website rather than the app origin. If the URL constructor throws, url is returned unchanged. + */ +export function resolveCrawlPageUrl(url: string, baseUrl: string): string { + try { + return new URL(url, baseUrl).href; + } catch { + return url; + } +} + export type IterableElement = TargetIterable extends Iterable ? ElementType diff --git a/server/src/data/extractions.ts b/server/src/data/extractions.ts index a3912f2..30b39b7 100644 --- a/server/src/data/extractions.ts +++ b/server/src/data/extractions.ts @@ -608,6 +608,11 @@ export async function findPage(crawlPageId: number) { where: (crawlPages, { eq }) => eq(crawlPages.id, crawlPageId), with: { crawlStep: true, + extraction: { + with: { + recipe: true, + }, + }, }, }); return result; diff --git a/server/src/utils.ts b/server/src/utils.ts index 49cc045..9d51f49 100644 --- a/server/src/utils.ts +++ b/server/src/utils.ts @@ -1,4 +1,4 @@ -import { BrowserFetchError, BrowserTaskResult } from "./extraction/browser"; +import { BrowserFetchError } from "./extraction/browser"; import getLogger from "./logging"; const logger = getLogger("utils"); diff --git a/server/src/workers/index.ts b/server/src/workers/index.ts index 01b75bd..c2f4d00 100644 --- a/server/src/workers/index.ts +++ b/server/src/workers/index.ts @@ -204,6 +204,9 @@ export async function detectExtractionJobs(extractionId: number) { break; } for (const job of jobs) { + if (!job || !job.data) + continue; + if (job.data.extractionId === extractionId) { return true; }