Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions client/src/components/app/extractions/detail.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import {
RecipeDetectionStatus,
concisePrintDate,
prettyPrintDate,
resolveCrawlPageUrl,
trpc,
} from "@/utils";
import { CookingPot, LibraryBig, List } from "lucide-react";
Expand Down Expand Up @@ -711,7 +712,10 @@ export default function ExtractionDetail() {
</TableCell>
<TableCell className="break-all align-top">
<a
href={p.url}
href={resolveCrawlPageUrl(
p.url,
extraction.recipe.url
)}
target="_blank"
rel="noreferrer"
className="underline"
Expand Down Expand Up @@ -784,7 +788,10 @@ export default function ExtractionDetail() {
</TableCell>
<TableCell className="break-all align-top">
<a
href={p.url}
href={resolveCrawlPageUrl(
p.url,
extraction.recipe.url
)}
target="_blank"
rel="noreferrer"
className="underline"
Expand Down
11 changes: 9 additions & 2 deletions client/src/components/app/extractions/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import {
TableRow,
} from "@/components/ui/table";
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
import { concisePrintDate, prettyPrintDate, trpc } from "@/utils";
import { concisePrintDate, prettyPrintDate, resolveCrawlPageUrl, trpc } from "@/utils";
import { ExternalLink } from "lucide-react";
import { useState } from "react";
import { useParams } from "wouter";
Expand Down Expand Up @@ -244,7 +244,14 @@ export default function CrawlPageDetail() {
<Button
variant="outline"
size="sm"
onClick={() => window.open(item.crawlPage.url, "_blank", "noopener,noreferrer")}
onClick={() => {
const url = item.crawlPage.url;
const baseUrl = item.crawlPage.extraction?.recipe?.url;
const resolved = baseUrl
? resolveCrawlPageUrl(url, baseUrl)
: url;
window.open(resolved, "_blank", "noopener,noreferrer");
}}
>
<ExternalLink className="w-4 h-4 mr-2" />
Open Page URL
Expand Down
11 changes: 9 additions & 2 deletions client/src/components/app/extractions/step.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import {
TableHeader,
TableRow,
} from "@/components/ui/table";
import { concisePrintDate, trpc } from "@/utils";
import { concisePrintDate, resolveCrawlPageUrl, trpc } from "@/utils";
import { Link, useParams } from "wouter";
import { PageType } from "../../../../../common/types";
import usePagination from "../usePagination";
Expand Down Expand Up @@ -85,7 +85,14 @@ export default function CrawlStepDetail() {
</TableCell>
<TableCell>{concisePrintDate(item.createdAt)}</TableCell>
<TableCell className="max-w-40 overflow-hidden whitespace-nowrap text-ellipsis text-blue-800 underline">
<a href={item.url} target="_blank">
<a
href={resolveCrawlPageUrl(
item.url,
extractionQuery.data.recipe.url
)}
target="_blank"
rel="noreferrer"
>
{item.url}
</a>
</TableCell>
Expand Down
13 changes: 13 additions & 0 deletions client/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,19 @@ export function formatCatalogueType(catalogueType: string): string {
return typeMap[catalogueType] || catalogueType;
}

/**
 * Turns a crawl page URL into an absolute URL string.
 *
 * The value is parsed with the WHATWG `URL` constructor using `baseUrl` as the
 * base, so a relative path such as `/courses/math` ends up pointing at the
 * crawled site instead of being interpreted relative to wherever the link is
 * rendered. An already-absolute `url` ignores the base.
 *
 * @param url - The stored crawl page URL; may be absolute or relative.
 * @param baseUrl - The URL that relative values are resolved against.
 * @returns The resolved absolute `href`, or `url` unchanged when the
 *   constructor rejects the input (for example, an unparseable base).
 */
export function resolveCrawlPageUrl(url: string, baseUrl: string): string {
  // Start from the raw value so a parse failure falls through untouched.
  let resolved = url;
  try {
    resolved = new URL(url, baseUrl).href;
  } catch {
    // Best-effort: an invalid url/base pair leaves the raw value in place.
  }
  return resolved;
}

export type IterableElement<TargetIterable> =
TargetIterable extends Iterable<infer ElementType>
? ElementType
Expand Down
5 changes: 5 additions & 0 deletions server/src/data/extractions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,11 @@ export async function findPage(crawlPageId: number) {
where: (crawlPages, { eq }) => eq(crawlPages.id, crawlPageId),
with: {
crawlStep: true,
extraction: {
with: {
recipe: true,
},
},
},
});
return result;
Expand Down
2 changes: 1 addition & 1 deletion server/src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { BrowserFetchError, BrowserTaskResult } from "./extraction/browser";
import { BrowserFetchError } from "./extraction/browser";
import getLogger from "./logging";

const logger = getLogger("utils");
Expand Down
3 changes: 3 additions & 0 deletions server/src/workers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,9 @@ export async function detectExtractionJobs(extractionId: number) {
break;
}
for (const job of jobs) {
if (!job || !job.data)
continue;

if (job.data.extractionId === extractionId) {
return true;
}
Expand Down