diff --git a/scripts/markdown/governance.mjs b/scripts/markdown/governance.mjs index 305fb45..4e89d7b 100644 --- a/scripts/markdown/governance.mjs +++ b/scripts/markdown/governance.mjs @@ -1,6 +1,7 @@ import { mkdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import { fetchWithRetry } from '../utils/fetch.mjs'; +import { rewriteRelativeLinks } from './sanitize.mjs'; const { GH_TOKEN } = process.env; @@ -33,18 +34,6 @@ const LINK_REWRITE_MAP = Object.fromEntries( ]) ); -// Rewrites relative cross-references between governance docs. -// Covers both inline [text](./FILE.md) and reference-style [label]: ./FILE.md. -// Negative lookaheads prevent rewriting absolute URLs that happen to end in a known filename. -const rewriteLinks = content => - content.replace( - /(\]\(|\]:\s*)(?!https?:\/\/)(?!\/)(\.\/)?([A-Z_]+\.md)/g, - (match, prefix, _dot, filename) => - LINK_REWRITE_MAP[filename] - ? `${prefix}${LINK_REWRITE_MAP[filename]}` - : match - ); - const outputDir = join( import.meta.dirname, '..', @@ -65,7 +54,10 @@ const results = await Promise.all( return null; } - let body = rewriteLinks(await res.text()); + let body = rewriteRelativeLinks( + await res.text(), + file => LINK_REWRITE_MAP[file] ?? null + ); // Some governance docs (e.g. MEMBER_EXPECTATIONS.md) have no H1, which the // site derives the page title from — fall back to the sidebar label. 
diff --git a/scripts/markdown/readmes.mjs b/scripts/markdown/readmes.mjs index 2d516af..b374989 100644 --- a/scripts/markdown/readmes.mjs +++ b/scripts/markdown/readmes.mjs @@ -1,6 +1,7 @@ import { mkdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import { fetchWithRetry } from '../utils/fetch.mjs'; +import cleanupMarkdown from './sanitize.mjs'; const { GH_TOKEN } = process.env; @@ -33,19 +34,12 @@ const discoverRepos = async () => { return { loaders, plugins }; }; -const stripLeadingDiv = content => - content.replace(/^\s*\n*/i, ''); - -// Remove badge-only lines: [![...][ref]][ref] or [![...](url)](url) -const stripBadges = content => - content - .replace( - /^(\[!\[[^\]]*\](?:\[[^\]]*\]|\([^)]*\))\]\s*(?:\[[^\]]*\]|\([^)]*\))\s*)+$/gm, - '' - ) - .replace(/\n{3,}/g, '\n\n'); - -const processContent = content => stripBadges(stripLeadingDiv(content)); +// Strip repo chrome, then point any relative links at the source repo on GitHub. +const cleanReadme = (content, fullName) => + cleanupMarkdown( + content, + target => `https://github.com/${fullName}/blob/HEAD/${target}` + ); const repoName = fullName => fullName.split('/')[1]; @@ -65,7 +59,7 @@ const processRepos = async (repos, { label, basePath, outputDir }) => { const result = await fetchReadme(fullName); await writeFile( join(outputDir, `${name}.md`), - processContent(result), + cleanReadme(result, fullName), 'utf8' ); return name; diff --git a/scripts/markdown/sanitize.mjs b/scripts/markdown/sanitize.mjs new file mode 100644 index 0000000..407d505 --- /dev/null +++ b/scripts/markdown/sanitize.mjs @@ -0,0 +1,39 @@ +// Cleanup for Markdown fetched from other repos (READMEs, governance docs). 
// A `<div>…</div>` banner at the very top of the document, plus the whitespace
// around it. Non-greedy, so it stops at the first closing tag.
const LEADING_HTML = /^\s*<div\b[^>]*>[\s\S]*?<\/div>\s*/i;

// One or more badges filling a whole line: [![alt][ref]][ref] or [![alt](url)](url).
const BADGES =
  /^(\[!\[[^\]]*\](?:\[[^\]]*\]|\([^)]*\))\]\s*(?:\[[^\]]*\]|\([^)]*\))\s*)+$/gm;

const EXTRA_BLANK_LINES = /\n{3,}/g;

// Opening/closing marker of a fenced code block (``` or ~~~, up to 3 spaces of indent).
const FENCE = /^ {0,3}(`{3,}|~{3,})/;

// ATX heading whose text starts with "Contributing" or "License". The hashes
// must be followed by a space or tab, so "#license-plugin" is not a heading.
const BOILERPLATE_HEADING = /^#{1,6}[ \t]+(?:Contributing|License)\b/i;

// Inline `](target` or reference-style `]: target` whose target has no URL
// scheme (any letter case), is not root-relative and is not a bare anchor.
// A single leading ./ or ../ is dropped; the capture stops at `)`, whitespace
// or `#`, so fragments and link titles are left in place.
const RELATIVE_LINK =
  /(\]\(|\]:\s*)(?![a-z][\w+.-]*:)(?!\/)(?!#)(?:\.{1,2}\/)?([^)\s#]+)/gi;

/**
 * Drop the leading `<div>` logo banner.
 *
 * @param {string} content - Markdown source.
 * @returns {string} Content without the banner; unchanged when it does not
 *   start with a `<div>` block.
 */
export const stripLeadingHtml = content => content.replace(LEADING_HTML, '');

/**
 * Drop badge-only lines, then collapse runs of three or more newlines to two.
 *
 * @param {string} content - Markdown source.
 * @returns {string} Content without badge lines.
 */
export const stripBadges = content =>
  content.replace(BADGES, '').replace(EXTRA_BLANK_LINES, '\n\n');

// Offset of the first Contributing/License heading outside a fenced code
// block, or -1 when there is none.
const findBoilerplateStart = content => {
  let offset = 0;
  let openFence = null;

  for (const line of content.split('\n')) {
    const marker = FENCE.exec(line);

    if (openFence === null) {
      if (marker) {
        openFence = marker[1];
      } else if (BOILERPLATE_HEADING.test(line)) {
        return offset;
      }
    } else if (
      marker &&
      marker[1][0] === openFence[0] &&
      marker[1].length >= openFence.length &&
      // A closing fence carries no info string.
      line.slice(marker[0].length).trim() === ''
    ) {
      openFence = null;
    }

    offset += line.length + 1; // + 1 for the '\n' consumed by split()
  }

  return -1;
};

/**
 * Cut the document at the first Contributing/License heading, removing that
 * section and everything after it. Lines inside fenced code blocks are
 * ignored, so a `# license ...` shell comment does not truncate the document.
 *
 * @param {string} content - Markdown source.
 * @returns {string} The part before the heading, right-trimmed and terminated
 *   by a single newline; the input unchanged when no such heading exists.
 */
export const stripBoilerplate = content => {
  const start = findBoilerplateStart(content);
  return start === -1 ? content : `${content.slice(0, start).trimEnd()}\n`;
};

/**
 * Rewrite relative links via `resolve()`; skips full URLs, root-relative
 * links and anchors.
 *
 * @param {string} content - Markdown source.
 * @param {(target: string) => (string | URL | null | undefined)} resolve -
 *   Maps a relative target (leading ./ or ../ already removed) to its
 *   replacement. Returning an empty string, a nullish value, or anything that
 *   is not a string or URL leaves the link untouched.
 * @returns {string} Content with the resolved links substituted.
 */
export const rewriteRelativeLinks = (content, resolve) =>
  content.replace(RELATIVE_LINK, (match, prefix, target) => {
    const resolved = resolve(target);
    const url = resolved instanceof URL ? resolved.href : resolved;
    // Ignore non-string results (e.g. an inherited Object.prototype member
    // returned by a plain-object lookup) rather than stringifying them into
    // the document.
    return typeof url === 'string' && url !== '' ? `${prefix}${url}` : match;
  });

const STEPS = [stripLeadingHtml, stripBadges, stripBoilerplate];

/**
 * Run the full cleanup pipeline: leading banner, badges, boilerplate, and —
 * when `resolve` is given — relative-link rewriting.
 *
 * @param {string} content - Markdown source.
 * @param {(target: string) => (string | URL | null | undefined)} [resolve] -
 *   Optional link resolver, passed to `rewriteRelativeLinks`.
 * @returns {string} The cleaned Markdown.
 */
const cleanupMarkdown = (content, resolve) => {
  const result = STEPS.reduce((acc, step) => step(acc), content);
  return resolve ? rewriteRelativeLinks(result, resolve) : result;
};

export default cleanupMarkdown;