From 2ee4f7d89c919c722ad406c3d47bc854d719ec9e Mon Sep 17 00:00:00 2001 From: Nikhil Kumar Rajak Date: Tue, 16 Jun 2026 16:27:01 +0000 Subject: [PATCH 1/2] feat: sanitized fetched docs --- scripts/markdown/governance.mjs | 18 +++++------------- scripts/markdown/readmes.mjs | 27 +++++++++++++-------------- scripts/utils/sanitize.mjs | 30 ++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 27 deletions(-) create mode 100644 scripts/utils/sanitize.mjs diff --git a/scripts/markdown/governance.mjs b/scripts/markdown/governance.mjs index 305fb45..f4668e9 100644 --- a/scripts/markdown/governance.mjs +++ b/scripts/markdown/governance.mjs @@ -1,6 +1,7 @@ import { mkdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import { fetchWithRetry } from '../utils/fetch.mjs'; +import { rewriteRelativeLinks } from '../utils/sanitize.mjs'; const { GH_TOKEN } = process.env; @@ -33,18 +34,6 @@ const LINK_REWRITE_MAP = Object.fromEntries( ]) ); -// Rewrites relative cross-references between governance docs. -// Covers both inline [text](./FILE.md) and reference-style [label]: ./FILE.md. -// Negative lookaheads prevent rewriting absolute URLs that happen to end in a known filename. -const rewriteLinks = content => - content.replace( - /(\]\(|\]:\s*)(?!https?:\/\/)(?!\/)(\.\/)?([A-Z_]+\.md)/g, - (match, prefix, _dot, filename) => - LINK_REWRITE_MAP[filename] - ? `${prefix}${LINK_REWRITE_MAP[filename]}` - : match - ); - const outputDir = join( import.meta.dirname, '..', @@ -65,7 +54,10 @@ const results = await Promise.all( return null; } - let body = rewriteLinks(await res.text()); + let body = rewriteRelativeLinks( + await res.text(), + file => LINK_REWRITE_MAP[file] ?? null + ); // Some governance docs (e.g. MEMBER_EXPECTATIONS.md) have no H1, which the // site derives the page title from — fall back to the sidebar label. diff --git a/scripts/markdown/readmes.mjs b/scripts/markdown/readmes.mjs index 2d516af..923393a 100644 --- a/scripts/markdown/readmes.mjs +++ b/scripts/markdown/readmes.mjs @@ -1,6 +1,12 @@ import { mkdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import { fetchWithRetry } from '../utils/fetch.mjs'; +import { + stripLeadingHtml, + stripBadges, + stripBoilerplate, + rewriteRelativeLinks, +} from '../utils/sanitize.mjs'; const { GH_TOKEN } = process.env; @@ -33,19 +39,12 @@ const discoverRepos = async () => { return { loaders, plugins }; }; -const stripLeadingDiv = content => - content.replace(/^\s*\n*/i, ''); - -// Remove badge-only lines: [![...][ref]][ref] or [![...](url)](url) -const stripBadges = content => - content - .replace( - /^(\[!\[[^\]]*\](?:\[[^\]]*\]|\([^)]*\))\]\s*(?:\[[^\]]*\]|\([^)]*\))\s*)+$/gm, - '' - ) - .replace(/\n{3,}/g, '\n\n'); - -const processContent = content => stripBadges(stripLeadingDiv(content)); +// Strip repo chrome, then point any relative links at the source repo on GitHub. +const cleanReadme = (content, fullName) => + rewriteRelativeLinks( + stripBoilerplate(stripBadges(stripLeadingHtml(content))), + target => `https://github.com/${fullName}/blob/HEAD/${target}` + ); const repoName = fullName => fullName.split('/')[1]; @@ -65,7 +64,7 @@ const processRepos = async (repos, { label, basePath, outputDir }) => { const result = await fetchReadme(fullName); await writeFile( join(outputDir, `${name}.md`), - processContent(result), + cleanReadme(result, fullName), 'utf8' ); return name; diff --git a/scripts/utils/sanitize.mjs b/scripts/utils/sanitize.mjs new file mode 100644 index 0000000..c25d3dd --- /dev/null +++ b/scripts/utils/sanitize.mjs @@ -0,0 +1,30 @@ +// Cleanup for Markdown fetched from other repos (READMEs, governance docs). + +// Drop the leading
logo banner. +export const stripLeadingHtml = content => + content.replace(/^\s*\n*/i, ''); + +// Drop badge-only lines. +export const stripBadges = content => + content + .replace( + /^(\[!\[[^\]]*\](?:\[[^\]]*\]|\([^)]*\))\]\s*(?:\[[^\]]*\]|\([^)]*\))\s*)+$/gm, + '' + ) + .replace(/\n{3,}/g, '\n\n'); + +// Cut the trailing Contributing/License sections (and anything after) off the end. +export const stripBoilerplate = content => { + const match = content.match(/^#{1,6}\s*(?:Contributing|License)\b.*$/im); + return match ? `${content.slice(0, match.index).trimEnd()}\n` : content; +}; + +// Rewrite relative links via resolve(); skips full URLs, root-relative and anchors. +export const rewriteRelativeLinks = (content, resolve) => + content.replace( + /(\]\(|\]:\s*)(?![a-z][\w+.-]*:)(?!\/)(?!#)(?:\.{1,2}\/)?([^)\s#]+)/g, + (match, prefix, target) => { + const url = resolve(target); + return url ? `${prefix}${url}` : match; + } + ); From ec52998a6f5258da33bfd03d15d2121ab29f7fb3 Mon Sep 17 00:00:00 2001 From: Nikhil Kumar Rajak Date: Tue, 16 Jun 2026 17:24:32 +0000 Subject: [PATCH 2/2] fixup-nitpicks --- scripts/markdown/governance.mjs | 2 +- scripts/markdown/readmes.mjs | 11 +++------- scripts/markdown/sanitize.mjs | 39 +++++++++++++++++++++++++++++++++ scripts/utils/sanitize.mjs | 30 ------------------------- 4 files changed, 43 insertions(+), 39 deletions(-) create mode 100644 scripts/markdown/sanitize.mjs delete mode 100644 scripts/utils/sanitize.mjs diff --git a/scripts/markdown/governance.mjs b/scripts/markdown/governance.mjs index f4668e9..4e89d7b 100644 --- a/scripts/markdown/governance.mjs +++ b/scripts/markdown/governance.mjs @@ -1,7 +1,7 @@ import { mkdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import { fetchWithRetry } from '../utils/fetch.mjs'; -import { rewriteRelativeLinks } from '../utils/sanitize.mjs'; +import { rewriteRelativeLinks } from './sanitize.mjs'; const { GH_TOKEN } = process.env; diff --git a/scripts/markdown/readmes.mjs b/scripts/markdown/readmes.mjs index 923393a..b374989 100644 --- a/scripts/markdown/readmes.mjs +++ b/scripts/markdown/readmes.mjs @@ -1,12 +1,7 @@ import { mkdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import { fetchWithRetry } from '../utils/fetch.mjs'; -import { - stripLeadingHtml, - stripBadges, - stripBoilerplate, - rewriteRelativeLinks, -} from '../utils/sanitize.mjs'; +import cleanupMarkdown from './sanitize.mjs'; const { GH_TOKEN } = process.env; @@ -41,8 +36,8 @@ const discoverRepos = async () => { // Strip repo chrome, then point any relative links at the source repo on GitHub. const cleanReadme = (content, fullName) => - rewriteRelativeLinks( - stripBoilerplate(stripBadges(stripLeadingHtml(content))), + cleanupMarkdown( + content, target => `https://github.com/${fullName}/blob/HEAD/${target}` ); diff --git a/scripts/markdown/sanitize.mjs b/scripts/markdown/sanitize.mjs new file mode 100644 index 0000000..407d505 --- /dev/null +++ b/scripts/markdown/sanitize.mjs @@ -0,0 +1,39 @@ +// Cleanup for Markdown fetched from other repos (READMEs, governance docs). + +const LEADING_HTML = /^\s*\n*/i; +const BADGES = + /^(\[!\[[^\]]*\](?:\[[^\]]*\]|\([^)]*\))\]\s*(?:\[[^\]]*\]|\([^)]*\))\s*)+$/gm; +const EXTRA_BLANK_LINES = /\n{3,}/g; +const BOILERPLATE = /^#{1,6}\s*(?:Contributing|License)\b.*$/im; +const RELATIVE_LINK = + /(\]\(|\]:\s*)(?![a-z][\w+.-]*:)(?!\/)(?!#)(?:\.{1,2}\/)?([^)\s#]+)/g; + +// Drop the leading
logo banner. +export const stripLeadingHtml = content => content.replace(LEADING_HTML, ''); + +// Drop badge-only lines. +export const stripBadges = content => + content.replace(BADGES, '').replace(EXTRA_BLANK_LINES, '\n\n'); + +// Cut the trailing Contributing/License sections (and anything after) off the end. +export const stripBoilerplate = content => { + const match = content.match(BOILERPLATE); + return match ? `${content.slice(0, match.index).trimEnd()}\n` : content; +}; + +// Rewrite relative links via resolve(); skips full URLs, root-relative and anchors. +export const rewriteRelativeLinks = (content, resolve) => + content.replace(RELATIVE_LINK, (match, prefix, target) => { + const url = resolve(target); + return url ? `${prefix}${url}` : match; + }); + +const STEPS = [stripLeadingHtml, stripBadges, stripBoilerplate]; + +// Run the full cleanup pipeline. +const cleanupMarkdown = (content, resolve) => { + const result = STEPS.reduce((acc, step) => step(acc), content); + return resolve ? rewriteRelativeLinks(result, resolve) : result; +}; + +export default cleanupMarkdown; diff --git a/scripts/utils/sanitize.mjs b/scripts/utils/sanitize.mjs deleted file mode 100644 index c25d3dd..0000000 --- a/scripts/utils/sanitize.mjs +++ /dev/null @@ -1,30 +0,0 @@ -// Cleanup for Markdown fetched from other repos (READMEs, governance docs). - -// Drop the leading
logo banner. -export const stripLeadingHtml = content => - content.replace(/^\s*\n*/i, ''); - -// Drop badge-only lines. -export const stripBadges = content => - content - .replace( - /^(\[!\[[^\]]*\](?:\[[^\]]*\]|\([^)]*\))\]\s*(?:\[[^\]]*\]|\([^)]*\))\s*)+$/gm, - '' - ) - .replace(/\n{3,}/g, '\n\n'); - -// Cut the trailing Contributing/License sections (and anything after) off the end. -export const stripBoilerplate = content => { - const match = content.match(/^#{1,6}\s*(?:Contributing|License)\b.*$/im); - return match ? `${content.slice(0, match.index).trimEnd()}\n` : content; -}; - -// Rewrite relative links via resolve(); skips full URLs, root-relative and anchors. -export const rewriteRelativeLinks = (content, resolve) => - content.replace( - /(\]\(|\]:\s*)(?![a-z][\w+.-]*:)(?!\/)(?!#)(?:\.{1,2}\/)?([^)\s#]+)/g, - (match, prefix, target) => { - const url = resolve(target); - return url ? `${prefix}${url}` : match; - } - );