From c5bc6b5f59021b20223e8969196281cef0e09951 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Fri, 3 Apr 2026 11:13:55 -0400 Subject: [PATCH 1/3] Exclude www canonicalization from cross-host redirect checking --- package.json | 4 ++- .../llms-txt-exists.ts | 20 +++++++---- src/checks/url-stability/redirect-behavior.ts | 9 +++-- src/cli/commands/check.ts | 8 +++-- src/helpers/get-page-urls.ts | 5 ++- src/helpers/to-md-urls.ts | 13 ++++++- test/unit/helpers/to-md-urls.test.ts | 35 ++++++++++++++++++- 7 files changed, 77 insertions(+), 17 deletions(-) diff --git a/package.json b/package.json index 57ecc65..0915002 100644 --- a/package.json +++ b/package.json @@ -50,7 +50,9 @@ "documentation", "testing", "llms-txt", - "agent-friendly" + "agent-friendly", + "agent score", + "agent friendly docs" ], "files": [ "dist/", diff --git a/src/checks/content-discoverability/llms-txt-exists.ts b/src/checks/content-discoverability/llms-txt-exists.ts index 6311fd9..c6b4104 100644 --- a/src/checks/content-discoverability/llms-txt-exists.ts +++ b/src/checks/content-discoverability/llms-txt-exists.ts @@ -146,17 +146,25 @@ async function checkLlmsTxtExists(ctx: CheckContext): Promise { details.redirectedOrigins = redirectedOrigins; } - // Set effectiveOrigin for downstream checks when content lives at a different host. - // Derive from redirect URLs on discovered files, or from the fallback redirectedOrigins. + // Set effectiveOrigin for downstream checks when content lives at a different origin. + // This covers both true cross-host redirects (e.g. example.com → docs.other.com) + // and www canonicalization (e.g. mongodb.com → www.mongodb.com). Downstream checks + // need to know the actual origin so sitemap scoping and link classification work. if (!ctx.effectiveOrigin) { - const crossHostFile = discovered.find((f) => f.crossHostRedirect && f.redirectUrl); - if (crossHostFile?.redirectUrl) { + // First try: a discovered file that redirected to a different origin + const redirectedFile = discovered.find((f) => f.redirectUrl); + if (redirectedFile?.redirectUrl) { try { - ctx.effectiveOrigin = new URL(crossHostFile.redirectUrl).origin; + const redirectedOrigin = new URL(redirectedFile.redirectUrl).origin; + if (redirectedOrigin !== ctx.origin) { + ctx.effectiveOrigin = redirectedOrigin; + } } catch { /* ignore malformed */ } - } else if (redirectedOrigins.length > 0) { + } + // Second try: origins discovered from cross-host redirect fallback probing + if (!ctx.effectiveOrigin && redirectedOrigins.length > 0) { ctx.effectiveOrigin = redirectedOrigins[0]; } } diff --git a/src/checks/url-stability/redirect-behavior.ts b/src/checks/url-stability/redirect-behavior.ts index 9b46e8f..f45916e 100644 --- a/src/checks/url-stability/redirect-behavior.ts +++ b/src/checks/url-stability/redirect-behavior.ts @@ -1,5 +1,6 @@ import { registerCheck } from '../registry.js'; import { discoverAndSamplePages } from '../../helpers/get-page-urls.js'; +import { isCrossHostRedirect } from '../../helpers/to-md-urls.js'; import type { CheckContext, CheckResult } from '../../types.js'; interface RedirectResult { @@ -56,13 +57,11 @@ async function check(ctx: CheckContext): Promise { } const resolvedTarget = new URL(location, url).toString(); - const sourceOrigin = new URL(url).origin; - const targetOrigin = new URL(resolvedTarget).origin; - if (sourceOrigin === targetOrigin) { - return { url, status, classification: 'same-host', redirectTarget: resolvedTarget }; + if (isCrossHostRedirect(url, resolvedTarget)) { + return { url, status, classification: 'cross-host', redirectTarget: resolvedTarget }; } - return { url, status, classification: 'cross-host', redirectTarget: resolvedTarget }; + return { url, status, classification: 'same-host', redirectTarget: resolvedTarget }; } catch (err) { return { url, diff --git a/src/cli/commands/check.ts b/src/cli/commands/check.ts index 3d11ec3..70439b9 100644 --- a/src/cli/commands/check.ts +++ b/src/cli/commands/check.ts @@ -53,8 +53,12 @@ export function registerCheckCommand(program: Command): void { } if (format !== 'json') { - const domain = new URL(url).hostname; - process.stderr.write(`Running checks on ${domain}...\n`); + const parsed = new URL(url); + const target = + parsed.pathname && parsed.pathname !== '/' + ? `${parsed.hostname}${parsed.pathname}` + : parsed.hostname; + process.stderr.write(`Running checks on ${target}...\n`); } const report = await runChecks(url, { diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index 13651df..f19081d 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -124,7 +124,10 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise { it('returns URL as-is when it already ends in .md', () => { @@ -64,3 +64,36 @@ describe('toMdUrls', () => { expect(toMdUrls('https://example.com/sitemap.xml')).toEqual([]); }); }); + +describe('isCrossHostRedirect', () => { + it('returns false for same host', () => { + expect(isCrossHostRedirect('https://example.com/a', 'https://example.com/b')).toBe(false); + }); + + it('returns false for www to bare domain', () => { + expect(isCrossHostRedirect('https://www.example.com/a', 'https://example.com/a')).toBe(false); + }); + + it('returns false for bare domain to www', () => { + expect( + isCrossHostRedirect( + 'https://mongodb.com/docs/llms.txt', + 'https://www.mongodb.com/docs/llms.txt', + ), + ).toBe(false); + }); + + it('returns true for genuinely different hosts', () => { + expect(isCrossHostRedirect('https://example.com/a', 'https://other.com/a')).toBe(true); + }); + + it('returns true for different subdomains (not www)', () => { + expect(isCrossHostRedirect('https://docs.example.com/a', 'https://api.example.com/a')).toBe( + true, + ); + }); + + it('returns false for malformed URLs', () => { + expect(isCrossHostRedirect('not-a-url', 'https://example.com')).toBe(false); + }); +}); From 9f9f90c20ea328ac190906cb39911c32a1e87884 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Fri, 3 Apr 2026 11:21:50 -0400 Subject: [PATCH 2/3] Fix formatter incorrectly splitting headings on file extensions --- src/cli/formatters/scorecard.ts | 8 +++++-- test/unit/cli/scorecard-formatter.test.ts | 27 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/cli/formatters/scorecard.ts b/src/cli/formatters/scorecard.ts index 3f8e676..627e31f 100644 --- a/src/cli/formatters/scorecard.ts +++ b/src/cli/formatters/scorecard.ts @@ -49,9 +49,13 @@ function formatCategoryLine(name: string, score: number, grade: string): string function formatDiagnostic(diag: Diagnostic): string[] { const icon = SEVERITY_ICONS[diag.severity] ?? '[?]'; const lines: string[] = []; - lines.push(` ${icon} ${chalk.bold(diag.message.split('.')[0])}`); + // Extract first sentence for heading. Split on ". " (period + space) rather than + // bare "." to avoid breaking on file extensions like .md or llms.txt. + const firstSentenceEnd = diag.message.indexOf('. '); + const heading = firstSentenceEnd !== -1 ? diag.message.slice(0, firstSentenceEnd) : diag.message; + lines.push(` ${icon} ${chalk.bold(heading)}`); - // Wrap message (skip the first sentence already used as heading) + // Full message as detail text const fullMsg = diag.message; lines.push(` ${chalk.dim(fullMsg)}`); lines.push(''); diff --git a/test/unit/cli/scorecard-formatter.test.ts b/test/unit/cli/scorecard-formatter.test.ts index c7bd144..542fc50 100644 --- a/test/unit/cli/scorecard-formatter.test.ts +++ b/test/unit/cli/scorecard-formatter.test.ts @@ -306,6 +306,33 @@ describe('formatScorecard', () => { expect(output).toContain('Weird status'); }); + it('does not split diagnostic heading on periods in file extensions', () => { + const score = makeScoreResult({ + diagnostics: [ + { + id: 'markdown-undiscoverable', + severity: 'warning', + message: + 'Your site serves markdown at .md URLs, but agents have no way to discover this. Without content negotiation, an llms.txt directive on your pages, most agents will default to the HTML path.', + resolution: 'Add a blockquote directive.', + }, + { + id: 'llms-txt-oversized', + severity: 'warning', + message: + 'Your llms.txt is 4,561,591 characters. Agents see roughly the first 100,000 characters.', + resolution: 'Split into section-level files.', + }, + ], + }); + const output = formatScorecard(makeReport(), score); + // The heading should include the full first sentence, not split on ".md" or "llms.txt" + expect(output).toContain( + 'Your site serves markdown at .md URLs, but agents have no way to discover this', + ); + expect(output).toContain('Your llms.txt is 4,561,591 characters'); + }); + it('handles diagnostic with unknown severity gracefully', () => { const score = makeScoreResult({ diagnostics: [ From e23137f3d664bd976edb9ebbfe11cd2c794a3708 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Fri, 3 Apr 2026 11:27:53 -0400 Subject: [PATCH 3/3] Display text report datetime in local datetime, preserve UTC for JSON --- src/cli/formatters/scorecard.ts | 8 +++++++- src/cli/formatters/text.ts | 6 +++++- test/unit/cli/formatters.test.ts | 10 +++++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/cli/formatters/scorecard.ts b/src/cli/formatters/scorecard.ts index 627e31f..98d43f7 100644 --- a/src/cli/formatters/scorecard.ts +++ b/src/cli/formatters/scorecard.ts @@ -39,6 +39,12 @@ function gradeColor(grade: string): (s: string) => string { return GRADE_COLORS[grade] ?? ((s: string) => s); } +function formatLocalTime(iso: string): string { + const d = new Date(iso); + if (isNaN(d.getTime())) return iso; + return d.toLocaleString(); +} + function formatCategoryLine(name: string, score: number, grade: string): string { const paddedName = name.padEnd(36); const scoreStr = `${score} / 100`; @@ -73,7 +79,7 @@ export function formatScorecard(report: ReportResult, scoreResult?: ScoreResult) lines.push(chalk.bold('Agent-Friendly Docs Scorecard')); lines.push(chalk.bold('==============================')); lines.push(''); - lines.push(chalk.gray(`${report.url} · ${report.timestamp}`)); + lines.push(chalk.gray(`${report.url} · ${formatLocalTime(report.timestamp)}`)); lines.push(''); // Overall score diff --git a/src/cli/formatters/text.ts b/src/cli/formatters/text.ts index d2aab21..f63bf49 100644 --- a/src/cli/formatters/text.ts +++ b/src/cli/formatters/text.ts @@ -186,7 +186,11 @@ export function formatText(report: ReportResult, options?: FormatTextOptions): s lines.push(''); lines.push(chalk.bold(`Agent-Friendly Docs Check: ${report.url}`)); - lines.push(chalk.gray(`Timestamp: ${report.timestamp}`)); + const localTime = (() => { + const d = new Date(report.timestamp); + return isNaN(d.getTime()) ? report.timestamp : d.toLocaleString(); + })(); + lines.push(chalk.gray(`Timestamp: ${localTime}`)); lines.push(''); // Group by category diff --git a/test/unit/cli/formatters.test.ts b/test/unit/cli/formatters.test.ts index 32bae37..6a7de0f 100644 --- a/test/unit/cli/formatters.test.ts +++ b/test/unit/cli/formatters.test.ts @@ -43,7 +43,9 @@ describe('formatText', () => { it('includes the URL and timestamp', () => { const output = formatText(makeReport()); expect(output).toContain('http://example.com'); - expect(output).toContain('2026-01-01T00:00:00.000Z'); + // Timestamp is displayed in local time, not raw ISO + const expected = new Date('2026-01-01T00:00:00.000Z').toLocaleString(); + expect(output).toContain(expected); }); it('groups results by category', () => { @@ -599,6 +601,12 @@ describe('formatJson', () => { expect(parsed.summary.total).toBe(5); }); + it('preserves raw ISO timestamp', () => { + const output = formatJson(makeReport()); + const parsed = JSON.parse(output); + expect(parsed.timestamp).toBe('2026-01-01T00:00:00.000Z'); + }); + it('is pretty-printed', () => { const output = formatJson(makeReport()); expect(output).toContain('\n');