diff --git a/src/clis/youtube/transcript.ts b/src/clis/youtube/transcript.ts index 441f9081..107b3ec0 100644 --- a/src/clis/youtube/transcript.ts +++ b/src/clis/youtube/transcript.ts @@ -10,7 +10,7 @@ * --mode raw: every caption segment as-is with precise timestamps */ import { cli, Strategy } from '../../registry.js'; -import { parseVideoId } from './utils.js'; +import { parseVideoId, prepareYoutubeApiPage } from './utils.js'; import { groupTranscriptSegments, formatGroupedTranscript, @@ -34,9 +34,7 @@ cli({ // so we let the renderer auto-detect columns from the data keys. func: async (page, kwargs) => { const videoId = parseVideoId(kwargs.url); - const videoUrl = `https://www.youtube.com/watch?v=${videoId}`; - await page.goto(videoUrl); - await page.wait(3); + await prepareYoutubeApiPage(page); const lang = kwargs.lang || ''; const mode = kwargs.mode || 'grouped'; diff --git a/src/clis/youtube/utils.test.ts b/src/clis/youtube/utils.test.ts new file mode 100644 index 00000000..4189e083 --- /dev/null +++ b/src/clis/youtube/utils.test.ts @@ -0,0 +1,43 @@ +import { describe, expect, it, vi } from 'vitest'; +import { extractJsonAssignmentFromHtml, prepareYoutubeApiPage } from './utils.js'; + +describe('youtube utils', () => { + it('extractJsonAssignmentFromHtml parses bootstrap objects with nested braces in strings', () => { + const html = ` + + `; + + expect(extractJsonAssignmentFromHtml(html, 'ytInitialPlayerResponse')).toEqual({ + title: 'brace { inside } string', + nested: { count: 2, text: 'quote "value"' }, + }); + }); + + it('extractJsonAssignmentFromHtml supports window assignments', () => { + const html = ` + + `; + + expect(extractJsonAssignmentFromHtml(html, 'ytInitialData')).toEqual({ + contents: { items: [1, 2, 3] }, + }); + }); + + it('prepareYoutubeApiPage loads the quiet API bootstrap page', async () => { + const page = { + goto: vi.fn().mockResolvedValue(undefined), + wait: vi.fn().mockResolvedValue(undefined), + }; + + await expect(prepareYoutubeApiPage(page as any)).resolves.toBeUndefined(); + expect(page.goto).toHaveBeenCalledWith('https://www.youtube.com', { waitUntil: 'none' }); + expect(page.wait).toHaveBeenCalledWith(2); + }); +}); diff --git a/src/clis/youtube/utils.ts b/src/clis/youtube/utils.ts index 9ceaf414..caa73050 100644 --- a/src/clis/youtube/utils.ts +++ b/src/clis/youtube/utils.ts @@ -1,6 +1,7 @@ /** * Shared YouTube utilities — URL parsing, video ID extraction, etc. */ +import type { IPage } from '../../types.js'; /** * Extract a YouTube video ID from a URL or bare video ID string. @@ -26,3 +27,71 @@ export function parseVideoId(input: string): string { return input; } + +/** + * Extract a JSON object assigned to a known bootstrap variable inside YouTube HTML. + */ +export function extractJsonAssignmentFromHtml(html: string, keys: string | string[]): Record | null { + const candidates = Array.isArray(keys) ? keys : [keys]; + for (const key of candidates) { + const markers = [ + `var ${key} = `, + `window["${key}"] = `, + `window.${key} = `, + `${key} = `, + ]; + for (const marker of markers) { + const markerIndex = html.indexOf(marker); + if (markerIndex === -1) continue; + + const jsonStart = html.indexOf('{', markerIndex + marker.length); + if (jsonStart === -1) continue; + + let depth = 0; + let inString = false; + let escaping = false; + for (let i = jsonStart; i < html.length; i += 1) { + const ch = html[i]; + if (inString) { + if (escaping) { + escaping = false; + } else if (ch === '\\') { + escaping = true; + } else if (ch === '"') { + inString = false; + } + continue; + } + + if (ch === '"') { + inString = true; + continue; + } + if (ch === '{') { + depth += 1; + continue; + } + if (ch === '}') { + depth -= 1; + if (depth === 0) { + try { + return JSON.parse(html.slice(jsonStart, i + 1)) as Record; + } catch { + break; + } + } + } + } + } + } + + return null; +} + +/** + * Prepare a quiet YouTube API-capable page without opening the watch UI. + */ +export async function prepareYoutubeApiPage(page: IPage): Promise { + await page.goto('https://www.youtube.com', { waitUntil: 'none' }); + await page.wait(2); +} diff --git a/src/clis/youtube/video.ts b/src/clis/youtube/video.ts index 7ed4aab3..84cf883c 100644 --- a/src/clis/youtube/video.ts +++ b/src/clis/youtube/video.ts @@ -1,8 +1,8 @@ /** - * YouTube video metadata — read ytInitialPlayerResponse + ytInitialData from video page. + * YouTube video metadata — fetch watch HTML and parse bootstrap data without opening the watch UI. */ import { cli, Strategy } from '../../registry.js'; -import { parseVideoId } from './utils.js'; +import { extractJsonAssignmentFromHtml, parseVideoId, prepareYoutubeApiPage } from './utils.js'; import { CommandExecutionError } from '../../errors.js'; cli({ @@ -17,24 +17,29 @@ cli({ columns: ['field', 'value'], func: async (page, kwargs) => { const videoId = parseVideoId(kwargs.url); - const videoUrl = `https://www.youtube.com/watch?v=${videoId}`; - await page.goto(videoUrl); - await page.wait(3); + await prepareYoutubeApiPage(page); const data = await page.evaluate(` (async () => { - const player = window.ytInitialPlayerResponse; - const yt = window.ytInitialData; - if (!player) return { error: 'ytInitialPlayerResponse not found' }; + const extractJsonAssignmentFromHtml = ${extractJsonAssignmentFromHtml.toString()}; + + const watchResp = await fetch('/watch?v=' + encodeURIComponent(${JSON.stringify(videoId)}), { + credentials: 'include', + }); + if (!watchResp.ok) return { error: 'Watch HTML returned HTTP ' + watchResp.status }; + + const html = await watchResp.text(); + const player = extractJsonAssignmentFromHtml(html, 'ytInitialPlayerResponse'); + const yt = extractJsonAssignmentFromHtml(html, 'ytInitialData'); + if (!player) return { error: 'ytInitialPlayerResponse not found in watch HTML' }; const details = player.videoDetails || {}; const microformat = player.microformat?.playerMicroformatRenderer || {}; + const contents = yt?.contents?.twoColumnWatchNextResults?.results?.results?.contents || []; - // Try to get full description from ytInitialData + // Try to get full description from watch bootstrap data let fullDescription = details.shortDescription || ''; try { - const contents = yt?.contents?.twoColumnWatchNextResults - ?.results?.results?.contents; if (contents) { for (const c of contents) { const desc = c.videoSecondaryInfoRenderer?.attributedDescription?.content; @@ -46,8 +51,6 @@ cli({ // Get like count if available let likes = ''; try { - const contents = yt?.contents?.twoColumnWatchNextResults - ?.results?.results?.contents; if (contents) { for (const c of contents) { const buttons = c.videoPrimaryInfoRenderer?.videoActions @@ -75,8 +78,6 @@ cli({ // Get channel subscriber count if available let subscribers = ''; try { - const contents = yt?.contents?.twoColumnWatchNextResults - ?.results?.results?.contents; if (contents) { for (const c of contents) { const owner = c.videoSecondaryInfoRenderer?.owner