From f846bac6a89d97a1e8199bbb678eb7bb46fb79f5 Mon Sep 17 00:00:00 2001 From: jackwener Date: Mon, 18 May 2026 19:24:12 +0800 Subject: [PATCH 1/2] fix(twitter): harden profile media download --- cli-manifest.json | 6 +- clis/twitter/download.js | 510 +++++++++++++++++++---- clis/twitter/download.test.js | 428 +++++++++++++++++++ docs/adapters/browser/twitter.md | 8 +- scripts/silent-column-drop-baseline.json | 15 - 5 files changed, 875 insertions(+), 92 deletions(-) create mode 100644 clis/twitter/download.test.js diff --git a/cli-manifest.json b/cli-manifest.json index 09cd53443..0eff37cc0 100644 --- a/cli-manifest.json +++ b/cli-manifest.json @@ -23040,7 +23040,7 @@ { "site": "twitter", "name": "download", - "description": "Download Twitter/X media (images and videos). Provide either to scan a profile's media tab, or --tweet-url to download a single tweet.", + "description": "Download Twitter/X media (images and videos). Provide either to fetch every media item from their profile via the GraphQL UserMedia endpoint with cursor pagination, or --tweet-url to download a single tweet.", "access": "read", "domain": "x.com", "strategy": "cookie", @@ -23051,7 +23051,7 @@ "type": "str", "required": false, "positional": true, - "help": "Twitter username (with or without @) to scan their /media tab. Either or --tweet-url is required." + "help": "Twitter username (with or without @) to scan their profile media. Either or --tweet-url is required." }, { "name": "tweet-url", @@ -23076,6 +23076,8 @@ ], "columns": [ "index", + "tweet_id", + "url", "type", "status", "size" diff --git a/clis/twitter/download.js b/clis/twitter/download.js index cc4e84982..d9dfc8f50 100644 --- a/clis/twitter/download.js +++ b/clis/twitter/download.js @@ -1,111 +1,473 @@ /** * Twitter/X download — download images and videos from tweets. * + * Profile media path uses the same GraphQL UserMedia endpoint the + * native client uses with cursor-based pagination, so it bypasses the + * virtual-scroll DOM cap that limited the previous scraper to ~visible + * tiles (see #1612). + * * Usage: - * opencli twitter download elonmusk --limit 10 --output ./twitter + * opencli twitter download elonmusk --limit 50 --output ./twitter * opencli twitter download --tweet-url https://x.com/xxx/status/123 --output ./twitter */ import { cli, Strategy } from '@jackwener/opencli/registry'; +import { ArgumentError, AuthRequiredError, CliError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; import { formatCookieHeader } from '@jackwener/opencli/download'; import { downloadMedia } from '@jackwener/opencli/download/media-download'; +import { + resolveTwitterOperationMetadata, + normalizeTwitterGraphqlPayload, + unwrapBrowserResult, + normalizeTwitterScreenName, + extractMedia, + parseTweetUrl, +} from './shared.js'; +import { TWITTER_BEARER_TOKEN } from './utils.js'; + +const USER_MEDIA_QUERY_ID = '9EovraBTXJYGSEQXZqlLmQ'; +const USER_BY_SCREEN_NAME_QUERY_ID = 'IGgvgiOx4QZndDHuD3x9TQ'; +const MAX_PAGINATION_PAGES = 100; + +const USER_MEDIA_FEATURES = { + rweb_video_screen_enabled: true, + rweb_cashtags_enabled: true, + profile_label_improvements_pcf_label_in_post_enabled: true, + responsive_web_profile_redirect_enabled: true, + rweb_tipjar_consumption_enabled: true, + verified_phone_label_enabled: false, + creator_subscriptions_tweet_preview_api_enabled: true, + responsive_web_graphql_timeline_navigation_enabled: true, + responsive_web_graphql_skip_user_profile_image_extensions_enabled: false, + premium_content_api_read_enabled: false, + communities_web_enable_tweet_community_results_fetch: true, + c9s_tweet_anatomy_moderator_badge_enabled: true, + responsive_web_grok_analyze_button_fetch_trends_enabled: false, + responsive_web_grok_analyze_post_followups_enabled: true, + rweb_cashtags_composer_attachment_enabled: true, + responsive_web_jetfuel_frame: true, + responsive_web_grok_share_attachment_enabled: true, + responsive_web_grok_annotations_enabled: true, + articles_preview_enabled: true, + responsive_web_edit_tweet_api_enabled: true, + rweb_conversational_replies_downvote_enabled: true, + graphql_is_translatable_rweb_tweet_is_translatable_enabled: true, + view_counts_everywhere_api_enabled: true, + longform_notetweets_consumption_enabled: true, + responsive_web_twitter_article_tweet_consumption_enabled: true, + content_disclosure_indicator_enabled: true, + content_disclosure_ai_generated_indicator_enabled: true, + responsive_web_grok_show_grok_translated_post: false, + responsive_web_grok_analysis_button_from_backend: true, + post_ctas_fetch_enabled: false, + freedom_of_speech_not_reach_fetch_enabled: true, + standardized_nudges_misinfo: true, + tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled: true, + longform_notetweets_rich_text_read_enabled: true, + longform_notetweets_inline_media_enabled: true, + responsive_web_grok_image_annotation_enabled: true, + responsive_web_grok_imagine_annotation_enabled: true, + responsive_web_grok_community_note_auto_translation_is_enabled: false, + responsive_web_enhance_cards_enabled: false, +}; + +const USER_MEDIA_FIELD_TOGGLES = { + withPayments: true, + withAuxiliaryUserLabels: true, + withArticleRichContentState: true, + withArticlePlainText: true, + withArticleSummaryText: true, + withArticleVoiceOver: true, + withGrokAnalyze: true, + withDisallowedReplyControls: true, +}; + +const USER_BY_SCREEN_NAME_FEATURES = { + hidden_profile_subscriptions_enabled: true, + profile_label_improvements_pcf_label_in_post_enabled: true, + responsive_web_profile_redirect_enabled: true, + rweb_tipjar_consumption_enabled: true, + responsive_web_graphql_exclude_directive_enabled: true, + verified_phone_label_enabled: false, + subscriptions_verification_info_is_identity_verified_enabled: true, + subscriptions_verification_info_verified_since_enabled: true, + highlights_tweets_tab_ui_enabled: true, + responsive_web_twitter_article_notes_tab_enabled: true, + subscriptions_feature_can_gift_premium: true, + creator_subscriptions_tweet_preview_api_enabled: true, + responsive_web_graphql_skip_user_profile_image_extensions_enabled: false, + responsive_web_graphql_timeline_navigation_enabled: true, +}; + +const USER_BY_SCREEN_NAME_FIELD_TOGGLES = { + withPayments: true, + withAuxiliaryUserLabels: true, +}; + +const USER_MEDIA_OPERATION = { + queryId: USER_MEDIA_QUERY_ID, + features: USER_MEDIA_FEATURES, + fieldToggles: USER_MEDIA_FIELD_TOGGLES, +}; + +const USER_BY_SCREEN_NAME_OPERATION = { + queryId: USER_BY_SCREEN_NAME_QUERY_ID, + features: USER_BY_SCREEN_NAME_FEATURES, + fieldToggles: USER_BY_SCREEN_NAME_FIELD_TOGGLES, +}; + +function requireLimit(value) { + const limit = Number(value ?? 10); + if (!Number.isInteger(limit) || limit < 1 || limit > 1000) { + throw new ArgumentError('--limit must be an integer between 1 and 1000'); + } + return limit; +} + +function nextUserMediaFetchCount(limit, downloadedCount) { + const remaining = limit - downloadedCount; + if (remaining <= 0) return 0; + const requested = remaining + 10; + if (requested > 100) return 100; + return requested; +} + +async function downloadTwitterMedia(items, options) { + const rows = await downloadMedia(items, options); + return rows.map((row, index) => { + const item = items[index] || {}; + return { + index: row.index, + tweet_id: item.tweet_id || '', + url: item.url || '', + type: row.type, + status: row.status, + size: row.size, + }; + }); +} + +function normalizeUserMediaOperation(operation) { + if (typeof operation === 'string') { + return { queryId: operation, features: USER_MEDIA_FEATURES, fieldToggles: USER_MEDIA_FIELD_TOGGLES }; + } + return { + queryId: operation?.queryId || USER_MEDIA_QUERY_ID, + features: operation?.features || USER_MEDIA_FEATURES, + fieldToggles: operation?.fieldToggles || USER_MEDIA_FIELD_TOGGLES, + }; +} + +function normalizeUserByScreenNameOperation(operation) { + if (typeof operation === 'string') { + return { queryId: operation, features: USER_BY_SCREEN_NAME_FEATURES, fieldToggles: USER_BY_SCREEN_NAME_FIELD_TOGGLES }; + } + return { + queryId: operation?.queryId || USER_BY_SCREEN_NAME_QUERY_ID, + features: operation?.features || USER_BY_SCREEN_NAME_FEATURES, + fieldToggles: operation?.fieldToggles || USER_BY_SCREEN_NAME_FIELD_TOGGLES, + }; +} + +function appendGraphqlParams(path, variables, operation) { + const fieldToggles = operation.fieldToggles || {}; + const params = [ + `variables=${encodeURIComponent(JSON.stringify(variables))}`, + `features=${encodeURIComponent(JSON.stringify(operation.features || {}))}`, + ]; + if (Object.keys(fieldToggles).length > 0) { + params.push(`fieldToggles=${encodeURIComponent(JSON.stringify(fieldToggles))}`); + } + return `${path}?${params.join('&')}`; +} + +function buildUserMediaUrl(operation, userId, count, cursor) { + const normalized = normalizeUserMediaOperation(operation); + const vars = { + userId, + count, + includePromotedContent: false, + withClientEventToken: false, + withBirdwatchNotes: false, + withVoice: true, + }; + if (cursor) vars.cursor = cursor; + return appendGraphqlParams(`/i/api/graphql/${normalized.queryId}/UserMedia`, vars, normalized); +} + +function buildUserByScreenNameUrl(operation, screenName) { + const normalized = normalizeUserByScreenNameOperation(operation); + const vars = { screen_name: screenName, withSafetyModeUserFields: true }; + return appendGraphqlParams(`/i/api/graphql/${normalized.queryId}/UserByScreenName`, vars, normalized); +} + +function classifyMediaUrl(url) { + if (!url) return 'unknown'; + if (/video\.twimg\.com|\.mp4(\?|$)|\.m3u8(\?|$)/.test(url)) return 'video'; + return 'image'; +} + +function requireObjectPayload(value, context) { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + throw new CommandExecutionError(`Twitter ${context} returned malformed payload`); + } + return value; +} + +function throwGraphqlFetchError(context, status, message) { + if (status === 401 || status === 403) { + throw new AuthRequiredError('x.com', `Twitter ${context} requires an authenticated x.com session`); + } + if (status === 404) { + throw new EmptyResultError(`twitter download ${context}`, message || 'Twitter returned not found'); + } + const statusText = status ? `HTTP ${status}` : 'fetch failed'; + throw new CommandExecutionError(`Twitter ${context} fetch failed: ${statusText}${message ? ` - ${message}` : ''}`); +} + +function requireFetchPayload(value, context) { + const result = requireObjectPayload(unwrapBrowserResult(value), context); + if (result.ok === true) { + return result.payload; + } + if (result.ok === false) { + throwGraphqlFetchError(context, Number(result.status) || 0, typeof result.error === 'string' ? result.error : ''); + } + throw new CommandExecutionError(`Twitter ${context} returned malformed fetch result`); +} + +function requireUserMediaPayload(data) { + const payload = requireObjectPayload(data, 'UserMedia'); + if (Array.isArray(payload.errors) && payload.errors.length > 0) { + throw new CommandExecutionError(`Twitter UserMedia returned GraphQL errors: ${JSON.stringify(payload.errors).slice(0, 200)}`); + } + const result = payload.data?.user?.result; + if (!result || typeof result !== 'object') { + throw new CommandExecutionError('Twitter UserMedia returned malformed user result'); + } + const instructions = result.timeline_v2?.timeline?.instructions || result.timeline?.timeline?.instructions; + if (!Array.isArray(instructions)) { + throw new CommandExecutionError('Twitter UserMedia returned malformed timeline instructions'); + } + return payload; +} + +function parseUserMedia(data, seen) { + const items = []; + let nextCursor = null; + const result = requireUserMediaPayload(data).data.user.result; + const instructionSets = [ + result.timeline_v2?.timeline?.instructions, + result.timeline?.timeline?.instructions, + ].filter(Array.isArray); + const instructions = instructionSets.flat(); + const visit = (value) => { + if (!value || typeof value !== 'object') return; + if (value.type === 'TimelinePinEntry') return; + if (value.tweet_results?.result) { + const raw = value.tweet_results.result; + const tw = raw.__typename === 'TweetWithVisibilityResults' && raw.tweet + ? raw.tweet + : (raw.tweet || raw); + const tweetId = typeof tw.rest_id === 'string' || typeof tw.rest_id === 'number' ? String(tw.rest_id) : ''; + if (!tweetId) { + throw new CommandExecutionError('Twitter UserMedia returned a tweet without rest_id'); + } + if (!seen.has(tweetId)) { + seen.add(tweetId); + const { media_urls } = extractMedia(tw.legacy || {}); + for (const url of media_urls) { + items.push({ tweet_id: tweetId, url, type: classifyMediaUrl(url) }); + } + } + } + if ( + (value.entryType === 'TimelineTimelineCursor' || value.__typename === 'TimelineTimelineCursor') + && (value.cursorType === 'Bottom' || value.cursorType === 'ShowMore') + && value.value + ) { + nextCursor = value.value; + } + if (Array.isArray(value)) { + for (const item of value) visit(item); + return; + } + for (const child of Object.values(value)) { + if (child && typeof child === 'object') visit(child); + } + }; + visit(instructions); + return { items, nextCursor }; +} + cli({ site: 'twitter', name: 'download', access: 'read', - description: 'Download Twitter/X media (images and videos). Provide either to scan a profile\'s media tab, or --tweet-url to download a single tweet.', + description: 'Download Twitter/X media (images and videos). Provide either to fetch every media item from their profile via the GraphQL UserMedia endpoint with cursor pagination, or --tweet-url to download a single tweet.', domain: 'x.com', strategy: Strategy.COOKIE, + browser: true, args: [ - { name: 'username', positional: true, help: 'Twitter username (with or without @) to scan their /media tab. Either or --tweet-url is required.' }, + { name: 'username', positional: true, help: 'Twitter username (with or without @) to scan their profile media. Either or --tweet-url is required.' }, { name: 'tweet-url', help: 'Single tweet URL to download. Use this OR , not both required at once.' }, { name: 'limit', type: 'int', default: 10, help: 'Maximum number of media items to download when scanning a profile (default 10). Ignored when --tweet-url is used.' }, { name: 'output', default: './twitter-downloads', help: 'Output directory (default ./twitter-downloads). A per-source subdir is created inside.' }, ], - columns: ['index', 'type', 'status', 'size'], + columns: ['index', 'tweet_id', 'url', 'type', 'status', 'size'], func: async (page, kwargs) => { - const username = kwargs.username; - const tweetUrl = kwargs['tweet-url']; - const limit = kwargs.limit; - const output = kwargs.output; - if (!username && !tweetUrl) { - return [{ - index: 0, - type: '-', - status: 'failed', - size: 'Must provide a username or --tweet-url', - }]; + try { + const rawUsername = String(kwargs.username ?? '').trim(); + const tweetUrl = String(kwargs['tweet-url'] ?? '').trim(); + const output = kwargs.output; + if (!rawUsername && !tweetUrl) { + throw new ArgumentError('twitter download requires either or --tweet-url'); + } + if (rawUsername && tweetUrl) { + throw new ArgumentError('Use either or --tweet-url, not both'); + } + if (tweetUrl) { + return downloadSingleTweet(page, tweetUrl, output); + } + const limit = requireLimit(kwargs.limit); + const username = normalizeTwitterScreenName(rawUsername); + if (!username) { + throw new ArgumentError('twitter download username must be a valid Twitter/X handle', 'Example: opencli twitter download @jack --limit 20'); + } + return downloadUserMedia(page, username, limit, output); } - // Navigate to the appropriate page - if (tweetUrl) { - await page.goto(tweetUrl); + catch (err) { + if (err instanceof CliError) throw err; + throw new CommandExecutionError(`twitter download failed: ${err?.message ?? String(err)}`); } - else { - await page.goto(`https://x.com/${username}/media`); - } - await page.wait(3); - // Scroll to load more content - if (!tweetUrl) { - await page.autoScroll({ times: Math.ceil(limit / 5) }); + }, +}); + +async function downloadUserMedia(page, username, limit, output) { + await page.goto(`https://x.com/${username}`); + await page.wait({ selector: '[data-testid="primaryColumn"]' }); + + const cookies = await page.getCookies({ url: 'https://x.com' }); + const ct0 = cookies.find((c) => c.name === 'ct0')?.value || null; + if (!ct0) throw new AuthRequiredError('x.com', 'Not logged into x.com (no ct0 cookie)'); + + const userMediaOperation = await resolveTwitterOperationMetadata(page, 'UserMedia', USER_MEDIA_OPERATION); + const userByScreenNameOperation = await resolveTwitterOperationMetadata(page, 'UserByScreenName', USER_BY_SCREEN_NAME_OPERATION); + + const headers = JSON.stringify({ + 'Authorization': `Bearer ${decodeURIComponent(TWITTER_BEARER_TOKEN)}`, + 'X-Csrf-Token': ct0, + 'X-Twitter-Auth-Type': 'OAuth2Session', + 'X-Twitter-Active-User': 'yes', + }); + + const ubsUrl = buildUserByScreenNameUrl(userByScreenNameOperation, username); + const userLookup = requireFetchPayload(await page.evaluate(`async () => { + try { + const resp = await fetch("${ubsUrl}", { headers: ${headers}, credentials: 'include' }); + if (!resp.ok) return { ok: false, status: resp.status }; + const payload = await resp.json(); + return { ok: true, payload }; + } catch (err) { + return { ok: false, error: err?.message ?? String(err) }; + } + }`)); + const normalizedUserLookup = normalizeTwitterGraphqlPayload(userLookup); + if (Array.isArray(normalizedUserLookup?.errors) && normalizedUserLookup.errors.length > 0) { + throw new CommandExecutionError(`Twitter UserByScreenName returned GraphQL errors: ${JSON.stringify(normalizedUserLookup.errors).slice(0, 200)}`); + } + const userId = normalizedUserLookup?.data?.user?.result?.rest_id; + if (!userId) throw new EmptyResultError(`twitter download @${username}`, `Could not resolve @${username}`); + + const seen = new Set(); + const all = []; + let cursor = null; + for (let i = 0; i < MAX_PAGINATION_PAGES && all.length < limit; i++) { + const fetchCount = nextUserMediaFetchCount(limit, all.length); + if (fetchCount === 0) break; + const url = buildUserMediaUrl(userMediaOperation, userId, fetchCount, cursor); + const data = normalizeTwitterGraphqlPayload(requireFetchPayload(await page.evaluate(`async () => { + try { + const r = await fetch("${url}", { headers: ${headers}, credentials: 'include' }); + if (!r.ok) return { ok: false, status: r.status }; + return { ok: true, payload: await r.json() }; + } catch (err) { + return { ok: false, error: err?.message ?? String(err) }; } - // Extract media URLs - const data = await page.evaluate(` - (() => { - const media = []; + }`))); + const { items, nextCursor } = parseUserMedia(data, seen); + all.push(...items); + if (!nextCursor || nextCursor === cursor) break; + cursor = nextCursor; + } + + if (all.length === 0) throw new EmptyResultError(`@${username} has no media`, 'Account may be private, suspended, or have no media posts'); + + const trimmed = all.slice(0, limit); + return downloadTwitterMedia(trimmed, { + output, + subdir: username, + cookies: formatCookieHeader(cookies), + browserCookies: cookies, + filenamePrefix: username, + ytdlpExtraArgs: ['--merge-output-format', 'mp4'], + }); +} - // Find images (high quality) +async function downloadSingleTweet(page, tweetUrl, output) { + const target = parseTweetUrl(tweetUrl); + await page.goto(target.url); + await page.wait(3); + const items = unwrapBrowserResult(await page.evaluate(` + (() => { + const out = []; document.querySelectorAll('img[src*="pbs.twimg.com/media"]').forEach(img => { let src = img.src || ''; - // Get large version src = src.replace(/&name=\\w+$/, '&name=large'); - src = src.replace(/\\?format=/, '?format='); - if (!src.includes('&name=')) { - src = src + '&name=large'; - } - media.push({ type: 'image', url: src }); + if (!src.includes('&name=')) src = src + '&name=large'; + out.push({ type: 'image', url: src }); }); - - // Find videos document.querySelectorAll('video').forEach(video => { const src = video.src || ''; - if (src) { - media.push({ type: 'video', url: src, poster: video.poster || '' }); - } + if (src) out.push({ type: 'video', url: src }); }); - - // Find video tweets (for yt-dlp) document.querySelectorAll('[data-testid="videoPlayer"]').forEach(player => { const tweetLink = player.closest('article')?.querySelector('a[href*="/status/"]'); const href = tweetLink?.getAttribute('href') || ''; - if (href) { - const tweetUrl = 'https://x.com' + href; - media.push({ type: 'video-tweet', url: tweetUrl }); - } + if (href) out.push({ type: 'video-tweet', url: 'https://x.com' + href }); }); - - return media; + return out; })() - `); - if (!data || data.length === 0) { - return [{ index: 0, type: '-', status: 'failed', size: 'No media found' }]; - } - // Extract cookies - const browserCookies = await page.getCookies({ domain: 'x.com' }); - // Deduplicate media - const seen = new Set(); - const uniqueMedia = data.filter((m) => { - if (seen.has(m.url)) - return false; - seen.add(m.url); - return true; - }).slice(0, limit); - const subdir = tweetUrl ? 'tweets' : (username || 'media'); - return downloadMedia(uniqueMedia, { - output, - subdir, - cookies: formatCookieHeader(browserCookies), - browserCookies, - filenamePrefix: username || 'tweet', - ytdlpExtraArgs: ['--merge-output-format', 'mp4'], - }); - }, -}); + `)); + if (!Array.isArray(items)) { + throw new CommandExecutionError('Twitter tweet media extraction returned malformed payload'); + } + if (items.length === 0) { + throw new EmptyResultError(`twitter download ${target.id}`, 'No media found in the tweet'); + } + const cookies = await page.getCookies({ domain: 'x.com' }); + const seen = new Set(); + const unique = items.filter((m) => { + if (seen.has(m.url)) return false; + seen.add(m.url); + return true; + }).map((m) => { + return { ...m, tweet_id: target.id }; + }); + return downloadTwitterMedia(unique, { + output, + subdir: 'tweets', + cookies: formatCookieHeader(cookies), + browserCookies: cookies, + filenamePrefix: 'tweet', + ytdlpExtraArgs: ['--merge-output-format', 'mp4'], + }); +} + +export const __test__ = { + buildUserMediaUrl, + buildUserByScreenNameUrl, + parseUserMedia, + classifyMediaUrl, + requireLimit, + nextUserMediaFetchCount, +}; diff --git a/clis/twitter/download.test.js b/clis/twitter/download.test.js new file mode 100644 index 000000000..cc60d203e --- /dev/null +++ b/clis/twitter/download.test.js @@ -0,0 +1,428 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +const { mockDownloadMedia, mockFormatCookieHeader } = vi.hoisted(() => ({ + mockDownloadMedia: vi.fn(), + mockFormatCookieHeader: vi.fn(() => 'ct0=token'), +})); +vi.mock('@jackwener/opencli/download/media-download', () => ({ + downloadMedia: mockDownloadMedia, +})); +vi.mock('@jackwener/opencli/download', () => ({ + formatCookieHeader: mockFormatCookieHeader, +})); +import { getRegistry } from '@jackwener/opencli/registry'; +import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; +import { __test__ } from './download.js'; + +const { + buildUserMediaUrl, + buildUserByScreenNameUrl, + parseUserMedia, + classifyMediaUrl, + requireLimit, + nextUserMediaFetchCount, +} = __test__; + +function createPageMock(evaluateResults = []) { + const evaluate = vi.fn(); + for (const result of evaluateResults) evaluate.mockResolvedValueOnce(result); + evaluate.mockResolvedValue(undefined); + return { + goto: vi.fn().mockResolvedValue(undefined), + wait: vi.fn().mockResolvedValue(undefined), + evaluate, + getCookies: vi.fn().mockResolvedValue([{ name: 'ct0', value: 'token', domain: '.x.com' }]), + }; +} + +function userLookupPayload(userId = '42') { + return { + ok: true, + payload: { + data: { + user: { + result: { rest_id: userId }, + }, + }, + }, + }; +} + +function userMediaPayload(entries) { + return { + ok: true, + payload: { + data: { + user: { + result: { + timeline_v2: { + timeline: { + instructions: [{ entries }], + }, + }, + }, + }, + }, + }, + }; +} + +function tweetEntry(id, url = `https://pbs.twimg.com/media/${id}.jpg`) { + return { + content: { + itemContent: { + tweet_results: { + result: { + rest_id: id, + legacy: { + extended_entities: { + media: [{ type: 'photo', media_url_https: url }], + }, + }, + }, + }, + }, + }, + }; +} + +describe('twitter download helpers', () => { + beforeEach(() => { + mockDownloadMedia.mockReset(); + mockDownloadMedia.mockResolvedValue([{ index: 1, type: 'image', status: 'success', size: '1 KB' }]); + mockFormatCookieHeader.mockClear(); + }); + + it('registers the canonical download columns', () => { + const cmd = getRegistry().get('twitter/download'); + expect(cmd?.columns).toEqual(['index', 'tweet_id', 'url', 'type', 'status', 'size']); + }); + + it('makes username positional and tweet-url a flag', () => { + const cmd = getRegistry().get('twitter/download'); + const usernameArg = cmd?.args?.find((a) => a.name === 'username'); + const tweetUrlArg = cmd?.args?.find((a) => a.name === 'tweet-url'); + expect(usernameArg?.positional).toBe(true); + expect(tweetUrlArg?.positional).not.toBe(true); + }); + + it('builds a UserMedia URL with userId, count and cursor', () => { + const url = buildUserMediaUrl( + { queryId: 'QID', features: { fa: true }, fieldToggles: { fb: true } }, + '42', + 50, + 'cursor-xyz', + ); + expect(url.startsWith('/i/api/graphql/QID/UserMedia?')).toBe(true); + const vars = JSON.parse(decodeURIComponent(url.match(/variables=([^&]+)/)[1])); + expect(vars.userId).toBe('42'); + expect(vars.count).toBe(50); + expect(vars.cursor).toBe('cursor-xyz'); + expect(vars.includePromotedContent).toBe(false); + }); + + it('omits cursor variable when not paging', () => { + const url = buildUserMediaUrl({ queryId: 'QID', features: {}, fieldToggles: {} }, '42', 10, null); + const vars = JSON.parse(decodeURIComponent(url.match(/variables=([^&]+)/)[1])); + expect(vars.cursor).toBeUndefined(); + }); + + it('builds a UserByScreenName URL with the screen_name variable', () => { + const url = buildUserByScreenNameUrl( + { queryId: 'UBSN', features: {}, fieldToggles: {} }, + 'jack', + ); + expect(url.startsWith('/i/api/graphql/UBSN/UserByScreenName?')).toBe(true); + expect(decodeURIComponent(url)).toContain('"screen_name":"jack"'); + }); + + it('classifies twimg video URLs as video and pbs URLs as image', () => { + expect(classifyMediaUrl('https://video.twimg.com/amplify_video/123/vid/avc1/720x1280/abc.mp4?tag=27')).toBe('video'); + expect(classifyMediaUrl('https://pbs.twimg.com/media/AbCdEf.jpg')).toBe('image'); + expect(classifyMediaUrl('https://example.com/clip.m3u8')).toBe('video'); + expect(classifyMediaUrl(null)).toBe('unknown'); + }); + + it('strictly validates profile download limit', () => { + expect(requireLimit(undefined)).toBe(10); + expect(requireLimit(1)).toBe(1); + for (const value of [0, -1, 1.5, 'abc', 1001]) { + expect(() => requireLimit(value)).toThrow(ArgumentError); + } + }); + + it('calculates profile media page sizes without silently clamping user input', () => { + expect(nextUserMediaFetchCount(1, 0)).toBe(11); + expect(nextUserMediaFetchCount(1000, 0)).toBe(100); + expect(nextUserMediaFetchCount(1000, 950)).toBe(60); + expect(nextUserMediaFetchCount(10, 10)).toBe(0); + }); + + it('extracts media urls and the bottom cursor from a UserMedia payload', () => { + const payload = { + data: { + user: { + result: { + timeline_v2: { + timeline: { + instructions: [ + { + entries: [ + { + content: { + itemContent: { + tweet_results: { + result: { + rest_id: 'tweet-1', + legacy: { + extended_entities: { + media: [ + { type: 'photo', media_url_https: 'https://pbs.twimg.com/media/IMG1.jpg' }, + { type: 'video', video_info: { variants: [{ content_type: 'video/mp4', url: 'https://video.twimg.com/v/1.mp4' }] } }, + ], + }, + }, + }, + }, + }, + }, + }, + { + content: { + entryType: 'TimelineTimelineCursor', + cursorType: 'Bottom', + value: 'next-cursor-abc', + }, + }, + ], + }, + ], + }, + }, + }, + }, + }, + }; + const seen = new Set(); + const { items, nextCursor } = parseUserMedia(payload, seen); + expect(nextCursor).toBe('next-cursor-abc'); + expect(items).toHaveLength(2); + expect(items[0]).toMatchObject({ tweet_id: 'tweet-1', url: 'https://pbs.twimg.com/media/IMG1.jpg', type: 'image' }); + expect(items[1]).toMatchObject({ tweet_id: 'tweet-1', url: 'https://video.twimg.com/v/1.mp4', type: 'video' }); + expect(seen.has('tweet-1')).toBe(true); + }); + + it('skips already-seen tweets across pages', () => { + const tweetEntry = (id) => ({ + content: { + itemContent: { + tweet_results: { + result: { + rest_id: id, + legacy: { + extended_entities: { + media: [{ type: 'photo', media_url_https: `https://pbs.twimg.com/media/${id}.jpg` }], + }, + }, + }, + }, + }, + }, + }); + const payload = { + data: { + user: { + result: { + timeline_v2: { + timeline: { + instructions: [{ entries: [tweetEntry('A'), tweetEntry('A'), tweetEntry('B')] }], + }, + }, + }, + }, + }, + }; + const seen = new Set(); + const { items } = parseUserMedia(payload, seen); + expect(items.map((item) => item.tweet_id)).toEqual(['A', 'B']); + }); + + it('treats TweetWithVisibilityResults wrappers as tweets', () => { + const payload = { + data: { + user: { + result: { + timeline_v2: { + timeline: { + instructions: [ + { + entries: [ + { + content: { + itemContent: { + tweet_results: { + result: { + __typename: 'TweetWithVisibilityResults', + tweet: { + rest_id: 'wrapped-1', + legacy: { + extended_entities: { + media: [{ type: 'photo', media_url_https: 'https://pbs.twimg.com/media/W.jpg' }], + }, + }, + }, + }, + }, + }, + }, + }, + ], + }, + ], + }, + }, + }, + }, + }, + }; + const { items } = parseUserMedia(payload, new Set()); + expect(items).toHaveLength(1); + expect(items[0].tweet_id).toBe('wrapped-1'); + }); + + it('fails typed when UserMedia payload has no timeline instructions', () => { + expect(() => parseUserMedia({ data: { user: { result: {} } } }, new Set())) + .toThrow(CommandExecutionError); + }); + + it('rejects missing, mixed, invalid username and invalid limit before navigation', async () => { + const cmd = getRegistry().get('twitter/download'); + for (const args of [ + {}, + { username: 'jack', 'tweet-url': 'https://x.com/jack/status/123' }, + { username: 'bad/name' }, + { username: 'jack', limit: 0 }, + ]) { + const page = createPageMock(); + await expect(cmd.func(page, args)).rejects.toBeInstanceOf(ArgumentError); + expect(page.goto).not.toHaveBeenCalled(); + } + }); + + it('downloads profile media through UserByScreenName and UserMedia GraphQL payloads', async () => { + const cmd = getRegistry().get('twitter/download'); + mockDownloadMedia.mockResolvedValueOnce([ + { index: 1, type: 'image', status: 'success', size: '1 KB' }, + { index: 2, type: 'image', status: 'success', size: '2 KB' }, + ]); + const page = createPageMock([ + { queryId: 'UM', features: { a: true }, fieldToggles: {} }, + { queryId: 'UB', features: {}, fieldToggles: {} }, + userLookupPayload('42'), + userMediaPayload([ + tweetEntry('A'), + { + content: { + entryType: 'TimelineTimelineCursor', + cursorType: 'Bottom', + value: 'cursor-1', + }, + }, + ]), + userMediaPayload([tweetEntry('B')]), + ]); + const rows = await cmd.func(page, { username: '@jack', limit: 2, output: './out' }); + expect(page.goto).toHaveBeenCalledWith('https://x.com/jack'); + expect(page.evaluate).toHaveBeenCalledTimes(5); + expect(mockDownloadMedia).toHaveBeenCalledWith([ + { tweet_id: 'A', url: 'https://pbs.twimg.com/media/A.jpg', type: 'image' }, + { tweet_id: 'B', url: 'https://pbs.twimg.com/media/B.jpg', type: 'image' }, + ], expect.objectContaining({ + output: './out', + subdir: 'jack', + filenamePrefix: 'jack', + cookies: 'ct0=token', + })); + expect(rows).toEqual([ + { + index: 1, + tweet_id: 'A', + url: 'https://pbs.twimg.com/media/A.jpg', + type: 'image', + status: 'success', + size: '1 KB', + }, + { + index: 2, + tweet_id: 'B', + url: 'https://pbs.twimg.com/media/B.jpg', + type: 'image', + status: 'success', + size: '2 KB', + }, + ]); + }); + + it('maps missing ct0 and GraphQL auth failures to AuthRequiredError', async () => { + const cmd = getRegistry().get('twitter/download'); + const noCt0Page = createPageMock(); + noCt0Page.getCookies.mockResolvedValueOnce([]); + await expect(cmd.func(noCt0Page, { username: 'jack', limit: 1 })) + .rejects.toBeInstanceOf(AuthRequiredError); + + const authPage = createPageMock([ + { queryId: 'UM', features: {}, fieldToggles: {} }, + { queryId: 'UB', features: {}, fieldToggles: {} }, + { ok: false, status: 401 }, + ]); + await expect(cmd.func(authPage, { username: 'jack', limit: 1 })) + .rejects.toBeInstanceOf(AuthRequiredError); + }); + + it('fails typed for malformed UserMedia and fetch failures instead of partial success', async () => { + const cmd = getRegistry().get('twitter/download'); + const malformedPage = createPageMock([ + { queryId: 'UM', features: {}, fieldToggles: {} }, + { queryId: 'UB', features: {}, fieldToggles: {} }, + userLookupPayload('42'), + { ok: true, payload: { data: { user: { result: {} } } } }, + ]); + await expect(cmd.func(malformedPage, { username: 'jack', limit: 1 })) + .rejects.toBeInstanceOf(CommandExecutionError); + + const partialPage = createPageMock([ + { queryId: 'UM', features: {}, fieldToggles: {} }, + { queryId: 'UB', features: {}, fieldToggles: {} }, + userLookupPayload('42'), + userMediaPayload([ + tweetEntry('A'), + { + content: { + entryType: 'TimelineTimelineCursor', + cursorType: 'Bottom', + value: 'cursor-1', + }, + }, + ]), + { ok: false, status: 500 }, + ]); + await expect(cmd.func(partialPage, { username: 'jack', limit: 2 })) + .rejects.toBeInstanceOf(CommandExecutionError); + expect(mockDownloadMedia).not.toHaveBeenCalled(); + }); + + it('uses typed empty result for profile or tweet media absence', async () => { + const cmd = getRegistry().get('twitter/download'); + const profilePage = createPageMock([ + { queryId: 'UM', features: {}, fieldToggles: {} }, + { queryId: 'UB', features: {}, fieldToggles: {} }, + userLookupPayload('42'), + userMediaPayload([]), + ]); + await expect(cmd.func(profilePage, { username: 'jack', limit: 1 })) + .rejects.toBeInstanceOf(EmptyResultError); + + const tweetPage = createPageMock([[]]); + await expect(cmd.func(tweetPage, { 'tweet-url': 'https://x.com/jack/status/123' })) + .rejects.toBeInstanceOf(EmptyResultError); + }); +}); diff --git a/docs/adapters/browser/twitter.md b/docs/adapters/browser/twitter.md index df5397460..6ca393a41 100644 --- a/docs/adapters/browser/twitter.md +++ b/docs/adapters/browser/twitter.md @@ -32,7 +32,7 @@ | `opencli twitter block` | | | `opencli twitter unblock` | | | `opencli twitter hide-reply` | | -| `opencli twitter download` | | +| `opencli twitter download` | Download media from a profile via GraphQL UserMedia pagination, or from one tweet URL | | `opencli twitter accept` | | | `opencli twitter reply-dm` | | | `opencli twitter unlike` | | @@ -56,6 +56,12 @@ opencli twitter search "react 19" --filter live opencli twitter following @elonmusk --limit 200 opencli twitter followers @elonmusk --limit 100 +# Download profile media with cursor pagination +opencli twitter download @elonmusk --limit 50 --output ./twitter-media + +# Download media from a single tweet +opencli twitter download --tweet-url https://x.com/jack/status/20 --output ./twitter-media + # Write actions (require login). Idempotent — calling twice is safe. opencli twitter like https://x.com/jack/status/20 opencli twitter unlike https://x.com/jack/status/20 diff --git a/scripts/silent-column-drop-baseline.json b/scripts/silent-column-drop-baseline.json index b84d2c50e..b9d81bca1 100644 --- a/scripts/silent-column-drop-baseline.json +++ b/scripts/silent-column-drop-baseline.json @@ -612,21 +612,6 @@ "name" ] }, - { - "command": "twitter/download", - "file": "clis/twitter/download.js", - "missing": [ - "poster", - "url" - ] - }, - { - "command": "twitter/download", - "file": "clis/twitter/download.js", - "missing": [ - "url" - ] - }, { "command": "twitter/list-remove", "file": "clis/twitter/list-remove.js", From 2d2039eb7fa3e7df019fa2f7ea4ff6c22582352e Mon Sep 17 00:00:00 2001 From: jackwener Date: Mon, 18 May 2026 19:29:44 +0800 Subject: [PATCH 2/2] fix(twitter): fail closed on repeated media cursor --- clis/twitter/download.js | 12 ++++++++++-- clis/twitter/download.test.js | 29 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/clis/twitter/download.js b/clis/twitter/download.js index d9dfc8f50..c1ec72d8c 100644 --- a/clis/twitter/download.js +++ b/clis/twitter/download.js @@ -381,6 +381,7 @@ async function downloadUserMedia(page, username, limit, output) { const seen = new Set(); const all = []; let cursor = null; + let hasMorePages = false; for (let i = 0; i < MAX_PAGINATION_PAGES && all.length < limit; i++) { const fetchCount = nextUserMediaFetchCount(limit, all.length); if (fetchCount === 0) break; @@ -393,14 +394,21 @@ async function downloadUserMedia(page, username, limit, output) { } catch (err) { return { ok: false, error: err?.message ?? String(err) }; } - }`))); + }`))); const { items, nextCursor } = parseUserMedia(data, seen); all.push(...items); - if (!nextCursor || nextCursor === cursor) break; + hasMorePages = Boolean(nextCursor); + if (!nextCursor) break; + if (nextCursor === cursor) { + throw new CommandExecutionError('Twitter UserMedia pagination returned the same cursor twice'); + } cursor = nextCursor; } if (all.length === 0) throw new EmptyResultError(`@${username} has no media`, 'Account may be private, suspended, or have no media posts'); + if (all.length < limit && hasMorePages) { + throw new CommandExecutionError(`Twitter UserMedia pagination reached the ${MAX_PAGINATION_PAGES}-page safety cap before collecting ${limit} media items`); + } const trimmed = all.slice(0, limit); return downloadTwitterMedia(trimmed, { diff --git a/clis/twitter/download.test.js b/clis/twitter/download.test.js index cc60d203e..672ffe418 100644 --- a/clis/twitter/download.test.js +++ b/clis/twitter/download.test.js @@ -408,6 +408,35 @@ describe('twitter download helpers', () => { await expect(cmd.func(partialPage, { username: 'jack', limit: 2 })) .rejects.toBeInstanceOf(CommandExecutionError); expect(mockDownloadMedia).not.toHaveBeenCalled(); + + const repeatedCursorPage = createPageMock([ + { queryId: 'UM', features: {}, fieldToggles: {} }, + { queryId: 'UB', features: {}, fieldToggles: {} }, + userLookupPayload('42'), + userMediaPayload([ + tweetEntry('A'), + { + content: { + entryType: 'TimelineTimelineCursor', + cursorType: 'Bottom', + value: 'cursor-1', + }, + }, + ]), + userMediaPayload([ + tweetEntry('B'), + { + content: { + entryType: 'TimelineTimelineCursor', + cursorType: 'Bottom', + value: 'cursor-1', + }, + }, + ]), + ]); + await expect(cmd.func(repeatedCursorPage, { username: 'jack', limit: 3 })) + .rejects.toThrowError(/same cursor twice/); + expect(mockDownloadMedia).not.toHaveBeenCalled(); }); it('uses typed empty result for profile or tweet media absence', async () => {