From 033d5e7aeb1fffe1b2e2fce6b57a922f0f7085ca Mon Sep 17 00:00:00 2001 From: LiamVDB Date: Fri, 11 Jul 2025 16:46:10 +0200 Subject: [PATCH 1/7] feat: add article parsing and handling to timeline and scraper modules --- src/scraper.ts | 30 +++++++++- src/timeline-v1.ts | 74 +++++++++++++++++++++++ src/timeline-v2.ts | 144 ++++++++++++++++++++++++++++++++++++++++++++- src/tweets.ts | 15 ++++- 4 files changed, 259 insertions(+), 4 deletions(-) diff --git a/src/scraper.ts b/src/scraper.ts index 70e4f94a..6e3a3641 100644 --- a/src/scraper.ts +++ b/src/scraper.ts @@ -480,10 +480,36 @@ export class Scraper { * Set cookies for the current session. * @param cookies The cookies to set for the current session. */ - public async setCookies(cookies: (string | Cookie)[]): Promise<void> { + public async setCookies(cookies: (string | Cookie | any)[]): Promise<void> { const userAuth = new TwitterUserAuth(this.token, this.getAuthOptions()); for (const cookie of cookies) { - await userAuth.cookieJar().setCookie(cookie, twUrl); + let cookieToSet: string | Cookie; + + // If it's a plain object (from JSON.parse), convert it to a Cookie instance + if (typeof cookie === 'object' && cookie !== null && !(cookie instanceof Cookie) && typeof cookie !== 'string') { + // Fix domain issue and name/key property issue + const cookieData = { ...cookie }; + if (cookieData.domain && cookieData.domain.startsWith('.x.com')) { + cookieData.domain = 'x.com'; + } + // tough-cookie expects 'key' property, but browser cookies use 'name' + if (cookieData.name && !cookieData.key) { + cookieData.key = cookieData.name; + delete cookieData.name; + } + + const parsedCookie = Cookie.fromJSON(cookieData); + if (parsedCookie) { + cookieToSet = parsedCookie; + } else { + console.warn('Failed to parse cookie:', cookie); + continue; + } + } else { + cookieToSet = cookie; + } + + await userAuth.cookieJar().setCookie(cookieToSet, twUrl); } this.auth = userAuth; diff --git a/src/timeline-v1.ts b/src/timeline-v1.ts index 
df085d04..19969b80 100644 --- a/src/timeline-v1.ts +++ b/src/timeline-v1.ts @@ -85,6 +85,7 @@ export interface SearchResultRaw { result?: SearchResultRaw; }; legacy?: LegacyTweetRaw; + article?: ArticleRaw; } export interface TimelineResultRaw { @@ -118,6 +119,79 @@ export interface TimelineResultRaw { }; legacy?: LegacyTweetRaw; tweet?: TimelineResultRaw; + article?: ArticleRaw; +} + +export interface ArticleRaw { + article_results: { + result: ArticleResultRaw; + }; +} + +export interface ArticleResultRaw { + rest_id: string; + title: string; + cover_media?: ArticleCoverMediaRaw; + content_state: ArticleContentStateRaw; + media_entities?: ArticleMediaEntityRaw[]; +} + +export interface ArticleCoverMediaRaw { + media_key: string; + media_info: { + original_img_url: string; + }; +} + +export interface ArticleContentStateRaw { + blocks: ArticleBlockRaw[]; + entityMap: ArticleEntityRaw[]; +} + +export interface ArticleBlockRaw { + key: string; + text: string; + type: string; + inlineStyleRanges: { + offset: number; + length: number; + style: string; + }[]; + entityRanges: { + key: number; + offset: number; + length: number; + }[]; +} + +export interface ArticleEntityValueMediaItemRaw { + localMediaId: string; + mediaCategory: string; + mediaId: string; +} + +export interface ArticleEntityValueRaw { + type: string; + mutability?: string; + data: { + url?: string; + entityKey?: string; + mediaItems?: ArticleEntityValueMediaItemRaw[]; + }; +} + +export interface ArticleEntityRaw { + key: number; + value: ArticleEntityValueRaw; +} + +export interface ArticleMediaEntityRaw { + media_key: string; + media_id: string; + media_info: { + __typename: 'ApiImage' | 'ApiGif' | 'ApiVideo'; + original_img_url: string; + }; } export interface LegacyTweetRaw { diff --git a/src/timeline-v2.ts b/src/timeline-v2.ts index 213661c7..3d8531f7 100644 --- a/src/timeline-v2.ts +++ b/src/timeline-v2.ts @@ -1,6 +1,8 @@ import { CoreUserRaw, LegacyUserRaw } from './profile'; import { 
parseMediaGroups, reconstructTweetHtml } from './timeline-tweet-util'; import { + ArticleEntityValueMediaItemRaw, + ArticleResultRaw, EditControlInitialRaw, LegacyTweetRaw, ParseTweetResult, @@ -8,7 +10,7 @@ import { SearchResultRaw, TimelineResultRaw, } from './timeline-v1'; -import { Tweet } from './tweets'; +import { Article, Tweet } from './tweets'; import { isFieldDefined } from './type-util'; export interface TimelineUserResultRaw { @@ -256,6 +258,139 @@ export function parseLegacyTweet( return { success: true, tweet: tw }; } +function parseArticleToMarkdown(article: Readonly<ArticleResultRaw>): string { + const { blocks, entityMap } = article.content_state; + let markdown = `# ${article.title}\\n\\n`; + + for (const block of blocks) { + let text = block.text; + + const sortedEntityRanges = [...block.entityRanges].sort( + (a, b) => b.offset - a.offset, + ); // Reverse order to prevent messing up the offsets + for (const range of sortedEntityRanges) { + const entityWrapper = entityMap.find( + (e) => String(e.key) === String(range.key), + ); + if (!entityWrapper) continue; + const entity = entityWrapper.value; + + const chars = Array.from(text); + const originalText = chars + .slice(range.offset, range.offset + range.length) + .join(''); + let replacement = originalText; + + let textToWrap = originalText; + let trailingNewline = ''; + + if (textToWrap.endsWith('\n')) { + textToWrap = textToWrap.slice(0, -1); + trailingNewline = '\n'; + } + + if (entity.type === 'LINK' && entity.data.url) { + replacement = `[${textToWrap}](${entity.data.url})${trailingNewline}`; + } + + const prefix = chars.slice(0, range.offset).join(''); + const suffix = chars.slice(range.offset + range.length).join(''); + text = prefix + replacement + suffix; + } + + const sortedStyleRanges = [...block.inlineStyleRanges].sort( + (a, b) => b.offset - a.offset, + ); + for (const range of sortedStyleRanges) { + const chars = Array.from(text); + const originalText = chars + .slice(range.offset, range.offset + 
range.length) + .join(''); + let replacement = originalText; + + let textToWrap = originalText; + let trailingNewline = ''; + + if (textToWrap.endsWith('\n')) { + textToWrap = textToWrap.slice(0, -1); + trailingNewline = '\n'; + } + + if (range.style.toLowerCase() === 'bold') { + replacement = `**${textToWrap}**${trailingNewline}`; + } else if (range.style.toLowerCase() === 'italic') { + replacement = `*${textToWrap}*${trailingNewline}`; + } + + const prefix = chars.slice(0, range.offset).join(''); + const suffix = chars.slice(range.offset + range.length).join(''); + text = prefix + replacement + suffix; + } + + switch (block.type) { + case 'header-one': + markdown += `# ${text}\\n\\n`; + break; + case 'header-two': + markdown += `## ${text}\\n\\n`; + break; + case 'unordered-list-item': + markdown += `* ${text}\\n`; + break; + case 'atomic': + for (const range of block.entityRanges) { + const entityWrapper = entityMap.find( + (e) => String(e.key) === String(range.key), + ); + if (!entityWrapper) continue; + const entity = entityWrapper.value; + if (entity?.type === 'MEDIA' && entity.data.mediaItems) { + for (const mediaItem of entity.data.mediaItems) { + if (mediaItem?.mediaId) { + const mediaEntity = article.media_entities?.find( + (m) => m.media_id === mediaItem.mediaId, + ); + if (mediaEntity) { + markdown += `![image](${mediaEntity.media_info.original_img_url})\\n\\n`; + } + } + } + } + } + break; + case 'unstyled': + default: + markdown += `${text}\\n\\n`; + break; + } + } + + return markdown.trim(); +} + +function parseArticle(articleRaw: Readonly<ArticleResultRaw>): Article { + const article: Article = { + id: articleRaw.rest_id, + title: articleRaw.title, + blocks: articleRaw.content_state.blocks, + }; + + if (articleRaw.cover_media) { + const coverMedia = articleRaw.media_entities?.find( + (m) => m.media_key === articleRaw.cover_media?.media_key, + ); + if (coverMedia) { + article.cover = { + id: coverMedia.media_id, + url: coverMedia.media_info.original_img_url, + 
alt_text: undefined, // not available + }; + } + } + + return article; +} + function parseResult(result?: TimelineResultRaw): ParseTweetResult { const noteTweetResultText = result?.note_tweet?.note_tweet_results?.result?.text; @@ -281,6 +416,13 @@ function parseResult(result?: TimelineResultRaw): ParseTweetResult { } } + const articleRaw = result?.article?.article_results?.result; + if (articleRaw) { + tweetResult.tweet.isArticle = true; + tweetResult.tweet.article = parseArticle(articleRaw); + tweetResult.tweet.text = parseArticleToMarkdown(articleRaw); + } + const quotedResult = result?.quoted_status_result?.result; if (quotedResult) { if (quotedResult.legacy && quotedResult.rest_id) { diff --git a/src/tweets.ts b/src/tweets.ts index 2522da31..fb989454 100644 --- a/src/tweets.ts +++ b/src/tweets.ts @@ -1,7 +1,11 @@ import { addApiFeatures, requestApi } from './api'; import { TwitterAuth } from './auth'; import { getUserIdByScreenName } from './profile'; -import { LegacyTweetRaw, QueryTweetsResponse } from './timeline-v1'; +import { + ArticleBlockRaw, + LegacyTweetRaw, + QueryTweetsResponse, +} from './timeline-v1'; import { parseTimelineTweetsV2, TimelineV2, @@ -33,6 +37,13 @@ export interface Video { url?: string; } +export interface Article { + id: string; + title: string; + cover?: Photo; + blocks: ArticleBlockRaw[]; +} + export interface PlaceRaw { id?: string; place_type?: string; @@ -65,6 +76,8 @@ export interface Tweet { isReply?: boolean; isRetweet?: boolean; isSelfThread?: boolean; + isArticle?: boolean; + article?: Article; likes?: number; name?: string; mentions: Mention[]; From e1266dcdec73b2a3f73c9d4954bd57e068e525ad Mon Sep 17 00:00:00 2001 From: LiamVDB Date: Fri, 11 Jul 2025 16:48:49 +0200 Subject: [PATCH 2/7] revert unnecessary setCookies changes --- src/scraper.ts | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/src/scraper.ts b/src/scraper.ts index 6e3a3641..70e4f94a 100644 --- a/src/scraper.ts +++ 
b/src/scraper.ts @@ -480,36 +480,10 @@ export class Scraper { * Set cookies for the current session. * @param cookies The cookies to set for the current session. */ - public async setCookies(cookies: (string | Cookie | any)[]): Promise<void> { + public async setCookies(cookies: (string | Cookie)[]): Promise<void> { const userAuth = new TwitterUserAuth(this.token, this.getAuthOptions()); for (const cookie of cookies) { - let cookieToSet: string | Cookie; - - // If it's a plain object (from JSON.parse), convert it to a Cookie instance - if (typeof cookie === 'object' && cookie !== null && !(cookie instanceof Cookie) && typeof cookie !== 'string') { - // Fix domain issue and name/key property issue - const cookieData = { ...cookie }; - if (cookieData.domain && cookieData.domain.startsWith('.x.com')) { - cookieData.domain = 'x.com'; - } - // tough-cookie expects 'key' property, but browser cookies use 'name' - if (cookieData.name && !cookieData.key) { - cookieData.key = cookieData.name; - delete cookieData.name; - } - - const parsedCookie = Cookie.fromJSON(cookieData); - if (parsedCookie) { - cookieToSet = parsedCookie; - } else { - console.warn('Failed to parse cookie:', cookie); - continue; - } - } else { - cookieToSet = cookie; - } - - await userAuth.cookieJar().setCookie(cookieToSet, twUrl); + await userAuth.cookieJar().setCookie(cookie, twUrl); } this.auth = userAuth; From 006d322c23f31a024b6bf82db9ea9bc3b30babef Mon Sep 17 00:00:00 2001 From: LiamVDB Date: Fri, 11 Jul 2025 16:56:07 +0200 Subject: [PATCH 3/7] Updated to include content state in final Tweet Article, instead of only the blocks. 
--- src/timeline-v2.ts | 2 +- src/tweets.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/timeline-v2.ts b/src/timeline-v2.ts index 3d8531f7..c16fcc18 100644 --- a/src/timeline-v2.ts +++ b/src/timeline-v2.ts @@ -372,7 +372,7 @@ function parseArticle(articleRaw: Readonly<ArticleResultRaw>): Article { const article: Article = { id: articleRaw.rest_id, title: articleRaw.title, - blocks: articleRaw.content_state.blocks, + content_state: articleRaw.content_state, }; if (articleRaw.cover_media) { diff --git a/src/tweets.ts b/src/tweets.ts index fb989454..c2c76ce8 100644 --- a/src/tweets.ts +++ b/src/tweets.ts @@ -2,7 +2,7 @@ import { addApiFeatures, requestApi } from './api'; import { TwitterAuth } from './auth'; import { getUserIdByScreenName } from './profile'; import { - ArticleBlockRaw, + ArticleContentStateRaw, LegacyTweetRaw, QueryTweetsResponse, } from './timeline-v1'; @@ -41,7 +41,7 @@ export interface Article { id: string; title: string; cover?: Photo; - blocks: ArticleBlockRaw[]; + content_state: ArticleContentStateRaw; } export interface PlaceRaw { From 5a2d20fec13a9211de1351e0c346583061e59582 Mon Sep 17 00:00:00 2001 From: LiamVDB Date: Sat, 12 Jul 2025 00:51:03 +0200 Subject: [PATCH 4/7] feat: Add prepare script for automatic build on install --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index 4bc61e94..7400f691 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,7 @@ "packageManager": "yarn@1.22.19", "scripts": { "build": "rimraf dist && rollup -c", + "prepare": "yarn build", "commit": "cz", "docs:generate": "typedoc --options typedoc.json", "docs:deploy": "yarn docs:generate && gh-pages -d docs", From 3795c11a20253355f8ff31a08a5381be26343d10 Mon Sep 17 00:00:00 2001 From: LiamVDB Date: Sat, 12 Jul 2025 00:59:04 +0200 Subject: [PATCH 5/7] feat: Add prepare script for automatic build on install --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json 
b/package.json index 7400f691..07a27099 100644 --- a/package.json +++ b/package.json @@ -30,7 +30,7 @@ "packageManager": "yarn@1.22.19", "scripts": { "build": "rimraf dist && rollup -c", - "prepare": "yarn build", + "prepare": "yarn install & yarn build", "commit": "cz", "docs:generate": "typedoc --options typedoc.json", "docs:deploy": "yarn docs:generate && gh-pages -d docs", From 261fc36a445d029051bcb2c70628ca1922f91228 Mon Sep 17 00:00:00 2001 From: LiamVDB Date: Sat, 12 Jul 2025 01:01:26 +0200 Subject: [PATCH 6/7] feat: Add prepare script for automatic build on install --- package.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/package.json b/package.json index 07a27099..bee6ae47 100644 --- a/package.json +++ b/package.json @@ -30,12 +30,11 @@ "packageManager": "yarn@1.22.19", "scripts": { "build": "rimraf dist && rollup -c", - "prepare": "yarn install & yarn build", "commit": "cz", "docs:generate": "typedoc --options typedoc.json", "docs:deploy": "yarn docs:generate && gh-pages -d docs", "format": "prettier --write src/**/*.ts", - "prepare": "husky install", + "prepare": "husky install && yarn build", "test": "jest" }, "dependencies": { From 960dd7a81e1ae1149007c0403840ba2d2b7eb31a Mon Sep 17 00:00:00 2001 From: LiamVDB Date: Sat, 12 Jul 2025 15:03:28 +0200 Subject: [PATCH 7/7] fix: Set isArticle already from `UserTweets` endpoint (full information not available), only parse Article from `TweetDetail` endpoint. 
--- src/timeline-v2.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/timeline-v2.ts b/src/timeline-v2.ts index c16fcc18..966b5673 100644 --- a/src/timeline-v2.ts +++ b/src/timeline-v2.ts @@ -419,8 +419,10 @@ function parseResult(result?: TimelineResultRaw): ParseTweetResult { const articleRaw = result?.article?.article_results?.result; if (articleRaw) { tweetResult.tweet.isArticle = true; - tweetResult.tweet.article = parseArticle(articleRaw); - tweetResult.tweet.text = parseArticleToMarkdown(articleRaw); + if (articleRaw.content_state) { + tweetResult.tweet.article = parseArticle(articleRaw); + tweetResult.tweet.text = parseArticleToMarkdown(articleRaw); + } } const quotedResult = result?.quoted_status_result?.result;