From 8b6d2e39358d6ebd83fcb4d0a38bcd605c660ada Mon Sep 17 00:00:00 2001 From: eelcovdw Date: Mon, 6 Oct 2025 14:01:41 +0200 Subject: [PATCH 1/3] add browser stealth --- packages/omni/examples/scrape_test.py | 54 +++++ .../omni/data_fetchers/browser_stealth.py | 214 ++++++++++++++++++ packages/omni/omni/data_fetchers/x_fetcher.py | 49 +++- 3 files changed, 315 insertions(+), 2 deletions(-) create mode 100644 packages/omni/examples/scrape_test.py create mode 100644 packages/omni/omni/data_fetchers/browser_stealth.py diff --git a/packages/omni/examples/scrape_test.py b/packages/omni/examples/scrape_test.py new file mode 100644 index 0000000..045157b --- /dev/null +++ b/packages/omni/examples/scrape_test.py @@ -0,0 +1,54 @@ +import random +import time + +from omni.data_fetchers.x_fetcher import XDataFetcher + +# List of Twitter handles to scrape +HANDLES = [ + "elonmusk", + "BillGates", + "BarackObama", + "naval", + "paulg", + "sama", + "vitalikbuterin", + "balajis", + "pmarca", + "ycombinator", + "a16z", + "naval", + "cdixon", + "openmined", +] + +# Number of accounts to scrape per job +N = 2 + +# Initialize the data fetcher +scraper = XDataFetcher(headless=True) +scraper.start() + + +max_run_time = 5 * 60 * 60 +start_time = time.time() +while True: + # Wait random interval between 300-3600 seconds + wait_time = random.randint(600, 3600) + print(f"Waiting {wait_time} seconds until next scrape...") + time.sleep(wait_time) + + # Check if max run time exceeded + if time.time() - start_time > max_run_time: + print("Max run time exceeded, exiting...") + break + # Choose N random accounts + if HANDLES: + selected_handles = random.sample(HANDLES, min(N, len(HANDLES))) + print(f"Submitting scrape job for: {selected_handles}") + scraper.add_follow_users_job( + handles=selected_handles, fetch_timeline_duration=10 + ) + else: + print("No handles configured, skipping scrape") + +scraper.stop() diff --git a/packages/omni/omni/data_fetchers/browser_stealth.py b/packages/omni/omni/data_fetchers/browser_stealth.py new file mode 100644 index 0000000..3814df2 --- /dev/null +++ b/packages/omni/omni/data_fetchers/browser_stealth.py @@ -0,0 +1,214 @@ +from playwright.async_api import BrowserContext + +BROWSER_ARGS = [ + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-blink-features=AutomationControlled", +] + +USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36" + +STEALTH_JS = """ +// Store original functions to maintain [native code] appearance +const originalDefineProperty = Object.defineProperty; +const originalGetOwnPropertyDescriptor = Object.getOwnPropertyDescriptor; + +// Try to make our overrides look as native as possible +const createNativeLookingFunction = (func, name) => { + // Wrap function to return [native code] on toString() + const wrapper = new Proxy(func, { + apply(target, thisArg, args) { + return target.apply(thisArg, args); + }, + }); + + // Override toString to return [native code] + try { + originalDefineProperty(wrapper, "toString", { + value: function () { + return `function ${name}() { [native code] }`; + }, + configurable: false, + writable: false, + }); + } catch (e) {} + + return wrapper; +}; + +// 1. userAgentData - THE CRITICAL ONE +try { + // Try to make it look native by defining on prototype + const navProto = Object.getPrototypeOf(navigator); + const originalGetter = originalGetOwnPropertyDescriptor( + navProto, + "userAgentData" + )?.get; + + originalDefineProperty(navProto, "userAgentData", { + get: createNativeLookingFunction(function () { + return { + brands: [ + { brand: "Chromium", version: "136" }, + { brand: "Google Chrome", version: "136" }, + { brand: "Not.A/Brand", version: "99" }, + ], + mobile: false, + platform: "macOS", + getHighEntropyValues: createNativeLookingFunction( + () => + Promise.resolve({ + brands: [ + { brand: "Chromium", version: "136" }, + { brand: "Google Chrome", version: "136" }, + { brand: "Not.A/Brand", version: "99" }, + ], + mobile: false, + platform: "macOS", + platformVersion: "15.0.0", + architecture: "arm64", + bitness: "64", + model: "", + uaFullVersion: "136.0.6961.0", + }), + "getHighEntropyValues" + ), + }; + }, "get userAgentData"), + configurable: true, + enumerable: true, + }); +} catch (e) { + console.error("Failed to override userAgentData:", e); +} + +// 2. Remove webdriver - try to delete completely +try { + delete Object.getPrototypeOf(navigator).webdriver; + delete navigator.__proto__.webdriver; + delete navigator.webdriver; +} catch (e) {} + +// 3. Plugins - use native-looking array +try { + const createPlugin = (name, desc, filename) => { + return { + 0: { type: "application/pdf", suffixes: "pdf", description: desc }, + 1: { type: "text/pdf", suffixes: "pdf", description: desc }, + description: desc, + filename: filename, + length: 2, + name: name, + item: createNativeLookingFunction(function (index) { + return this[index] || null; + }, "item"), + namedItem: createNativeLookingFunction(function (name) { + return null; + }, "namedItem"), + }; + }; + + const plugins = [ + createPlugin( + "PDF Viewer", + "Portable Document Format", + "internal-pdf-viewer" + ), + createPlugin( + "Chrome PDF Viewer", + "Portable Document Format", + "internal-pdf-viewer" + ), + createPlugin( + "Chromium PDF Viewer", + "Portable Document Format", + "internal-pdf-viewer" + ), + createPlugin( + "Microsoft Edge PDF Viewer", + "Portable Document Format", + "internal-pdf-viewer" + ), + createPlugin( + "WebKit built-in PDF", + "Portable Document Format", + "internal-pdf-viewer" + ), + ]; + + plugins.item = createNativeLookingFunction(function (index) { + return this[index] || null; + }, "item"); + + plugins.namedItem = createNativeLookingFunction(function (name) { + return Array.from(this).find((p) => p.name === name) || null; + }, "namedItem"); + + plugins.refresh = createNativeLookingFunction(function () {}, "refresh"); + + originalDefineProperty(Object.getPrototypeOf(navigator), "plugins", { + get: createNativeLookingFunction(() => plugins, "get plugins"), + configurable: true, + enumerable: true, + }); +} catch (e) { + console.error("Failed to override plugins:", e); +} + +// 4. MimeTypes +try { + const mimeTypes = [ + { + type: "application/pdf", + suffixes: "pdf", + description: "Portable Document Format", + }, + { + type: "text/pdf", + suffixes: "pdf", + description: "Portable Document Format", + }, + ]; + + mimeTypes.item = createNativeLookingFunction(function (index) { + return this[index] || null; + }, "item"); + + mimeTypes.namedItem = createNativeLookingFunction(function (name) { + return Array.from(this).find((m) => m.type === name) || null; + }, "namedItem"); + + originalDefineProperty(Object.getPrototypeOf(navigator), "mimeTypes", { + get: createNativeLookingFunction(() => mimeTypes, "get mimeTypes"), + configurable: true, + enumerable: true, + }); +} catch (e) { + console.error("Failed to override mimeTypes:", e); +} + +// 5. Chrome object +if (!window.chrome || !window.chrome.runtime) { + window.chrome = { + runtime: {}, + loadTimes: createNativeLookingFunction(function () {}, "loadTimes"), + csi: createNativeLookingFunction(function () {}, "csi"), + app: {}, + }; +} + +// 6. Permissions +const origQuery = navigator.permissions?.query; +if (origQuery) { + navigator.permissions.query = createNativeLookingFunction(function (params) { + if (params?.name === "notifications") { + return Promise.resolve({ state: Notification.permission }); + } + return origQuery.call(this, params); + }, "query"); +} +""" + + +async def apply_stealth_mode(context: BrowserContext) -> None: + await context.add_init_script(STEALTH_JS) diff --git a/packages/omni/omni/data_fetchers/x_fetcher.py b/packages/omni/omni/data_fetchers/x_fetcher.py index 09633be..fa0d512 100644 --- a/packages/omni/omni/data_fetchers/x_fetcher.py +++ b/packages/omni/omni/data_fetchers/x_fetcher.py @@ -11,6 +11,11 @@ import browser_cookie3 from playwright.async_api import Browser, BrowserContext, Page, async_playwright +from omni.data_fetchers.browser_stealth import ( + BROWSER_ARGS, + USER_AGENT, + apply_stealth_mode, +) from omni.data_fetchers.job_queue import DataFetcherJobQueue from omni.data_fetchers.x_utils import parse_tweets_json, parse_user_tweets_json from omni.db import get_tweet_store @@ -71,6 +76,32 @@ def load_cookies_from_file() -> list[dict] | None: return +async def simulate_user_activity(page: Page) -> None: + """Simulate user activity to bypass hasBeenActive detection""" + try: + await page.mouse.move(random.randint(100, 500), random.randint(100, 500)) + + # Random keypress + safe_keys = [ + "Tab", + "Shift", + "Escape", + "ArrowDown", + "ArrowUp", + "ArrowLeft", + "ArrowRight", + "Home", + ] + await page.keyboard.press(random.choice(safe_keys)) + + # Small scroll + await page.mouse.wheel(0, random.randint(5, 25)) + + except Exception as e: + print(f"simulate_user_activity error: {e}") + pass + + def get_cookies_for_playwright( use_cached_x_cookies: bool = settings.use_cached_x_cookies, ) -> list[dict]: @@ -93,8 +124,19 @@ async def setup_browser( """Setup browser with authentication cookies""" p = await async_playwright().start() - browser = await p.chromium.launch(headless=headless) - context = await browser.new_context() + browser = await p.chromium.launch( + headless=headless, + args=BROWSER_ARGS, + ) + + # Create context + context = await browser.new_context( + viewport={"width": 1920, "height": 1080}, + user_agent=USER_AGENT, + ) + + # Apply stealth measures + await apply_stealth_mode(context) # Set cookies before navigating await context.add_cookies(x_cookies) @@ -106,6 +148,7 @@ async def setup_browser( print(cookie) page = await context.new_page() + await simulate_user_activity(page) return browser, context, page @@ -178,6 +221,7 @@ async def fetch_timeline( # Navigate to X.com await page.goto("https://x.com") + await simulate_user_activity(page) await asyncio.sleep(3) # Click "Following" to switch to chronological timeline @@ -275,6 +319,7 @@ async def follow_user( # Navigate to user's profile profile_url = f"https://x.com/{handle.lstrip('@')}" await page.goto(profile_url) + await simulate_user_activity(page) await asyncio.sleep(3) # Find follow button for this specific user using partial aria-label (for localization) From e154bc7413ab43f492ba158064549a3a3ba9580b Mon Sep 17 00:00:00 2001 From: eelcovdw Date: Mon, 6 Oct 2025 14:02:36 +0200 Subject: [PATCH 2/3] remove script --- packages/omni/examples/scrape_test.py | 54 --------------------------- 1 file changed, 54 deletions(-) delete mode 100644 packages/omni/examples/scrape_test.py diff --git a/packages/omni/examples/scrape_test.py b/packages/omni/examples/scrape_test.py deleted file mode 100644 index 045157b..0000000 --- a/packages/omni/examples/scrape_test.py +++ /dev/null @@ -1,54 +0,0 @@ -import random -import time - -from omni.data_fetchers.x_fetcher import XDataFetcher - -# List of Twitter handles to scrape -HANDLES = [ - "elonmusk", - "BillGates", - "BarackObama", - "naval", - "paulg", - "sama", - "vitalikbuterin", - "balajis", - "pmarca", - "ycombinator", - "a16z", - "naval", - "cdixon", - "openmined", -] - -# Number of accounts to scrape per job -N = 2 - -# Initialize the data fetcher -scraper = XDataFetcher(headless=True) -scraper.start() - - -max_run_time = 5 * 60 * 60 -start_time = time.time() -while True: - # Wait random interval between 300-3600 seconds - wait_time = random.randint(600, 3600) - print(f"Waiting {wait_time} seconds until next scrape...") - time.sleep(wait_time) - - # Check if max run time exceeded - if time.time() - start_time > max_run_time: - print("Max run time exceeded, exiting...") - break - # Choose N random accounts - if HANDLES: - selected_handles = random.sample(HANDLES, min(N, len(HANDLES))) - print(f"Submitting scrape job for: {selected_handles}") - scraper.add_follow_users_job( - handles=selected_handles, fetch_timeline_duration=10 - ) - else: - print("No handles configured, skipping scrape") - -scraper.stop() From 9a0e28a06493b6280d3144d456e2c5465d250b7e Mon Sep 17 00:00:00 2001 From: Koen van der Veen Date: Mon, 6 Oct 2025 15:49:48 +0200 Subject: [PATCH 3/3] - --- packages/omni/omni/data_fetchers/browser_stealth.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/omni/omni/data_fetchers/browser_stealth.py b/packages/omni/omni/data_fetchers/browser_stealth.py index 3814df2..f84c638 100644 --- a/packages/omni/omni/data_fetchers/browser_stealth.py +++ b/packages/omni/omni/data_fetchers/browser_stealth.py @@ -89,6 +89,11 @@ delete navigator.webdriver; } catch (e) {} +// then redefine it +Object.defineProperty(navigator, 'webdriver', { + get: () => false +}); + // 3. Plugins - use native-looking array try { const createPlugin = (name, desc, filename) => {