From 6e502dbf163c7b689f00063611d6bcbce85393eb Mon Sep 17 00:00:00 2001 From: Mathis Verstrepen Date: Mon, 5 Jan 2026 18:40:19 -0500 Subject: [PATCH 1/4] fix: update _preprocess_url to handle arXiv links locally --- api/app/services/web/web_extract.py | 23 ++++++++++++++++++----- api/requirements.txt | 4 +++- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/api/app/services/web/web_extract.py b/api/app/services/web/web_extract.py index 85a3adfa..749da499 100644 --- a/api/app/services/web/web_extract.py +++ b/api/app/services/web/web_extract.py @@ -1,7 +1,8 @@ import asyncio import json import logging - +import aiofiles.os +from arxiv2text import arxiv_to_md import sentry_sdk from bs4 import BeautifulSoup from curl_cffi.requests import AsyncSession @@ -184,7 +185,7 @@ async def _attempt_browser_fetch(url: str) -> str: await browser.close() -def _preprocess_url(url: str) -> str: +async def _preprocess_url(url: str) -> tuple[str, bool]: """ Preprocesses the URL to ensure it is well-formed. """ @@ -194,13 +195,23 @@ def _preprocess_url(url: str) -> str: # Use https://arxivmd.org/ for arXiv links to get Markdown directly if "arxiv.org" in url: + parts = url.split("/") paper_id = parts[-1] if parts[-1] else parts[-2] - url = f"https://arxivmd.org/format/{paper_id}" + url = f"https://arxiv.org/pdf/{paper_id}" + md = arxiv_to_md(url, ".") + + try: + await aiofiles.os.remove(f"{paper_id}.md") + except OSError: + pass + + return str(md), True if not url.startswith("http") and not url.startswith("https"): url = "https://" + url - return url + + return url, False async def url_to_markdown(url: str) -> str | None: @@ -217,7 +228,9 @@ async def url_to_markdown(url: str) -> str | None: MAX_PROXY_ATTEMPTS = 3 RETRY_DELAY_SECONDS = 2 - url = _preprocess_url(url) + url, is_direct_content = await _preprocess_url(url) + if is_direct_content: + return url async def fetch_and_convert(content: str, base_url: str) -> str | None: """Cleans HTML or parses JSON and converts it to Markdown.""" diff --git a/api/requirements.txt b/api/requirements.txt index 3904d495..535fe01a 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -19,4 +19,6 @@ patchright python-dateutil Jinja2 aiosmtplib -pillow \ No newline at end of file +pillow +arxiv2text +requests \ No newline at end of file From 76d2d6e30bbc58d11d4573103d81b6b2341e6bf1 Mon Sep 17 00:00:00 2001 From: Mathis Verstrepen Date: Mon, 5 Jan 2026 18:40:40 -0500 Subject: [PATCH 2/4] fix: linter --- api/app/services/web/web_extract.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/app/services/web/web_extract.py b/api/app/services/web/web_extract.py index 749da499..d2c346b9 100644 --- a/api/app/services/web/web_extract.py +++ b/api/app/services/web/web_extract.py @@ -1,9 +1,10 @@ import asyncio import json import logging + import aiofiles.os -from arxiv2text import arxiv_to_md import sentry_sdk +from arxiv2text import arxiv_to_md from bs4 import BeautifulSoup from curl_cffi.requests import AsyncSession from markdownify import markdownify as md From ea847894bedffdc938e12ba0239c2da246cb30de Mon Sep 17 00:00:00 2001 From: Mathis Verstrepen Date: Mon, 5 Jan 2026 18:47:46 -0500 Subject: [PATCH 3/4] fix: enhance _preprocess_url to handle arXiv links locally with temporary directory --- api/app/services/web/web_extract.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/api/app/services/web/web_extract.py b/api/app/services/web/web_extract.py index d2c346b9..05be516e 100644 --- a/api/app/services/web/web_extract.py +++ b/api/app/services/web/web_extract.py @@ -1,4 +1,7 @@ import asyncio +import tempfile +import os +from functools import partial import json import logging @@ -190,25 +193,26 @@ async def _preprocess_url(url: str) -> tuple[str, bool]: """ Preprocesses the URL to ensure it is well-formed. """ + url = url.strip() + # Add /.json for Reddit URLs to get cleaner content if "www.reddit.com" in url: url += ".json" - # Use https://arxivmd.org/ for arXiv links to get Markdown directly if "arxiv.org" in url: - parts = url.split("/") paper_id = parts[-1] if parts[-1] else parts[-2] - url = f"https://arxiv.org/pdf/{paper_id}" - md = arxiv_to_md(url, ".") + pdf_url = f"https://arxiv.org/pdf/{paper_id}" try: - await aiofiles.os.remove(f"{paper_id}.md") - except OSError: + loop = asyncio.get_running_loop() + with tempfile.TemporaryDirectory() as temp_dir: + content = await loop.run_in_executor(None, partial(arxiv_to_md, pdf_url, temp_dir)) + return str(content), True + except Exception as e: + logging.error(f"Failed to process arXiv URL locally: {e}") pass - return str(md), True - if not url.startswith("http") and not url.startswith("https"): url = "https://" + url From fd079906340840e98fe022972809f442ad72bf3a Mon Sep 17 00:00:00 2001 From: Mathis Verstrepen Date: Mon, 5 Jan 2026 18:48:11 -0500 Subject: [PATCH 4/4] fix: linter --- api/app/services/web/web_extract.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/api/app/services/web/web_extract.py b/api/app/services/web/web_extract.py index 05be516e..2f35ede7 100644 --- a/api/app/services/web/web_extract.py +++ b/api/app/services/web/web_extract.py @@ -1,11 +1,9 @@ import asyncio -import tempfile -import os -from functools import partial import json import logging +import tempfile +from functools import partial -import aiofiles.os import sentry_sdk from arxiv2text import arxiv_to_md from bs4 import BeautifulSoup