From 6e502dbf163c7b689f00063611d6bcbce85393eb Mon Sep 17 00:00:00 2001
From: Mathis Verstrepen <mathisverstrepen@gmail.com>
Date: Mon, 5 Jan 2026 18:40:19 -0500
Subject: [PATCH 1/4] fix: update _preprocess_url to handle arXiv links locally

---
 api/app/services/web/web_extract.py | 23 ++++++++++++++++++-----
 api/requirements.txt                |  4 +++-
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/api/app/services/web/web_extract.py b/api/app/services/web/web_extract.py
index 85a3adfa..749da499 100644
--- a/api/app/services/web/web_extract.py
+++ b/api/app/services/web/web_extract.py
@@ -1,7 +1,8 @@
 import asyncio
 import json
 import logging
-
+import aiofiles.os
+from arxiv2text import arxiv_to_md
 import sentry_sdk
 from bs4 import BeautifulSoup
 from curl_cffi.requests import AsyncSession
@@ -184,7 +185,7 @@ async def _attempt_browser_fetch(url: str) -> str:
                 await browser.close()
 
 
-def _preprocess_url(url: str) -> str:
+async def _preprocess_url(url: str) -> tuple[str, bool]:
     """
     Preprocesses the URL to ensure it is well-formed.
     """
@@ -194,13 +195,23 @@ def _preprocess_url(url: str) -> str:
 
     # Use https://arxivmd.org/ for arXiv links to get Markdown directly
     if "arxiv.org" in url:
+
         parts = url.split("/")
         paper_id = parts[-1] if parts[-1] else parts[-2]
-        url = f"https://arxivmd.org/format/{paper_id}"
+        url = f"https://arxiv.org/pdf/{paper_id}"
+        md = arxiv_to_md(url, ".")
+
+        try:
+            await aiofiles.os.remove(f"{paper_id}.md")
+        except OSError:
+            pass
+
+        return str(md), True
 
     if not url.startswith("http") and not url.startswith("https"):
         url = "https://" + url
-    return url
+
+    return url, False
 
 
 async def url_to_markdown(url: str) -> str | None:
@@ -217,7 +228,9 @@ async def url_to_markdown(url: str) -> str | None:
     MAX_PROXY_ATTEMPTS = 3
     RETRY_DELAY_SECONDS = 2
 
-    url = _preprocess_url(url)
+    url, is_direct_content = await _preprocess_url(url)
+    if is_direct_content:
+        return url
 
     async def fetch_and_convert(content: str, base_url: str) -> str | None:
         """Cleans HTML or parses JSON and converts it to Markdown."""
diff --git a/api/requirements.txt b/api/requirements.txt
index 3904d495..535fe01a 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -19,4 +19,6 @@ patchright
 python-dateutil
 Jinja2
 aiosmtplib
-pillow
\ No newline at end of file
+pillow
+arxiv2text
+requests
\ No newline at end of file

From 76d2d6e30bbc58d11d4573103d81b6b2341e6bf1 Mon Sep 17 00:00:00 2001
From: Mathis Verstrepen <mathisverstrepen@gmail.com>
Date: Mon, 5 Jan 2026 18:40:40 -0500
Subject: [PATCH 2/4] fix: linter (sort imports in web_extract.py)

---
 api/app/services/web/web_extract.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/api/app/services/web/web_extract.py b/api/app/services/web/web_extract.py
index 749da499..d2c346b9 100644
--- a/api/app/services/web/web_extract.py
+++ b/api/app/services/web/web_extract.py
@@ -1,9 +1,10 @@
 import asyncio
 import json
 import logging
+
 import aiofiles.os
-from arxiv2text import arxiv_to_md
 import sentry_sdk
+from arxiv2text import arxiv_to_md
 from bs4 import BeautifulSoup
 from curl_cffi.requests import AsyncSession
 from markdownify import markdownify as md

From ea847894bedffdc938e12ba0239c2da246cb30de Mon Sep 17 00:00:00 2001
From: Mathis Verstrepen <mathisverstrepen@gmail.com>
Date: Mon, 5 Jan 2026 18:47:46 -0500
Subject: [PATCH 3/4] fix: enhance _preprocess_url to handle arXiv links
 locally with temporary directory

---
 api/app/services/web/web_extract.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/api/app/services/web/web_extract.py b/api/app/services/web/web_extract.py
index d2c346b9..05be516e 100644
--- a/api/app/services/web/web_extract.py
+++ b/api/app/services/web/web_extract.py
@@ -1,4 +1,7 @@
 import asyncio
+import tempfile
+import os
+from functools import partial
 import json
 import logging
 
@@ -190,25 +193,26 @@ async def _preprocess_url(url: str) -> tuple[str, bool]:
     """
     Preprocesses the URL to ensure it is well-formed.
     """
+    url = url.strip()
+
     # Add /.json for Reddit URLs to get cleaner content
     if "www.reddit.com" in url:
         url += ".json"
 
-    # Use https://arxivmd.org/ for arXiv links to get Markdown directly
     if "arxiv.org" in url:
-
         parts = url.split("/")
         paper_id = parts[-1] if parts[-1] else parts[-2]
-        url = f"https://arxiv.org/pdf/{paper_id}"
-        md = arxiv_to_md(url, ".")
+        pdf_url = f"https://arxiv.org/pdf/{paper_id}"
 
         try:
-            await aiofiles.os.remove(f"{paper_id}.md")
-        except OSError:
+            loop = asyncio.get_running_loop()
+            with tempfile.TemporaryDirectory() as temp_dir:
+                content = await loop.run_in_executor(None, partial(arxiv_to_md, pdf_url, temp_dir))
+                return str(content), True
+        except Exception as e:
+            logging.error(f"Failed to process arXiv URL locally: {e}")
             pass
 
-        return str(md), True
-
     if not url.startswith("http") and not url.startswith("https"):
         url = "https://" + url
 

From fd079906340840e98fe022972809f442ad72bf3a Mon Sep 17 00:00:00 2001
From: Mathis Verstrepen <mathisverstrepen@gmail.com>
Date: Mon, 5 Jan 2026 18:48:11 -0500
Subject: [PATCH 4/4] fix: linter (sort imports, drop os and aiofiles.os)

---
 api/app/services/web/web_extract.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/api/app/services/web/web_extract.py b/api/app/services/web/web_extract.py
index 05be516e..2f35ede7 100644
--- a/api/app/services/web/web_extract.py
+++ b/api/app/services/web/web_extract.py
@@ -1,11 +1,9 @@
 import asyncio
-import tempfile
-import os
-from functools import partial
 import json
 import logging
+import tempfile
+from functools import partial
 
-import aiofiles.os
 import sentry_sdk
 from arxiv2text import arxiv_to_md
 from bs4 import BeautifulSoup