From 3e95596958d548ccf7e66126ce4f930f03296e19 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Wed, 15 Apr 2026 13:07:37 -0400 Subject: [PATCH 01/17] Introduce cached, concurrent crawl and ingest APIs --- src/raghilda/__init__.py | 3 +- src/raghilda/_store.py | 109 ++- src/raghilda/crawl.py | 1705 ++++++++++++++++++++++++++++++++++++ src/raghilda/store.py | 3 +- tests/test_api_contract.py | 86 +- tests/test_crawl.py | 1590 +++++++++++++++++++++++++++++++++ tests/test_store_ingest.py | 444 ++++++++++ 7 files changed, 3931 insertions(+), 9 deletions(-) create mode 100644 src/raghilda/crawl.py create mode 100644 tests/test_crawl.py create mode 100644 tests/test_store_ingest.py diff --git a/src/raghilda/__init__.py b/src/raghilda/__init__.py index b123fc6..768b9e8 100644 --- a/src/raghilda/__init__.py +++ b/src/raghilda/__init__.py @@ -1,6 +1,7 @@ -from . import embedding, store, types, chunk, chunker, document, read, scrape +from . import crawl, embedding, store, types, chunk, chunker, document, read, scrape __all__ = [ + "crawl", "embedding", "store", "types", diff --git a/src/raghilda/_store.py b/src/raghilda/_store.py index e6fdd44..f7b7290 100644 --- a/src/raghilda/_store.py +++ b/src/raghilda/_store.py @@ -1,13 +1,16 @@ from __future__ import annotations from abc import ABC, abstractmethod +from concurrent.futures import FIRST_COMPLETED, CancelledError, ThreadPoolExecutor, wait from dataclasses import dataclass -from typing import Generic, Literal, Sequence, TypeVar +import threading +from typing import Any, Callable, Generic, Iterable, Literal, Sequence, TypeVar from .chunk import RetrievedChunk from .document import Document TDocument = TypeVar("TDocument", bound=Document, covariant=True) +_RECENT_INGEST_ORIGIN_WINDOW = 10_000 @dataclass(frozen=True) @@ -17,6 +20,13 @@ class WriteResult(Generic[TDocument]): replaced_document: TDocument | None = None +@dataclass(frozen=True) +class IngestSummary: + inserted: int + replaced: int + skipped: int + + class 
def ingest(
    self,
    documents: Iterable[Any],
    *,
    prepare: Callable[[Any], Document] | None = None,
    max_workers: int = 1,
) -> IngestSummary:
    """Prepare and upsert a stream of documents.

    Inputs are consumed lazily and submitted incrementally, so at most
    ``max_workers`` items are in flight at any time. After ``prepare`` is
    applied, recent non-empty string origins are checked for duplicates as
    the stream is consumed. Duplicate detection is best effort (only the
    most recent ``_RECENT_INGEST_ORIGIN_WINDOW`` origins are remembered): a
    duplicate raises ``ValueError`` when encountered, after any writes
    already in flight complete. No rollback is attempted.

    Parameters
    ----------
    documents
        Iterable of items to ingest. Items are passed to ``prepare`` when it
        is given, otherwise they are upserted as they are.
    prepare
        Optional callable turning one input item into a document. It runs on
        the worker threads.
    max_workers
        Size of the thread pool; must be at least 1.

    Returns
    -------
    IngestSummary
        Counts of inserted, replaced and skipped documents.

    Raises
    ------
    ValueError
        If a duplicate origin is seen inside the tracking window, or if
        ``upsert`` reports an unknown action.
    """
    assert max_workers >= 1
    stop_event = threading.Event()
    recent_origins: dict[str, None] = {}
    recent_origins_lock = threading.Lock()

    def remember_origin(origin: str | None) -> None:
        if not isinstance(origin, str) or not origin:
            return
        with recent_origins_lock:
            if origin in recent_origins:
                raise ValueError(f"Duplicate origin during ingest: {origin}")
            recent_origins[origin] = None
            if len(recent_origins) > _RECENT_INGEST_ORIGIN_WINDOW:
                # dict preserves insertion order, so the first key is the oldest.
                recent_origins.pop(next(iter(recent_origins)))

    def process_document(item: Any) -> WriteResult[Document]:
        # The stop flag is re-checked between stages so that workers which
        # were already running when a failure happened bail out before
        # writing.
        if stop_event.is_set():
            raise CancelledError()
        document = prepare(item) if prepare is not None else item
        if stop_event.is_set():
            raise CancelledError()
        remember_origin(document.origin)
        if stop_event.is_set():
            raise CancelledError()
        return self.upsert(document)

    iterator = iter(documents)
    pending: set[Any] = set()
    inserted = 0
    replaced = 0
    skipped = 0
    exhausted = False
    executor = ThreadPoolExecutor(max_workers=max_workers)

    def refill() -> None:
        """Top the in-flight set back up to ``max_workers`` submissions."""
        nonlocal exhausted
        while not exhausted and len(pending) < max_workers:
            try:
                item = next(iterator)
            except StopIteration:
                exhausted = True
            else:
                pending.add(executor.submit(process_document, item))

    completed = False
    try:
        refill()
        while pending:
            done, pending = wait(pending, return_when=FIRST_COMPLETED)
            results = []
            for future in done:
                try:
                    results.append(future.result())
                except CancelledError:
                    continue
            for result in results:
                if result.action == "inserted":
                    inserted += 1
                elif result.action == "replaced":
                    replaced += 1
                elif result.action == "skipped":
                    skipped += 1
                else:
                    raise ValueError(f"Unknown write action: {result.action}")
            refill()
        completed = True
    finally:
        # Runs for every exit path, including KeyboardInterrupt and other
        # BaseExceptions, so the pool is never left running unattended.
        if not completed:
            stop_event.set()
            for future in pending:
                future.cancel()
        executor.shutdown(wait=True, cancel_futures=not completed)

    return IngestSummary(
        inserted=inserted,
        replaced=replaced,
        skipped=skipped,
    )
abstractmethod +from collections import deque +from concurrent.futures import ThreadPoolExecutor +from contextlib import contextmanager +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +import hashlib +import json +import mimetypes +from pathlib import Path +import re +import shutil +import tempfile +import time +from typing import Any, Callable, Iterable, Iterator, Mapping, Sequence, TypeVar +import threading +import unicodedata +from urllib.parse import unquote, urlparse + +import requests + +from .document import MarkdownDocument +from .read import _convert_to_markdown +from .scrape import _canonicalize, _extract_links + +try: + from magika import Magika +except ImportError: # pragma: no cover - optional at runtime + Magika = None + +__all__ = [ + "BaseCrawler", + "CrawlScope", + "FetchedSource", + "WebCrawler", + "DirectoryCrawler", + "CloudflareCrawler", +] + +_TYPE_ALIASES = { + ".htm": "html", + ".html": "html", + ".ipynb": "jupyter-notebook", + ".markdown": "markdown", + ".md": "markdown", + ".pdf": "pdf", + ".py": "python", + ".rst": "rst", + ".txt": "text", +} +_CONTENT_TYPE_LABELS = { + "application/json": "json", + "application/pdf": "pdf", + "application/xml": "xml", + "text/html": "html", + "text/markdown": "markdown", + "text/plain": "text", + "text/x-python": "python", + "text/xml": "xml", +} +_MAGIKA_LABELS = { + "html": "html", + "ipynb": "jupyter-notebook", + "markdown": "markdown", + "pdf": "pdf", + "python": "python", + "rst": "rst", + "txt": "text", +} +_TERMINAL_CLOUDFLARE_STATUSES = { + "cancelled_by_user", + "cancelled_due_to_limits", + "cancelled_due_to_timeout", + "completed", + "errored", +} +_MAGIKA = Magika() if Magika is not None else None +_DEFAULT_CRAWL_DEPTH = 100_000 + +RootInput = str | Path +RootsInput = RootInput | Sequence[RootInput] +CacheValue = tuple[Path | None, dict[str, Any] | None] +CacheEntry = tuple[str, Path | None, dict[str, Any] | None] +TInput = TypeVar("TInput") +TOutput = 
@dataclass(frozen=True)
class CrawlScope:
    """Immutable description of what a crawler should visit.

    ``depth`` and ``limit`` are optional; when given they must be
    non-negative, which is checked at construction time.
    """

    # One root or a sequence of roots (strings or paths).
    roots: RootsInput
    # Optional filters; ``None`` means "not specified".
    # NOTE(review): the exact matching rules live in the crawler that
    # consumes the scope, not in this class.
    include_patterns: Sequence[str] | None = None
    exclude_patterns: Sequence[str] | None = None
    # Optional non-negative bounds.
    depth: int | None = None
    limit: int | None = None
    include_types: Sequence[str] | None = None
    exclude_types: Sequence[str] | None = None
    include_external_links: bool = False
    include_subdomains: bool = False

    def __post_init__(self) -> None:
        """Reject negative ``depth`` / ``limit`` values."""
        assert self.depth is None or self.depth >= 0
        assert self.limit is None or self.limit >= 0
def upsert(
    self,
    key: str,
    *,
    content: bytes | str | Path | None,
    metadata: Mapping[str, Any] | None,
    content_ext: str | None,
) -> CacheValue | None:
    """
    Create or replace one cache entry.

    Semantics:
    - content=None means no content file for this entry
    - metadata=None means no user metadata for this entry
    - the metadata sidecar is always written, unless both are None
    - (content=None, metadata=None) deletes the entry and returns None

    Returns ``(content_path, metadata)`` for the stored entry, where
    ``content_path`` is None when no content was given and ``metadata`` is
    a shallow copy of the caller's mapping. Returns None when the cache has
    no root directory or when the entry was deleted.
    """
    # A cache without a root directory is disabled: nothing is stored.
    if self.root is None:
        return None

    if content is None and metadata is None:
        self.delete(key)
        return None

    # All file names for this key are derived before taking the lock; only
    # the filesystem mutations below need to be serialized.
    base = self._base_for_key(key)
    metadata_path = self.root / f"{base}{self._METADATA_SUFFIX}"
    # Copy so later mutation of the caller's mapping cannot change what we
    # return or serialize.
    stored_metadata = dict(metadata) if metadata is not None else None
    new_content_path: Path | None = None
    new_content_name: str | None = None
    if content is not None:
        ext = self._choose_content_ext(
            content=content,
            content_ext=content_ext,
        )
        new_content_path = self.root / f"{base}{ext}"
        new_content_name = new_content_path.name
    # The sidecar stores the content file *name* (relative to the root),
    # not an absolute path.
    record = {
        "key": key,
        "content_path": new_content_name,
        "metadata": stored_metadata,
    }
    # Files that must survive the stale-file sweep at the end.
    keep = {metadata_path.name}
    if new_content_name is not None:
        keep.add(new_content_name)

    with self._locked_base(base):
        # An unreadable sidecar means the existing entry cannot be trusted;
        # drop everything for this base before writing the new entry.
        if metadata_path.exists() and self._read_record(metadata_path) is None:
            self._delete_base_files_locked(base)

        # Content is written before the sidecar, so the sidecar never names
        # a content file that has not been written yet.
        if content is not None:
            assert new_content_path is not None
            self._write_content(new_content_path, content)

        self._write_json(metadata_path, record)
        # Remove leftovers from a previous version of the entry, e.g. a
        # content file that used a different extension.
        self._delete_extra_base_files_locked(base, keep=keep)

    return new_content_path, stored_metadata
+ """ + if self.root is None: + return + + for metadata_path in sorted(self.root.glob(f"*{self._METADATA_SUFFIX}")): + record = self._read_record(metadata_path) + if record is None: + self._cleanup_broken_metadata_path(metadata_path) + continue + + content_path: Path | None = None + content_name = record["content_path"] + if content_name is not None: + candidate = self.root / content_name + if candidate.exists(): + content_path = candidate + + yield record["key"], content_path, record["metadata"] + + def _metadata_path_for_key(self, key: str) -> Path: + """Return the deterministic metadata path for one logical key.""" + assert self.root is not None + return self.root / f"{self._base_for_key(key)}{self._METADATA_SUFFIX}" + + def _base_for_key(self, key: str) -> str: + """Build the shared basename for the metadata file and content file.""" + return f"{self._sanitize_stem(key)}--{self._hash_fragment(key)}" + + def _hash_fragment(self, key: str) -> str: + """Return a stable hash fragment of the original key.""" + return hashlib.sha256(key.encode("utf-8")).hexdigest()[: self._HASH_LEN] + + def _sanitize_stem(self, key: str) -> str: + """Make the key visible in the filename, but safe enough for Windows.""" + value = unicodedata.normalize("NFC", key) + value = value.replace("://", "__") + value = value.replace("\\", "_") + value = value.replace("/", "_") + value = re.sub(r'[\x00-\x1f<>:"|?*]+', "_", value) + value = re.sub(r"\s+", "_", value) + value = re.sub(r"[^A-Za-z0-9._-]+", "_", value) + value = value.strip(" ._") + + if not value: + value = "entry" + + root = value.split(".", 1)[0].rstrip(" .").upper() + if root in self._WINDOWS_RESERVED: + value = f"_{value}" + + if len(value) > self._MAX_STEM_LEN: + head = self._MAX_STEM_LEN // 2 - 2 + tail = self._MAX_STEM_LEN - head - 2 + value = f"{value[:head]}..{value[-tail:]}" + + value = value.rstrip(" .") + return value or "entry" + + def _choose_content_ext( + self, + *, + content: bytes | str | Path, + content_ext: str 
def _infer_ext_with_magika(self, content: bytes | str | Path) -> str | None:
    """Guess a file extension for ``content`` with Magika, if it is installed.

    Returns None when Magika is unavailable, when a path does not exist, or
    when the detection result carries no extensions.
    """
    detector = _MAGIKA
    if detector is None:
        return None

    if isinstance(content, Path):
        if not content.exists():
            return None
        detection = detector.identify_path(content)
    else:
        # Text is identified through its UTF-8 encoding; bytes go in as-is.
        payload = content.encode("utf-8") if isinstance(content, str) else content
        detection = detector.identify_bytes(payload)

    candidates = getattr(detection.output, "extensions", None)
    if not candidates:
        return None
    # Only the first listed extension is used.
    return self._normalize_ext(candidates[0])
+ ext + + parts = [part for part in ext.split(".") if part] + if not parts: + return None + + cleaned: list[str] = [] + for part in parts: + token = re.sub(r"[^A-Za-z0-9_-]+", "", part) + if token: + cleaned.append(token.lower()) + + if not cleaned: + return None + + return "".join(f".{part}" for part in cleaned) + + def _read_record(self, path: Path) -> dict[str, Any] | None: + """Read and validate one metadata JSON file.""" + try: + with path.open("r", encoding="utf-8") as handle: + obj = json.load(handle) + except (OSError, json.JSONDecodeError): + return None + + if not isinstance(obj, dict): + return None + + key = obj.get("key") + content_path = obj.get("content_path") + metadata = obj.get("metadata") + + if not isinstance(key, str): + return None + if content_path is not None and not isinstance(content_path, str): + return None + if metadata is not None and not isinstance(metadata, dict): + return None + + return { + "key": key, + "content_path": content_path, + "metadata": metadata, + } + + def _cleanup_broken_metadata_path(self, metadata_path: Path) -> None: + """Best-effort cleanup for a broken metadata file.""" + if self.root is None: + return + if not metadata_path.name.endswith(self._METADATA_SUFFIX): + return + + base = metadata_path.name[: -len(self._METADATA_SUFFIX)] + with self._locked_base(base): + if not metadata_path.exists(): + return + if self._read_record(metadata_path) is not None: + return + + self._delete_base_files_locked(base) + + @contextmanager + def _locked_base(self, base: str) -> Iterator[None]: + state = self._acquire_entry_lock_state(base) + state.lock.acquire() + try: + yield + finally: + self._release_entry_lock_state(base, state) + + def _acquire_entry_lock_state(self, base: str) -> _EntryLockState: + with self._entry_locks_guard: + state = self._entry_locks.get(base) + if state is None: + state = _EntryLockState(lock=threading.RLock()) + self._entry_locks[base] = state + state.users += 1 + return state + + def 
_release_entry_lock_state(self, base: str, state: _EntryLockState) -> None: + state.lock.release() + with self._entry_locks_guard: + current = self._entry_locks.get(base) + assert current is state + state.users -= 1 + if state.users == 0: + del self._entry_locks[base] + + def _delete_base_files_locked(self, base: str) -> int: + """Delete all files belonging to one logical base.""" + assert self.root is not None + + deleted = 0 + for path in self.root.glob(f"{base}.*"): + if not path.is_file(): + continue + try: + path.unlink() + deleted += 1 + except FileNotFoundError: + pass + return deleted + + def _delete_extra_base_files_locked(self, base: str, *, keep: set[str]) -> None: + """Delete stale files for one base, keeping the current pair.""" + assert self.root is not None + + for path in self.root.glob(f"{base}.*"): + if not path.is_file(): + continue + if path.name in keep: + continue + try: + path.unlink() + except FileNotFoundError: + pass + + def _write_content(self, content_path: Path, content: bytes | str | Path) -> None: + """Write content directly to its destination path.""" + if isinstance(content, bytes): + with content_path.open("wb") as handle: + handle.write(content) + return + + if isinstance(content, str): + with content_path.open("w", encoding="utf-8") as handle: + handle.write(content) + return + + if isinstance(content, Path): + if content == content_path: + return + shutil.copyfile(content, content_path) + return + + raise TypeError(f"Unsupported content type: {type(content)!r}") + + def _write_json(self, path: Path, obj: Mapping[str, Any]) -> None: + """Write metadata JSON directly to its destination path.""" + text = json.dumps(obj, indent=2, sort_keys=True, ensure_ascii=False) + "\n" + with path.open("w", encoding="utf-8") as handle: + handle.write(text) + + +class _DirectoryCrawlerCache(_FilesystemCrawlerCache): + pass + + +class _WebCrawlerCache(_FilesystemCrawlerCache): + pass + + +class _CloudflareCrawlerCache(_FilesystemCrawlerCache): + 
pass + + +def _map_ordered( + items: Iterable[TInput], + *, + max_workers: int, + fn: Callable[[TInput], TOutput], +) -> Iterator[TOutput]: + assert max_workers >= 1 + iterator = iter(items) + if max_workers == 1: + for item in iterator: + yield fn(item) + return + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + pending: deque[Any] = deque() + while len(pending) < max_workers: + try: + item = next(iterator) + except StopIteration: + break + pending.append(executor.submit(fn, item)) + + while pending: + future = pending.popleft() + yield future.result() + try: + item = next(iterator) + except StopIteration: + continue + pending.append(executor.submit(fn, item)) + + +class BaseCrawler(ABC): + max_workers: int + + @abstractmethod + def origins( + self, + scope: CrawlScope, + *, + progress: bool = True, + cache_force_refresh: bool = False, + ) -> Iterator[str]: + pass + + @abstractmethod + def fetch_raw( + self, + origin: str, + *, + cache_force_refresh: bool = False, + ) -> FetchedSource: + pass + + def fetch_markdown( + self, + origin: str, + *, + convert: Callable[[FetchedSource], MarkdownDocument] | None = None, + cache_force_refresh: bool = False, + ) -> MarkdownDocument: + source = self.fetch_raw(origin, cache_force_refresh=cache_force_refresh) + converter = convert or self._default_convert + return converter(source) + + def markdown_documents( + self, + scope: CrawlScope, + *, + convert: Callable[[FetchedSource], MarkdownDocument] | None = None, + progress: bool = True, + cache_force_refresh: bool = False, + ) -> Iterator[MarkdownDocument]: + origins = self.origins( + scope, + progress=progress, + cache_force_refresh=cache_force_refresh, + ) + yield from _map_ordered( + origins, + max_workers=self.max_workers, + fn=lambda origin: self.fetch_markdown( + origin, + convert=convert, + cache_force_refresh=False, + ), + ) + + def _default_convert(self, source: FetchedSource) -> MarkdownDocument: + raise NotImplementedError + + +class 
def fetch_raw(
    self,
    origin: str,
    *,
    cache_force_refresh: bool = False,
) -> FetchedSource:
    """Describe one local file and attach cached markdown when still valid.

    The file itself is the body (``body_path``); nothing is copied. When a
    cache directory is configured and ``cache_force_refresh`` is false, a
    cached markdown rendering is reused only if both the stored content
    hash and the stored ``mtime_ns`` match the file as it is now.

    Raises ``AssertionError`` when ``origin`` does not point at an existing
    file.
    """
    path = _path_from_file_origin(origin).resolve()
    assert path.is_file(), f"File origin must exist: {origin}"
    canonical_origin = path.as_uri()
    content_type = mimetypes.guess_type(path.name)[0]
    type_label = _detect_type_label(path=path, content_type=content_type)
    # One stat snapshot feeds the cache check and every reported field, so
    # mtime_ns, size and fetched_at always describe the same file state.
    stat_result = path.stat()
    source_hash = _sha256_path(path)

    markdown_path: Path | None = None
    if self.cache_dir is not None and not cache_force_refresh:
        cached_entry = self._cache.fetch(canonical_origin)
        if cached_entry is not None:
            cached_markdown_path, cached_meta = cached_entry
            if (
                cached_markdown_path is not None
                and cached_meta is not None
                and cached_meta.get("source_hash") == source_hash
                and cached_meta.get("mtime_ns") == stat_result.st_mtime_ns
            ):
                markdown_path = cached_markdown_path

    return FetchedSource(
        origin=canonical_origin,
        resolved_origin=canonical_origin,
        content_type=content_type,
        status_code=None,
        metadata={
            "mtime_ns": stat_result.st_mtime_ns,
            "size": stat_result.st_size,
            "source_hash": source_hash,
            "type_label": type_label,
        },
        # For local files the modification time stands in for the fetch time.
        fetched_at=datetime.fromtimestamp(stat_result.st_mtime, tz=timezone.utc),
        body_path=path,
        markdown_path=markdown_path,
    )
def origins(
    self,
    scope: CrawlScope,
    *,
    progress: bool = True,
    cache_force_refresh: bool = False,
) -> Iterator[str]:
    """Yield in-scope URLs found by a breadth-first crawl from the roots.

    Pages are fetched level by level (using ``max_workers`` threads per
    level). A fetched page is yielded when it matches the scope's pattern
    and type filters; links are followed until the scope's depth is
    reached, and the crawl stops once ``limit`` URLs have been yielded.
    ``progress`` is accepted for interface compatibility and ignored.
    """
    del progress
    resolved_scope = _resolve_crawl_scope(scope)
    if resolved_scope.limit == 0:
        return
    visited: set[str] = set()
    yielded = 0
    # Each frontier entry pairs a URL with the host it is judged against.
    frontier: list[tuple[str, str]] = []

    for root in resolved_scope.roots:
        canonical_root = _canonicalize(str(root))
        assert canonical_root is not None
        parsed = urlparse(canonical_root)
        assert parsed.scheme in {"http", "https"}
        root_host = parsed.hostname or ""
        frontier.append((canonical_root, root_host))

    current_depth = 0
    while frontier:
        batch: list[tuple[str, str]] = []
        for origin, root_host in frontier:
            if origin in visited:
                continue
            # Scope is checked BEFORE the URL is marked visited. The same
            # URL can be queued under several root hosts; rejecting it for
            # one host must not hide it from a host that allows it.
            if not self._allow_origin(
                origin,
                root_host,
                include_external_links=resolved_scope.include_external_links,
                include_subdomains=resolved_scope.include_subdomains,
            ):
                continue
            visited.add(origin)
            batch.append((origin, root_host))

        next_frontier: list[tuple[str, str]] = []
        fetched_sources = _map_ordered(
            batch,
            max_workers=self.max_workers,
            fn=lambda item: (
                item,
                self.fetch_raw(
                    item[0],
                    cache_force_refresh=cache_force_refresh,
                ),
            ),
        )
        for (origin, root_host), source in fetched_sources:
            type_label = (source.metadata or {}).get("type_label")
            matches_patterns = _matches_patterns(
                origin,
                include_patterns=resolved_scope.include_patterns,
                exclude_patterns=resolved_scope.exclude_patterns,
            )
            matches_types = _matches_types(
                type_label,
                include_types=resolved_scope.include_types,
                exclude_types=resolved_scope.exclude_types,
            )
            if matches_patterns and matches_types:
                yield origin
                yielded += 1
                if (
                    resolved_scope.limit is not None
                    and yielded >= resolved_scope.limit
                ):
                    return
            # Pages that fail the filters are still scanned for links.
            if current_depth >= resolved_scope.depth:
                continue

            text = _read_text(source.body_path)
            resolved_origin = source.resolved_origin or origin
            # NOTE(review): links are judged against the host of the final
            # (post-redirect) URL, so a redirect to another host moves the
            # crawl scope to that host. Confirm this is intended.
            resolved_host = urlparse(resolved_origin).hostname or root_host
            for link in sorted(_extract_links(text)):
                canonical = _canonicalize(link, base=resolved_origin)
                if canonical is None:
                    continue
                parsed = urlparse(canonical)
                if parsed.scheme not in {"http", "https"}:
                    continue
                next_frontier.append((canonical, resolved_host))
        frontier = next_frontier
        current_depth += 1
origin: str, + *, + cache_force_refresh: bool = False, + ) -> FetchedSource: + canonical_origin = _canonicalize(origin) + assert canonical_origin is not None + parsed = urlparse(canonical_origin) + assert parsed.scheme in {"http", "https"} + + cached_entry = self._cache.fetch(canonical_origin) + body_path: Path | None = None + cached_meta: dict[str, Any] | None = None + if cached_entry is not None: + body_path, cached_meta = cached_entry + has_cache = body_path is not None and cached_meta is not None + now = _utcnow() + + if has_cache and not cache_force_refresh: + assert cached_meta is not None + assert body_path is not None + if self._is_fresh(cached_meta, now): + return self._source_from_meta(cached_meta, body_path=body_path) + + headers: dict[str, str] = {} + if has_cache and not cache_force_refresh: + assert cached_meta is not None + etag = cached_meta.get("etag") + last_modified = cached_meta.get("last_modified") + if etag: + headers["If-None-Match"] = etag + if last_modified: + headers["If-Modified-Since"] = last_modified + + response = self.session.get(canonical_origin, headers=headers, timeout=30.0) + if response.status_code == 304 and has_cache: + assert cached_meta is not None + assert body_path is not None + cached_meta["revalidated_at"] = now.isoformat() + cached_entry = self._cache.upsert( + canonical_origin, + content=body_path, + metadata=cached_meta, + content_ext=None, + ) + assert cached_entry is not None + body_path, cached_meta = cached_entry + assert body_path is not None + assert cached_meta is not None + return self._source_from_meta(cached_meta, body_path=body_path) + + response.raise_for_status() + content_type = response.headers.get("Content-Type") + resolved_origin = _canonicalize(response.url) or response.url + type_label = _detect_type_label( + path=_type_hint_path(canonical_origin, content_type=content_type), + content_type=content_type, + ) + meta = { + "origin": canonical_origin, + "resolved_origin": resolved_origin, + 
"content_type": content_type, + "status_code": response.status_code, + "etag": response.headers.get("ETag"), + "last_modified": response.headers.get("Last-Modified"), + "type_label": type_label, + "fetched_at": now.isoformat(), + "revalidated_at": None, + } + cached_entry = self._cache.upsert( + canonical_origin, + content=response.content, + metadata=meta, + content_ext=_known_body_suffix( + canonical_origin, + content_type=content_type, + ), + ) + assert cached_entry is not None + body_path, meta = cached_entry + assert body_path is not None + assert meta is not None + return self._source_from_meta(meta, body_path=body_path) + + def _default_convert(self, source: FetchedSource) -> MarkdownDocument: + type_label = (source.metadata or {}).get("type_label") + if type_label == "markdown": + markdown = _read_text(source.body_path) + else: + path_for_conversion = source.body_path + if source.body_path.suffix == "": + suffix = _body_suffix( + source.origin, + content_type=source.content_type, + ) + with tempfile.NamedTemporaryFile( + prefix="raghilda-convert-", + suffix=suffix, + delete=False, + ) as temporary_file: + temporary_path = Path(temporary_file.name) + temporary_file.write(source.body_path.read_bytes()) + try: + path_for_conversion = temporary_path + markdown = _convert_to_markdown(str(path_for_conversion)) + finally: + temporary_path.unlink(missing_ok=True) + else: + markdown = _convert_to_markdown(str(path_for_conversion)) + return MarkdownDocument(origin=source.origin, content=markdown) + + def _source_from_meta( + self, + meta: dict[str, Any], + *, + body_path: Path, + ) -> FetchedSource: + return FetchedSource( + origin=meta["origin"], + resolved_origin=meta.get("resolved_origin"), + content_type=meta.get("content_type"), + status_code=meta.get("status_code"), + metadata={ + "etag": meta.get("etag"), + "last_modified": meta.get("last_modified"), + "type_label": meta.get("type_label"), + }, + fetched_at=_parse_datetime(meta.get("fetched_at")), + 
class CloudflareCrawler(BaseCrawler):
    """Crawler that delegates crawling to Cloudflare's browser-rendering API.

    Crawl jobs are submitted to ``<base_url>/accounts/<account_id>/
    browser-rendering/crawl``, polled until they finish, and their completed
    records (one markdown body per URL) are cached both in memory on the
    instance and on disk through ``_CloudflareCrawlerCache``.
    """

    def __init__(
        self,
        *,
        account_id: str,
        api_token: str,
        cache_dir: bool | str | Path | None = None,
        session: requests.Session | Any | None = None,
        source: str = "all",
        render: bool = True,
        cache_stale_after: timedelta | None = None,
        modified_since: int | None = None,
        poll_interval: float = 5.0,
        max_poll_attempts: int = 60,
        max_workers: int = 1,
        base_url: str = "https://api.cloudflare.com/client/v4",
    ) -> None:
        """Store backend and cache configuration.

        Args:
            account_id: Account id interpolated into the crawl endpoint URL.
            api_token: Token sent as ``Authorization: Bearer <token>``.
            cache_dir: ``None`` creates a fresh temporary directory, ``True``
                selects the default location chosen by ``_resolve_cache_dir``,
                anything else is used as a filesystem path.
            session: HTTP session exposing ``get``/``post``; a new
                ``requests.Session`` is created when omitted.
            source: Forwarded as ``"source"`` in the crawl payload.
            render: Forwarded as ``"render"`` in the crawl payload.
            cache_stale_after: Maximum age of cached entries; ``None`` means
                cached entries never expire. Also forwarded as ``"maxAge"``.
            modified_since: Forwarded as ``"modifiedSince"`` when set.
            poll_interval: Seconds slept between job status polls.
            max_poll_attempts: Number of status polls before giving up.
            max_workers: Must be at least 1; stored on the instance.
            base_url: API base URL; trailing slashes are stripped.
        """
        assert max_workers >= 1
        self.account_id = account_id
        self.api_token = api_token
        self.cache_dir = _resolve_cache_dir(
            cache_dir,
            backend_name="cloudflare",
            default_factory=lambda: Path(
                tempfile.mkdtemp(prefix="raghilda-cloudflare-cache-")
            ),
        )
        # Test for None explicitly: a supplied session object that happens to
        # be falsy must not be silently swapped for a real requests.Session.
        self.session = session if session is not None else requests.Session()
        self.source = source
        self.render = render
        self.cache_stale_after = cache_stale_after
        self.modified_since = modified_since
        self.poll_interval = poll_interval
        self.max_poll_attempts = max_poll_attempts
        self.max_workers = max_workers
        self.base_url = base_url.rstrip("/")
        self._records: dict[str, _CloudflareRecordCacheEntry] = {}
        self._roots: dict[tuple[Any, ...], _CloudflareRootCacheEntry] = {}
        self._cache = _CloudflareCrawlerCache(self.cache_dir)

    def origins(
        self,
        scope: CrawlScope,
        *,
        progress: bool = True,
        cache_force_refresh: bool = False,
    ) -> Iterator[str]:
        """Yield the URLs of completed crawl records for every root in ``scope``.

        Roots are crawled one after another; ``scope.limit`` caps the total
        number of yielded origins across all roots. ``progress`` is accepted
        for interface compatibility and ignored.
        """
        del progress
        resolved_scope = _resolve_crawl_scope(scope)
        limit = resolved_scope.limit

        # Every record is delivered as markdown, so the type filter gives the
        # same verdict for all of them; evaluate it once instead of per record.
        markdown_label = _detect_type_label(path=None, content_type="text/markdown")
        markdown_allowed = _matches_types(
            markdown_label,
            include_types=resolved_scope.include_types,
            exclude_types=resolved_scope.exclude_types,
        )

        yielded = 0
        for root in resolved_scope.roots:
            if limit is not None and yielded >= limit:
                return
            canonical_root = _canonicalize(str(root))
            assert canonical_root is not None
            records = self._crawl_root(
                canonical_root,
                cache_force_refresh=cache_force_refresh,
                depth=resolved_scope.depth,
                include_patterns=resolved_scope.include_patterns,
                exclude_patterns=resolved_scope.exclude_patterns,
                include_external_links=resolved_scope.include_external_links,
                include_subdomains=resolved_scope.include_subdomains,
                limit=None if limit is None else limit - yielded,
            )
            # The crawl still runs when markdown is filtered out, so the
            # caches end up populated exactly as they did before.
            if not markdown_allowed:
                continue
            for record in records:
                yield record["url"]
                yielded += 1
                if limit is not None and yielded >= limit:
                    return

    def fetch_raw(
        self,
        origin: str,
        *,
        cache_force_refresh: bool = False,
    ) -> FetchedSource:
        """Return the markdown record for a single origin.

        Lookup order: fresh in-memory record, then on-disk record, then a
        depth-0 / limit-1 crawl of the origin itself. ``cache_force_refresh``
        skips both caches.

        Raises:
            ValueError: If the crawl returns no record whose URL equals the
                canonical origin.
        """
        canonical_origin = _canonicalize(origin)
        assert canonical_origin is not None

        record_entry = None
        if not cache_force_refresh:
            record_entry = self._records.get(canonical_origin)
            if record_entry is not None and not self._cloudflare_cache_is_fresh(
                record_entry.fetched_at
            ):
                record_entry = None
            if record_entry is None:
                record_entry = self._load_record_cache_entry(canonical_origin)
                if record_entry is not None:
                    self._records[canonical_origin] = record_entry

        if record_entry is None:
            records = self._crawl_root(
                canonical_origin,
                cache_force_refresh=cache_force_refresh,
                depth=0,
                limit=1,
                apply_patterns=False,
                include_external_links=False,
                include_subdomains=False,
            )
            record = next(
                (item for item in records if item["url"] == canonical_origin),
                None,
            )
            if record is None:
                raise ValueError(f"Cloudflare crawl did not return record for {origin}")
            # _crawl_root registers every completed record under its URL.
            record_entry = self._records[canonical_origin]
        else:
            record = record_entry.record

        content_path, _ = self._store_record_cache_entry(
            canonical_origin,
            record=record,
            fetched_at=record_entry.fetched_at,
        )
        assert content_path is not None

        # `or {}` also covers an explicit `"metadata": null`, which the plain
        # `.get("metadata", {})` default would pass through as None.
        record_meta = record.get("metadata") or {}
        return FetchedSource(
            origin=canonical_origin,
            resolved_origin=record_meta.get("url", canonical_origin),
            content_type="text/markdown",
            status_code=record_meta.get("status"),
            metadata={
                "crawler_status": record.get("status"),
                "title": record_meta.get("title"),
                "type_label": "markdown",
            },
            fetched_at=record_entry.fetched_at,
            body_path=content_path,
            markdown_path=content_path,
        )

    def _default_convert(self, source: FetchedSource) -> MarkdownDocument:
        """Read the already-markdown body file as UTF-8 and wrap it."""
        markdown = source.body_path.read_text(encoding="utf-8")
        return MarkdownDocument(origin=source.origin, content=markdown)

    def _crawl_root(
        self,
        root: str,
        *,
        cache_force_refresh: bool,
        depth: int | None = None,
        include_patterns: Sequence[str] | None = None,
        exclude_patterns: Sequence[str] | None = None,
        include_external_links: bool,
        include_subdomains: bool,
        limit: int | None = None,
        apply_patterns: bool = True,
    ) -> list[dict[str, Any]]:
        """Run (or reuse) one crawl job and return its completed records.

        Results are memoised per argument combination in ``self._roots`` and
        reused while fresh unless ``cache_force_refresh`` is set. Each
        completed record is also stored in ``self._records`` and on disk.

        Raises:
            ValueError: If the job reports a status that is neither
                ``"running"`` nor terminal, or a terminal status other than
                ``"completed"``.
            TimeoutError: If the job is still running after
                ``max_poll_attempts`` polls.

        HTTP failures surface as whatever the session's
        ``raise_for_status()`` raises.
        """
        resolved_depth = _DEFAULT_CRAWL_DEPTH if depth is None else depth
        resolved_include_patterns = list(include_patterns or [])
        resolved_exclude_patterns = list(exclude_patterns or [])
        cache_key = (
            root,
            resolved_depth,
            limit,
            apply_patterns,
            tuple(resolved_include_patterns),
            tuple(resolved_exclude_patterns),
            include_external_links,
            include_subdomains,
        )
        cached_entry = self._roots.get(cache_key)
        if (
            not cache_force_refresh
            and cached_entry is not None
            and self._cloudflare_cache_is_fresh(cached_entry.fetched_at)
        ):
            return cached_entry.records

        endpoint = f"{self.base_url}/accounts/{self.account_id}/browser-rendering/crawl"
        auth_headers = {"Authorization": f"Bearer {self.api_token}"}

        # Submit the job.
        response = self.session.post(
            endpoint,
            json=self._crawl_payload(
                root,
                depth=resolved_depth,
                limit=limit,
                include_patterns=resolved_include_patterns,
                exclude_patterns=resolved_exclude_patterns,
                include_external_links=include_external_links,
                include_subdomains=include_subdomains,
                cache_force_refresh=cache_force_refresh,
                apply_patterns=apply_patterns,
            ),
            headers={**auth_headers, "Content-Type": "application/json"},
            timeout=30.0,
        )
        response.raise_for_status()
        job_id = response.json()["result"]
        job_url = f"{endpoint}/{job_id}"

        # Poll with limit=1 so status checks stay small until the job is done.
        for _ in range(self.max_poll_attempts):
            poll_response = self.session.get(
                job_url,
                headers=auth_headers,
                params={"limit": 1},
                timeout=30.0,
            )
            poll_response.raise_for_status()
            result = poll_response.json()["result"]
            assert result is not None
            status = result["status"]
            if status == "running":
                if self.poll_interval > 0:
                    time.sleep(self.poll_interval)
                continue
            if status not in _TERMINAL_CLOUDFLARE_STATUSES:
                raise ValueError(f"Unexpected Cloudflare crawl status: {status}")
            if status != "completed":
                raise ValueError(f"Cloudflare crawl ended with status '{status}'")
            break
        else:
            raise TimeoutError("Cloudflare crawl did not complete within the timeout")

        # Fetch the first full page, then follow cursors until exhausted.
        full_response = self.session.get(
            job_url,
            headers=auth_headers,
            params=None,
            timeout=30.0,
        )
        full_response.raise_for_status()
        result = full_response.json()["result"]
        assert result is not None

        records = list(result.get("records") or [])
        cursor = result.get("cursor")
        while cursor is not None:
            page_response = self.session.get(
                job_url,
                headers=auth_headers,
                params={"cursor": cursor, "status": "completed"},
                timeout=30.0,
            )
            page_response.raise_for_status()
            page_result = page_response.json()["result"]
            records.extend(page_result.get("records") or [])
            cursor = page_result.get("cursor")

        completed_records = [
            record for record in records if record.get("status") == "completed"
        ]
        if apply_patterns:
            # Patterns are also sent in the payload; filter again locally so
            # the returned records always honour them.
            completed_records = [
                record
                for record in completed_records
                if _matches_cloudflare_patterns(
                    record["url"],
                    include_patterns=resolved_include_patterns,
                    exclude_patterns=resolved_exclude_patterns,
                )
            ]

        fetched_at = _utcnow()
        self._roots[cache_key] = _CloudflareRootCacheEntry(
            fetched_at=fetched_at,
            records=completed_records,
        )
        for record in completed_records:
            self._records[record["url"]] = _CloudflareRecordCacheEntry(
                fetched_at=fetched_at,
                record=record,
            )
            self._store_record_cache_entry(
                record["url"],
                record=record,
                fetched_at=fetched_at,
            )
        return completed_records

    def _crawl_payload(
        self,
        root: str,
        *,
        depth: int,
        limit: int | None,
        include_patterns: Sequence[str],
        exclude_patterns: Sequence[str],
        include_external_links: bool,
        include_subdomains: bool,
        cache_force_refresh: bool,
        apply_patterns: bool,
    ) -> dict[str, Any]:
        """Assemble the JSON body for a crawl job submission.

        Optional keys (``limit``, include/exclude patterns, ``modifiedSince``,
        ``maxAge``) are only present when they carry a value. A forced refresh
        sends ``maxAge = 0``; otherwise ``cache_stale_after`` is sent in whole
        seconds when configured.
        """
        options: dict[str, Any] = {
            "includeExternalLinks": include_external_links,
            "includeSubdomains": include_subdomains,
        }
        if apply_patterns and include_patterns:
            options["includePatterns"] = list(include_patterns)
        if apply_patterns and exclude_patterns:
            options["excludePatterns"] = list(exclude_patterns)

        payload: dict[str, Any] = {
            "url": root,
            "depth": depth,
            "formats": ["markdown"],
            "render": self.render,
            "source": self.source,
            "options": options,
        }
        if limit is not None:
            payload["limit"] = limit
        if self.modified_since is not None:
            payload["modifiedSince"] = self.modified_since
        if cache_force_refresh:
            payload["maxAge"] = 0
        elif self.cache_stale_after is not None:
            payload["maxAge"] = int(self.cache_stale_after.total_seconds())
        return payload

    def _record_cache_signature(self) -> dict[str, Any]:
        """Return the settings an on-disk record must have been written with."""
        return {
            "account_id": self.account_id,
            "base_url": self.base_url,
            "render": self.render,
            "source": self.source,
            "modified_since": self.modified_since,
        }

    def _load_record_cache_entry(
        self,
        origin: str,
    ) -> _CloudflareRecordCacheEntry | None:
        """Load a record from the on-disk cache, or ``None`` on a miss.

        A miss is any of: no entry, no metadata, a signature that differs
        from the current configuration, or a missing or stale ``fetched_at``.
        """
        cached_entry = self._cache.fetch(origin)
        if cached_entry is None:
            return None
        _, cached_meta = cached_entry
        if cached_meta is None:
            return None
        if cached_meta.get("signature") != self._record_cache_signature():
            return None
        fetched_at = _parse_datetime(cached_meta.get("fetched_at"))
        if fetched_at is None or not self._cloudflare_cache_is_fresh(fetched_at):
            return None
        record = cached_meta["record"]
        # Sanity check: the stored record must belong to the requested origin.
        assert record["url"] == origin
        return _CloudflareRecordCacheEntry(
            fetched_at=fetched_at,
            record=record,
        )

    def _store_record_cache_entry(
        self,
        origin: str,
        *,
        record: dict[str, Any],
        fetched_at: datetime,
    ) -> CacheValue:
        """Write a record's markdown and metadata to the on-disk cache.

        Returns the value produced by the cache's ``upsert`` (callers unpack
        it as a content path plus a second element).
        """
        metadata = {
            "origin": origin,
            "fetched_at": fetched_at.isoformat(),
            "record": record,
            "signature": self._record_cache_signature(),
        }
        cached_entry = self._cache.upsert(
            origin,
            content=record["markdown"],
            metadata=metadata,
            content_ext=".md",
        )
        assert cached_entry is not None
        return cached_entry

    def _cloudflare_cache_is_fresh(self, fetched_at: datetime) -> bool:
        """Return whether ``fetched_at`` is within ``cache_stale_after`` of now."""
        if self.cache_stale_after is None:
            return True
        return _utcnow() - fetched_at <= self.cache_stale_after
def _coerce_roots(roots: RootsInput) -> list[RootInput]:
    """Wrap a single root (``str`` or ``Path``) in a list; copy any other iterable."""
    if isinstance(roots, (str, Path)):
        return [roots]
    return list(roots)


def _resolve_crawl_scope(scope: CrawlScope) -> _ResolvedCrawlScope:
    """Normalise a ``CrawlScope``: lists for roots/patterns, default depth, type sets."""
    depth = _DEFAULT_CRAWL_DEPTH if scope.depth is None else scope.depth
    return _ResolvedCrawlScope(
        roots=_coerce_roots(scope.roots),
        include_patterns=list(scope.include_patterns or []),
        exclude_patterns=list(scope.exclude_patterns or []),
        depth=depth,
        limit=scope.limit,
        include_types=_normalize_types(scope.include_types),
        exclude_types=_normalize_types(scope.exclude_types),
        include_external_links=scope.include_external_links,
        include_subdomains=scope.include_subdomains,
    )


def _resolve_cache_dir(
    cache_dir: bool | str | Path | None,
    *,
    backend_name: str,
    default_factory: Callable[[], Path | None],
) -> Path | None:
    """Resolve the user-facing ``cache_dir`` option to a concrete path.

    ``None`` defers to ``default_factory``; ``True`` selects
    ``<cwd>/.raghilda/cache/<backend_name>``; a string or path is used as-is.

    Raises:
        TypeError: If ``cache_dir`` is ``False``.
    """
    if cache_dir is None:
        return default_factory()
    if cache_dir is True:
        return Path.cwd() / ".raghilda" / "cache" / backend_name
    if cache_dir is False:
        raise TypeError("cache_dir must be None, True, or a filesystem path")
    return Path(cache_dir)


def _file_url_path_to_local(url_path: str) -> Path:
    """Convert the path component of a ``file:`` URL to a local ``Path``.

    ``url2pathname`` percent-decodes and, on Windows, also maps ``/C:/dir``
    to ``C:\\dir``; a bare ``unquote`` would leave the leading slash in place.
    """
    from urllib.request import url2pathname

    return Path(url2pathname(url_path))


def _to_directory_path(root: str | Path) -> Path:
    """Interpret a crawl root as a local directory path.

    ``Path`` objects are returned unchanged, ``file:`` URLs are decoded and
    any other string is treated as a plain path.

    Raises:
        AssertionError: If the string carries a real URL scheme other than
            ``file`` (for example ``https``).
    """
    if isinstance(root, Path):
        return root
    text = str(root)
    scheme = urlparse(text).scheme
    if scheme == "file":
        return _file_url_path_to_local(urlparse(text).path)
    # urlparse reports the drive letter of "C:\\docs" as a one-letter scheme;
    # that is a Windows path, not a URL, so it must not trip the assertion.
    assert scheme == "" or len(scheme) == 1
    return Path(text)


def _path_from_file_origin(origin: str) -> Path:
    """Map a ``file:`` origin (or a bare path string) to a local ``Path``."""
    parsed = urlparse(origin)
    if parsed.scheme != "file":
        return Path(origin)
    return _file_url_path_to_local(parsed.path)


def _normalize_types(types: Sequence[str] | None) -> set[str]:
    """Return the type labels stripped and lower-cased, as a set."""
    return {label.strip().lower() for label in types or []}


def _matches_patterns(
    origin: str,
    *,
    include_patterns: Sequence[str],
    exclude_patterns: Sequence[str],
) -> bool:
    """Apply regex include/exclude filters (``re.search``) to ``origin``.

    Exclusions win. With no include patterns everything not excluded matches.
    """
    if any(re.search(pattern, origin) for pattern in exclude_patterns):
        return False
    if not include_patterns:
        return True
    return any(re.search(pattern, origin) for pattern in include_patterns)


def _matches_cloudflare_patterns(
    origin: str,
    *,
    include_patterns: Sequence[str],
    exclude_patterns: Sequence[str],
) -> bool:
    """Apply wildcard include/exclude filters to ``origin``.

    Same precedence as ``_matches_patterns`` but each pattern is a ``*`` /
    ``**`` wildcard evaluated by ``_wildcard_matches``.
    """
    if any(_wildcard_matches(origin, pattern) for pattern in exclude_patterns):
        return False
    if not include_patterns:
        return True
    return any(_wildcard_matches(origin, pattern) for pattern in include_patterns)


def _wildcard_matches(origin: str, pattern: str) -> bool:
    """Match ``origin`` against a whole-string wildcard pattern.

    ``*`` matches any run of characters except ``/``; ``**`` matches any run
    including ``/``; a trailing-style ``/**`` additionally matches the bare
    prefix without the slash. Everything else is literal.
    """
    placeholder = "\0"
    # Order matters: "/**" and "**" must be consumed before the single "*",
    # and "**" is parked behind a placeholder so the "*" step cannot see it.
    substitutions = (
        (r"/\*\*", "(?:/.*)?"),
        (r"\*\*", placeholder),
        (r"\*", "[^/]*"),
        (placeholder, ".*"),
    )
    regex = re.escape(pattern)
    for escaped_token, replacement in substitutions:
        regex = regex.replace(escaped_token, replacement)
    return re.fullmatch(regex, origin) is not None


def _matches_types(
    label: str | None,
    *,
    include_types: set[str],
    exclude_types: set[str],
) -> bool:
    """Check a type label (case-insensitively) against include/exclude sets.

    An excluded label never matches. With an empty include set everything
    else matches, including ``None``; otherwise the label must be included.
    """
    normalized = None if label is None else label.lower()
    if normalized is not None and normalized in exclude_types:
        return False
    return not include_types or normalized in include_types


def _detect_type_label(
    *,
    path: Path | None,
    content_type: str | None,
) -> str | None:
    """Derive a type label from a path suffix, a content type, or file contents.

    Precedence: suffix alias (``_TYPE_ALIASES``), then normalised content
    type (``_CONTENT_TYPE_LABELS``), then content sniffing through
    ``_MAGIKA`` when it is available and the path exists. Returns ``None``
    when nothing applies.
    """
    if path is not None:
        alias = _TYPE_ALIASES.get(path.suffix.lower())
        if alias is not None:
            return alias

    normalized = _normalize_content_type(content_type)
    if normalized in _CONTENT_TYPE_LABELS:
        return _CONTENT_TYPE_LABELS[normalized]

    if path is None or _MAGIKA is None or not path.exists():
        return None
    detected = _MAGIKA.identify_path(path).output.label
    return _MAGIKA_LABELS.get(detected, detected)


def _normalize_content_type(content_type: str | None) -> str | None:
    """Drop parameters (``; charset=...``), trim and lower-case a content type."""
    if content_type is None:
        return None
    media_type, _, _ = content_type.partition(";")
    return media_type.strip().lower()


def _sha256_path(path: Path) -> str:
    """Return the hex SHA-256 digest of a file, read in 8 KiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        while chunk := handle.read(8192):
            digest.update(chunk)
    return digest.hexdigest()
def _read_text(path: Path) -> str:
    """Read a file as UTF-8, silently dropping bytes that fail to decode."""
    with path.open(encoding="utf-8", errors="ignore") as handle:
        return handle.read()


def _known_body_suffix(origin: str, *, content_type: str | None) -> str | None:
    """Pick a file suffix for a response body, or ``None`` if none is known.

    A suffix present in the URL path wins. Otherwise the normalised content
    type is looked up among HTML, markdown, JSON and PDF.
    """
    url_suffix = Path(urlparse(origin).path).suffix
    if url_suffix:
        return url_suffix
    suffix_by_content_type = {
        "text/html": ".html",
        "text/markdown": ".md",
        "application/json": ".json",
        "application/pdf": ".pdf",
    }
    return suffix_by_content_type.get(_normalize_content_type(content_type))


def _body_suffix(origin: str, *, content_type: str | None) -> str:
    """Like ``_known_body_suffix`` but falls back to ``".bin"``."""
    known = _known_body_suffix(origin, content_type=content_type)
    return ".bin" if known is None else known


def _type_hint_path(origin: str, *, content_type: str | None) -> Path:
    """Return a synthetic ``source<suffix>`` path used only as a type hint."""
    return Path("source").with_suffix(
        _body_suffix(origin, content_type=content_type)
    )


def _utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


def _parse_datetime(value: str | None) -> datetime | None:
    """Parse an ISO-8601 string with ``datetime.fromisoformat``; pass ``None`` through."""
    return None if value is None else datetime.fromisoformat(value)
as crawl_module from raghilda.chunk import MarkdownChunk +from raghilda.crawl import ( + BaseCrawler, + CrawlScope, + CloudflareCrawler, + DirectoryCrawler, + FetchedSource, + WebCrawler, +) from raghilda.document import Document, MarkdownDocument import raghilda.store as store_module -from raghilda.store import ChromaDBStore, DuckDBStore, OpenAIStore, WriteResult +from raghilda.store import ( + ChromaDBStore, + DuckDBStore, + IngestSummary, + OpenAIStore, + WriteResult, +) def test_document_uses_origin_field_not_id(): @@ -15,23 +31,81 @@ def test_document_uses_origin_field_not_id(): assert not hasattr(doc, "id") -def test_store_api_uses_upsert_not_insert(): +def test_store_api_uses_upsert_and_ingest_not_insert(): assert hasattr(DuckDBStore, "upsert") assert hasattr(ChromaDBStore, "upsert") assert hasattr(OpenAIStore, "upsert") - assert not hasattr(DuckDBStore, "ingest") - assert not hasattr(ChromaDBStore, "ingest") - assert not hasattr(OpenAIStore, "ingest") + assert hasattr(DuckDBStore, "ingest") + assert hasattr(ChromaDBStore, "ingest") + assert hasattr(OpenAIStore, "ingest") assert not hasattr(DuckDBStore, "insert") assert not hasattr(ChromaDBStore, "insert") assert not hasattr(OpenAIStore, "insert") -def test_store_exports_write_result_not_insert_result(): +def test_store_exports_write_and_ingest_results_not_insert_result(): assert WriteResult is store_module.WriteResult + assert IngestSummary is store_module.IngestSummary assert not hasattr(store_module, "InsertResult") +def test_store_exports_postgres_store_when_dependency_is_installed(): + pytest.importorskip("psycopg2") + + assert hasattr(store_module, "PostgreSQLStore") + assert "PostgreSQLStore" in store_module.__all__ + + +def test_crawl_exports_public_crawler_types(): + assert crawl_module.BaseCrawler is BaseCrawler + assert crawl_module.CrawlScope is CrawlScope + assert crawl_module.DirectoryCrawler is DirectoryCrawler + assert crawl_module.WebCrawler is WebCrawler + assert 
crawl_module.CloudflareCrawler is CloudflareCrawler + assert crawl_module.FetchedSource is FetchedSource + + +def test_crawl_scope_owns_traversal_policy() -> None: + assert tuple(inspect.signature(CrawlScope).parameters) == ( + "roots", + "include_patterns", + "exclude_patterns", + "depth", + "limit", + "include_types", + "exclude_types", + "include_external_links", + "include_subdomains", + ) + + +def test_crawler_constructors_keep_backend_and_cache_configuration_only() -> None: + assert tuple(inspect.signature(DirectoryCrawler).parameters) == ( + "cache_dir", + "max_workers", + ) + assert tuple(inspect.signature(WebCrawler).parameters) == ( + "session", + "cache_dir", + "cache_stale_after", + "max_workers", + ) + assert tuple(inspect.signature(CloudflareCrawler).parameters) == ( + "account_id", + "api_token", + "cache_dir", + "session", + "source", + "render", + "cache_stale_after", + "modified_since", + "poll_interval", + "max_poll_attempts", + "max_workers", + "base_url", + ) + + def test_openai_upsert_rejects_chunked_document(): class _SinglePage: def __init__(self): diff --git a/tests/test_crawl.py b/tests/test_crawl.py new file mode 100644 index 0000000..f7cc6a9 --- /dev/null +++ b/tests/test_crawl.py @@ -0,0 +1,1590 @@ +from __future__ import annotations + +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +import fnmatch +import hashlib +import http.server +import json +from pathlib import Path +import re +import socketserver +import threading +from typing import Any +import unicodedata + +import raghilda.crawl as crawl_module +from raghilda.crawl import ( + CrawlScope, + CloudflareCrawler, + DirectoryCrawler, + FetchedSource, + WebCrawler, +) +from raghilda.document import MarkdownDocument + +_WINDOWS_RESERVED = { + "CON", + "PRN", + "AUX", + "NUL", + "COM1", + "COM2", + "COM3", + "COM4", + "COM5", + "COM6", + "COM7", + "COM8", + "COM9", + "LPT1", + "LPT2", + "LPT3", + "LPT4", + "LPT5", + "LPT6", + "LPT7", + "LPT8", 
+ "LPT9", +} + + +def _write(tmp_path: Path, relative: str, contents: str) -> Path: + path = tmp_path / relative + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(contents, encoding="utf-8") + return path + + +def _expected_cache_base(key: str) -> str: + value = unicodedata.normalize("NFC", key) + value = value.replace("://", "__") + value = value.replace("\\", "_") + value = value.replace("/", "_") + value = re.sub(r'[\x00-\x1f<>:"|?*]+', "_", value) + value = re.sub(r"\s+", "_", value) + value = re.sub(r"[^A-Za-z0-9._-]+", "_", value) + value = value.strip(" ._") + + if not value: + value = "entry" + + root = value.split(".", 1)[0].rstrip(" .").upper() + if root in _WINDOWS_RESERVED: + value = f"_{value}" + + if len(value) > 180: + head = 180 // 2 - 2 + tail = 180 - head - 2 + value = f"{value[:head]}..{value[-tail:]}" + + value = value.rstrip(" .") + stem = value or "entry" + digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:12] + return f"{stem}--{digest}" + + +def test_directory_crawler_discovers_and_converts_markdown_documents( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello\n\nDirectory crawler") + _write(tmp_path, "docs/skip.py", "print('skip')") + notebook = _write( + tmp_path, + "docs/notebook.ipynb", + json.dumps( + { + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5, + } + ), + ) + + crawler = DirectoryCrawler() + scope = CrawlScope( + roots=[tmp_path], + depth=3, + include_patterns=[r".*/docs/.*"], + exclude_patterns=[r".*/skip\.py$"], + include_types=["markdown", "jupyter-notebook"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert markdown.resolve().as_uri() in origins + assert notebook.resolve().as_uri() in origins + assert all(not origin.endswith("skip.py") for origin in origins) + + source = crawler.fetch_raw(markdown.resolve().as_uri()) + assert isinstance(source, FetchedSource) + assert source.origin == markdown.resolve().as_uri() + 
assert source.body_path == markdown.resolve() + assert source.status_code is None + + fetched_markdown = crawler.fetch_markdown(markdown.resolve().as_uri()) + assert fetched_markdown == MarkdownDocument( + origin=markdown.resolve().as_uri(), + content="# Hello\n\nDirectory crawler", + ) + + +def test_directory_crawler_convert_override_receives_fetched_source( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello\n\nDirectory crawler") + seen: list[FetchedSource] = [] + + crawler = DirectoryCrawler() + + converted = crawler.fetch_markdown( + markdown.resolve().as_uri(), + convert=lambda source: _record_directory_conversion(source, seen), + ) + + assert [item.origin for item in seen] == [markdown.resolve().as_uri()] + assert converted == MarkdownDocument( + origin=markdown.resolve().as_uri(), + content="# Converted\n", + ) + + +def test_directory_crawler_cache_dir_uses_hashed_file_pair( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello\n") + cache_dir = tmp_path / "cache" + crawler = DirectoryCrawler(cache_dir=cache_dir) + + origin = markdown.resolve().as_uri() + document = crawler.fetch_markdown(origin) + + base = _expected_cache_base(origin) + metadata_path = cache_dir / f"{base}.metadata.json" + content_path = cache_dir / f"{base}.md" + assert document == MarkdownDocument(origin=origin, content="# Hello\n") + assert sorted(path.name for path in cache_dir.iterdir()) == [ + content_path.name, + metadata_path.name, + ] + assert json.loads(metadata_path.read_text(encoding="utf-8")) == { + "content_path": content_path.name, + "key": origin, + "metadata": { + "mtime_ns": markdown.stat().st_mtime_ns, + "origin": origin, + "source_hash": hashlib.sha256(markdown.read_bytes()).hexdigest(), + }, + } + + +def test_directory_crawler_cache_dir_true_uses_default_backend_directory( + tmp_path: Path, + monkeypatch, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello\n") + 
monkeypatch.chdir(tmp_path) + crawler = DirectoryCrawler(cache_dir=True) + + origin = markdown.resolve().as_uri() + crawler.fetch_markdown(origin) + + cache_dir = tmp_path / ".raghilda" / "cache" / "directory" + base = _expected_cache_base(origin) + assert sorted(path.name for path in cache_dir.iterdir()) == [ + f"{base}.md", + f"{base}.metadata.json", + ] + + +def _record_directory_conversion( + source: FetchedSource, seen: list[FetchedSource] +) -> MarkdownDocument: + seen.append(source) + return MarkdownDocument(origin=source.origin, content="# Converted\n") + + +class _ThreadingHTTPServer(socketserver.ThreadingMixIn, http.server.HTTPServer): + daemon_threads = True + + +class _RequestHandler(http.server.BaseHTTPRequestHandler): + def do_GET(self) -> None: + path = self.path.split("?", 1)[0] + routes = self.server.routes # type: ignore[attr-defined] + route = routes[path] + self.server.requests.append( # type: ignore[attr-defined] + {"path": path, "headers": dict(self.headers.items())} + ) + if route["etag"] and self.headers.get("If-None-Match") == route["etag"]: + self.send_response(304) + self.send_header("ETag", route["etag"]) + self.end_headers() + return + + body = route["body"].encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", route["content_type"]) + self.send_header("Content-Length", str(len(body))) + if route["etag"]: + self.send_header("ETag", route["etag"]) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format: str, *args: Any) -> None: + return + + +class _FakeWebResponse: + def __init__( + self, + *, + body: str, + url: str, + content_type: str = "text/html; charset=utf-8", + status_code: int = 200, + headers: dict[str, str] | None = None, + ) -> None: + self.url = url + self.content = body.encode("utf-8") + self.headers = {"Content-Type": content_type, **(headers or {})} + self.status_code = status_code + + def raise_for_status(self) -> None: + assert self.status_code < 400 + + +class 
_FakeWebSession: + def __init__(self, routes: dict[str, dict[str, Any]]) -> None: + self.routes = routes + self.requests: list[tuple[str, dict[str, str]]] = [] + + def get( + self, + url: str, + *, + headers: dict[str, str], + timeout: float, + ) -> _FakeWebResponse: + del timeout + self.requests.append((url, headers)) + route = self.routes[url] + return _FakeWebResponse( + body=route["body"], + url=route.get("resolved_url", url), + content_type=route.get("content_type", "text/html; charset=utf-8"), + status_code=route.get("status_code", 200), + headers=route.get("headers"), + ) + + +@contextmanager +def _serve(routes: dict[str, dict[str, str | None]]): + server = _ThreadingHTTPServer(("127.0.0.1", 0), _RequestHandler) + server.routes = routes # type: ignore[attr-defined] + server.requests = [] # type: ignore[attr-defined] + thread = threading.Thread(target=server.serve_forever) + thread.start() + try: + yield server + finally: + server.shutdown() + thread.join() + server.server_close() + + +def test_web_crawler_discovers_origins_and_revalidates_cache(tmp_path: Path) -> None: + with _serve( + { + "/": { + "body": """ + +
+ Guide + Skip + External +
+ + """, + "content_type": "text/html; charset=utf-8", + "etag": "root-v1", + }, + "/guide": { + "body": "

Guide

Hello

", + "content_type": "text/html; charset=utf-8", + "etag": "guide-v1", + }, + "/skip": { + "body": "

Skip

", + "content_type": "text/html; charset=utf-8", + "etag": "skip-v1", + }, + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler( + cache_dir=tmp_path / "cache", + cache_stale_after=timedelta(seconds=0), + ) + scope = CrawlScope( + roots=[root_url], + depth=1, + include_patterns=[rf"^{re.escape(root_url)}.*"], + exclude_patterns=[r".*/skip$"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert root_url in origins + assert f"{root_url}guide" in origins + assert all(not origin.endswith("/skip") for origin in origins) + assert all("external.test" not in origin for origin in origins) + + first = crawler.fetch_raw(root_url) + second = crawler.fetch_raw(root_url) + third = crawler.fetch_raw(root_url, cache_force_refresh=True) + server_requests = getattr(server, "requests") + root_requests = [ + request for request in server_requests if request["path"] == "/" + ] + + assert first.body_path == second.body_path == third.body_path + assert second.revalidated_at is not None + assert root_requests[-2]["headers"]["If-None-Match"] == "root-v1" + assert "If-None-Match" not in root_requests[-1]["headers"] + + guide_doc = crawler.fetch_markdown(f"{root_url}guide") + assert guide_doc.origin == f"{root_url}guide" + assert "Guide" in guide_doc.content + + +def test_web_crawler_resolves_relative_links_from_redirect_target( + tmp_path: Path, +) -> None: + session: Any = _FakeWebSession( + { + "https://example.com/docs": { + "body": 'Page', + "resolved_url": "https://example.com/docs/", + }, + "https://example.com/docs/page": { + "body": "
Page
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "redirect-cache", + session=session, + ) + scope = CrawlScope(roots=["https://example.com/docs"], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert "https://example.com/docs" in origins + assert "https://example.com/docs/page" in origins + assert "https://example.com/page" not in origins + + +def test_web_crawler_follows_links_after_redirect_to_different_host( + tmp_path: Path, +) -> None: + session: Any = _FakeWebSession( + { + "https://example.com": { + "body": 'About', + "resolved_url": "https://www.example.com/landing", + }, + "https://www.example.com/about": { + "body": "
About
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "redirect-host-cache", + session=session, + ) + scope = CrawlScope(roots=["https://example.com"], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert "https://example.com" in origins + assert "https://www.example.com/about" in origins + + +def test_web_crawler_include_subdomains_stays_within_requested_host_tree( + tmp_path: Path, +) -> None: + root = "https://docs.example.co.uk/start" + allowed = "https://api.docs.example.co.uk/page" + disallowed_parent = "https://example.co.uk/root" + disallowed_sibling = "https://other.co.uk/page" + session: Any = _FakeWebSession( + { + root: { + "body": ( + f'Allowed' + f'Parent' + f'Sibling' + ), + }, + allowed: {"body": "
Allowed
"}, + disallowed_parent: { + "body": "
Parent
" + }, + disallowed_sibling: { + "body": "
Sibling
" + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "subdomain-cache", + session=session, + ) + scope = CrawlScope( + roots=[root], + depth=1, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert root in origins + assert allowed in origins + assert disallowed_parent not in origins + assert disallowed_sibling not in origins + + +def test_web_crawler_discovers_matching_descendants_from_filtered_seed( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": 'Guide', + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + "/docs/guide": { + "body": "
Guide
", + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler( + cache_dir=tmp_path / "filtered-seed-cache", + ) + scope = CrawlScope( + roots=[root_url], + depth=1, + include_patterns=[rf"^{re.escape(root_url)}docs/.*"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert root_url not in origins + assert f"{root_url}docs/guide" in origins + + +def test_web_crawler_accepts_crawl_scope_for_roots_and_patterns( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": 'Guide', + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + "/docs/guide": { + "body": "
Guide
", + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler(cache_dir=tmp_path / "scope-cache") + scope = CrawlScope( + roots=[root_url], + depth=1, + include_patterns=[rf"^{re.escape(root_url)}docs/.*"], + ) + + origins = list(crawler.origins(scope, progress=False)) + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert origins == [f"{root_url}docs/guide"] + assert documents == [ + MarkdownDocument(origin=f"{root_url}docs/guide", content="Guide") + ] + + +def test_web_markdown_documents_reuses_refreshed_sources( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler( + cache_dir=tmp_path / "markdown-docs-cache", + ) + scope = CrawlScope(roots=[root_url], depth=0) + + documents = list(crawler.markdown_documents(scope, cache_force_refresh=True)) + root_requests = [ + request for request in getattr(server, "requests") if request["path"] == "/" + ] + + assert documents == [MarkdownDocument(origin=root_url, content="Root")] + assert len(root_requests) == 1 + + +def test_web_crawler_fetches_same_depth_frontier_concurrently(tmp_path: Path) -> None: + root = "https://example.com/docs" + first = "https://example.com/docs/one" + second = "https://example.com/docs/two" + + class _ConcurrentWebSession: + def __init__(self) -> None: + self.requests: list[tuple[str, dict[str, str]]] = [] + self._lock = threading.Lock() + self._barrier = threading.Barrier(2) + self.in_flight = 0 + self.max_in_flight = 0 + + def get( + self, url: str, headers: dict[str, str], timeout: float + ) -> _FakeWebResponse: + del timeout + with self._lock: + self.requests.append((url, headers)) + self.in_flight += 1 + self.max_in_flight = max(self.max_in_flight, self.in_flight) + try: + if url == root: + return _FakeWebResponse( + body=( + f'One' + f'Two' + ), + url=url, + ) + if url in {first, second}: + self._barrier.wait(timeout=1.0) + return _FakeWebResponse( + body="
Child
", + url=url, + ) + raise AssertionError(f"Unexpected url: {url}") + finally: + with self._lock: + self.in_flight -= 1 + + session: Any = _ConcurrentWebSession() + crawler = WebCrawler( + cache_dir=tmp_path / "frontier-cache", + session=session, + max_workers=2, + ) + scope = CrawlScope(roots=[root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root, first, second] + assert session.max_in_flight == 2 + + +def test_web_crawler_treats_304_revalidation_as_fresh_cache_hit( + tmp_path: Path, + monkeypatch, +) -> None: + with _serve( + { + "/": { + "body": "
Cached
", + "content_type": "text/html; charset=utf-8", + "etag": "root-v1", + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + times = iter( + [ + datetime(2026, 1, 1, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, 500000, tzinfo=timezone.utc), + ] + ) + monkeypatch.setattr(crawl_module, "_utcnow", lambda: next(times)) + crawler = WebCrawler( + cache_dir=tmp_path / "fresh-cache", + cache_stale_after=timedelta(seconds=1), + ) + + first = crawler.fetch_raw(root_url) + second = crawler.fetch_raw(root_url) + third = crawler.fetch_raw(root_url) + root_requests = [ + request for request in getattr(server, "requests") if request["path"] == "/" + ] + + assert first.body_path == second.body_path == third.body_path + assert second.revalidated_at is not None + assert len(root_requests) == 2 + assert root_requests[1]["headers"]["If-None-Match"] == "root-v1" + + +def test_web_crawler_cache_dir_uses_hashed_file_pair( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + cache_dir = tmp_path / "cache" + crawler = WebCrawler(cache_dir=cache_dir) + + document = crawler.fetch_markdown(root_url) + + base = _expected_cache_base(root_url) + metadata_path = cache_dir / f"{base}.metadata.json" + content_path = cache_dir / f"{base}.html" + assert document == MarkdownDocument(origin=root_url, content="Root") + assert sorted(path.name for path in cache_dir.iterdir()) == [ + content_path.name, + metadata_path.name, + ] + record = json.loads(metadata_path.read_text(encoding="utf-8")) + assert record["key"] == root_url + assert record["content_path"] == content_path.name + assert record["metadata"]["content_type"] == "text/html; charset=utf-8" + assert record["metadata"]["origin"] == root_url + + +def test_web_crawler_cache_dir_true_uses_default_backend_directory( + tmp_path: Path, + monkeypatch, +) -> None: + monkeypatch.chdir(tmp_path) + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler(cache_dir=True) + + crawler.fetch_markdown(root_url) + + cache_dir = tmp_path / ".raghilda" / "cache" / "web" + base = _expected_cache_base(root_url) + assert sorted(path.name for path in cache_dir.iterdir()) == [ + f"{base}.html", + f"{base}.metadata.json", + ] + + +def test_web_crawler_disambiguates_colliding_sanitized_cache_prefixes( + tmp_path: Path, +) -> None: + first_origin = "https://example.com/docs/page" + second_origin = "https://example.com/docs:page" + third_origin = "https://example.com/docs?page" + session: Any = _FakeWebSession( + { + first_origin: {"body": "
One
"}, + second_origin: {"body": "
Two
"}, + third_origin: {"body": "
Three
"}, + } + ) + cache_dir = tmp_path / "collision-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + crawler.fetch_raw(first_origin) + crawler.fetch_raw(second_origin) + crawler.fetch_raw(third_origin) + + first_base = _expected_cache_base(first_origin) + second_base = _expected_cache_base(second_origin) + third_base = _expected_cache_base(third_origin) + cached_names = {path.name for path in cache_dir.iterdir()} + assert { + f"{first_base}.html", + f"{first_base}.metadata.json", + f"{second_base}.html", + f"{second_base}.metadata.json", + f"{third_base}.html", + f"{third_base}.metadata.json", + }.issubset(cached_names) + assert len(cached_names) == 6 + + second_session: Any = _FakeWebSession( + { + first_origin: {"body": "
One
"}, + second_origin: {"body": "
Two
"}, + third_origin: {"body": "
Three
"}, + } + ) + second_crawler = WebCrawler(cache_dir=cache_dir, session=second_session) + + assert second_crawler.fetch_raw(first_origin).body_path.exists() + assert second_crawler.fetch_raw(second_origin).body_path.exists() + assert second_crawler.fetch_raw(third_origin).body_path.exists() + assert second_session.requests == [] + + +def test_web_crawler_cache_writes_for_different_keys_do_not_contend( + tmp_path: Path, + monkeypatch, +) -> None: + first_origin = "https://example.com/docs/one" + second_origin = "https://example.com/docs/two" + session: Any = _FakeWebSession( + { + first_origin: {"body": "
One
"}, + second_origin: {"body": "
Two
"}, + } + ) + cache_dir = tmp_path / "concurrency-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + first_content_path = cache_dir / f"{_expected_cache_base(first_origin)}.html" + second_content_path = cache_dir / f"{_expected_cache_base(second_origin)}.html" + first_started = threading.Event() + release_first = threading.Event() + second_finished = threading.Event() + errors: list[BaseException] = [] + original_write_content = crawl_module._FilesystemCrawlerCache._write_content + + def blocking_write_content( + self, + path: Path, + content: bytes | str | Path, + ) -> None: + if path == first_content_path and not first_started.is_set(): + first_started.set() + assert release_first.wait(timeout=2.0) + original_write_content(self, path, content) + if path == second_content_path: + second_finished.set() + + monkeypatch.setattr( + crawl_module._FilesystemCrawlerCache, + "_write_content", + blocking_write_content, + ) + + def fetch(origin: str) -> None: + try: + crawler.fetch_raw(origin) + except BaseException as exc: + errors.append(exc) + + first_thread = threading.Thread(target=fetch, args=(first_origin,)) + second_thread = threading.Thread(target=fetch, args=(second_origin,)) + first_thread.start() + assert first_started.wait(timeout=1.0) + second_thread.start() + try: + assert second_finished.wait(timeout=1.0) + finally: + release_first.set() + first_thread.join(timeout=1.0) + second_thread.join(timeout=1.0) + + assert errors == [] + + +def test_web_crawler_uses_magika_when_no_explicit_ext_is_available( + tmp_path: Path, +) -> None: + origin = "https://example.com/download" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Download
", + "content_type": "application/octet-stream", + } + } + ) + cache_dir = tmp_path / "magika-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + base = _expected_cache_base(origin) + assert source.body_path == cache_dir / f"{base}.html" + + +def test_web_crawler_falls_back_to_raw_when_magika_is_unavailable( + tmp_path: Path, + monkeypatch, +) -> None: + origin = "https://example.com/download" + session: Any = _FakeWebSession( + { + origin: { + "body": "opaque payload", + "content_type": "application/octet-stream", + } + } + ) + cache_dir = tmp_path / "raw-cache" + monkeypatch.setattr(crawl_module, "_MAGIKA", None) + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + base = _expected_cache_base(origin) + assert source.body_path == cache_dir / f"{base}.raw" + + +class _CloudflareResponse: + def __init__(self, payload: dict[str, Any]): + self.payload = payload + self.status_code = 200 + + def json(self) -> dict[str, Any]: + return self.payload + + def raise_for_status(self) -> None: + return + + +class _CloudflareSession: + def __init__(self) -> None: + self.post_calls: list[tuple[str, dict[str, Any], dict[str, str]]] = [] + self.get_calls: list[tuple[str, dict[str, Any] | None]] = [] + self._poll_count = 0 + + def post( + self, + url: str, + *, + json: dict[str, Any], + headers: dict[str, str], + timeout: float, + ) -> _CloudflareResponse: + self.post_calls.append((url, json, headers)) + return _CloudflareResponse({"success": True, "result": "job-123"}) + + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + self.get_calls.append((url, params)) + self._poll_count += 1 + if self._poll_count == 1: + return _CloudflareResponse( + {"success": True, "result": {"id": "job-123", "status": "running"}} + ) + if params == {"limit": 1}: + return _CloudflareResponse( + 
{"success": True, "result": {"id": "job-123", "status": "completed"}} + ) + return _CloudflareResponse( + { + "success": True, + "result": { + "id": "job-123", + "status": "completed", + "records": [ + { + "url": "https://example.com/docs", + "status": "completed", + "markdown": "# Docs\n", + "metadata": { + "status": 200, + "title": "Docs", + "url": "https://example.com/docs", + }, + }, + { + "url": "https://example.com/docs/page", + "status": "completed", + "markdown": "## Page\n", + "metadata": { + "status": 200, + "title": "Page", + "url": "https://example.com/docs/page", + }, + }, + ], + }, + } + ) + + +class _ParameterizedCloudflareSession: + def __init__(self) -> None: + self.post_calls: list[tuple[str, dict[str, Any], dict[str, str]]] = [] + self.get_calls: list[tuple[str, dict[str, Any] | None]] = [] + self._jobs: dict[str, dict[str, Any]] = {} + + def post( + self, + url: str, + *, + json: dict[str, Any], + headers: dict[str, str], + timeout: float, + ) -> _CloudflareResponse: + del timeout + job_id = f"job-{len(self.post_calls) + 1}" + self.post_calls.append((url, json, headers)) + self._jobs[job_id] = json + return _CloudflareResponse({"success": True, "result": job_id}) + + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + del headers, timeout + self.get_calls.append((url, params)) + job_id = url.rsplit("/", 1)[-1] + payload = self._jobs[job_id] + if params == {"limit": 1}: + return _CloudflareResponse( + {"success": True, "result": {"id": job_id, "status": "completed"}} + ) + records = [ + { + "url": payload["url"], + "status": "completed", + "markdown": "# Docs\n", + "metadata": { + "status": 200, + "title": "Docs", + "url": payload["url"], + }, + } + ] + if payload["depth"] > 0: + records.append( + { + "url": f"{payload['url']}/page", + "status": "completed", + "markdown": "## Page\n", + "metadata": { + "status": 200, + "title": "Page", + "url": 
f"{payload['url']}/page", + }, + } + ) + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": records, + }, + } + ) + + +class _DiscoveryFilteringCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + + payload = self._jobs[url.rsplit("/", 1)[-1]] + include_patterns = payload["options"].get("includePatterns", []) + exclude_patterns = payload["options"].get("excludePatterns", []) + records = response.json()["result"]["records"] + filtered_records = [ + record + for record in records + if ( + ( + not include_patterns + or any( + fnmatch.fnmatchcase(record["url"], pattern) + for pattern in include_patterns + ) + ) + and not any( + fnmatch.fnmatchcase(record["url"], pattern) + for pattern in exclude_patterns + ) + ) + ] + return _CloudflareResponse( + { + "success": True, + "result": { + "id": url.rsplit("/", 1)[-1], + "status": "completed", + "records": filtered_records, + }, + } + ) + + +def test_cloudflare_crawler_polls_job_and_uses_markdown_records( + tmp_path: Path, +) -> None: + session = _CloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + render=False, + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/docs"], + depth=2, + limit=25, + include_patterns=["https://example.com/docs/**"], + exclude_patterns=["https://example.com/docs/archive/**"], + include_external_links=True, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert len(session.post_calls) == 1 
+ post_url, payload, headers = session.post_calls[0] + assert post_url.endswith("/accounts/account-123/browser-rendering/crawl") + assert headers["Authorization"] == "Bearer token-123" + assert payload["formats"] == ["markdown"] + assert payload["depth"] == 2 + assert payload["limit"] == 25 + assert payload["render"] is False + assert payload["options"]["includePatterns"] == ["https://example.com/docs/**"] + assert payload["options"]["excludePatterns"] == [ + "https://example.com/docs/archive/**" + ] + assert payload["options"]["includeExternalLinks"] is True + assert payload["options"]["includeSubdomains"] is True + + page_source = crawler.fetch_raw("https://example.com/docs/page") + assert page_source.status_code == 200 + assert page_source.markdown_path is not None + assert page_source.markdown_path.read_text(encoding="utf-8") == "## Page\n" + + page_doc = crawler.fetch_markdown("https://example.com/docs/page") + assert page_doc == MarkdownDocument( + origin="https://example.com/docs/page", + content="## Page\n", + ) + assert len(session.post_calls) == 1 + + +def test_cloudflare_crawler_accepts_crawl_scope_for_roots_and_patterns( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-scope-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/docs"], + depth=1, + include_patterns=["https://example.com/docs/**"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert session.post_calls[0][1]["depth"] == 1 + assert session.post_calls[0][1]["options"]["includePatterns"] == [ + "https://example.com/docs/**" + ] + + +def test_cloudflare_crawler_cache_key_includes_crawl_parameters( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = 
CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/docs"], + depth=2, + limit=25, + ) + + source = crawler.fetch_raw("https://example.com/docs") + origins = list(crawler.origins(scope, progress=False)) + + assert source.origin == "https://example.com/docs" + assert origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert len(session.post_calls) == 2 + assert session.post_calls[0][1]["depth"] == 0 + assert session.post_calls[0][1]["limit"] == 1 + assert session.post_calls[1][1]["depth"] == 2 + assert session.post_calls[1][1]["limit"] == 25 + + +def test_cloudflare_crawler_rechecks_stale_in_memory_records( + tmp_path: Path, + monkeypatch, +) -> None: + session = _ParameterizedCloudflareSession() + times = iter( + [ + datetime(2026, 1, 1, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc), + ] + ) + monkeypatch.setattr(crawl_module, "_utcnow", lambda: next(times)) + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + cache_stale_after=timedelta(seconds=1), + session=session, + poll_interval=0, + ) + scope = CrawlScope(roots=["https://example.com/docs"], depth=2) + + origins = list(crawler.origins(scope, progress=False)) + page_source = crawler.fetch_raw("https://example.com/docs/page") + + assert origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert page_source.origin == "https://example.com/docs/page" + assert len(session.post_calls) == 2 + assert session.post_calls[1][1]["url"] == "https://example.com/docs/page" + assert session.post_calls[1][1]["depth"] == 0 + assert session.post_calls[1][1]["limit"] == 1 + + +def 
test_cloudflare_fetch_raw_ignores_discovery_patterns_for_explicit_origin( + tmp_path: Path, +) -> None: + session = _DiscoveryFilteringCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + session=session, + poll_interval=0, + ) + + source = crawler.fetch_raw("https://example.com/docs") + + assert source.origin == "https://example.com/docs" + assert source.status_code == 200 + assert "includePatterns" not in session.post_calls[0][1]["options"] + + +def test_cloudflare_fetch_raw_reuses_cache_directory_across_instances( + tmp_path: Path, +) -> None: + cache = tmp_path / "cloudflare-cache" + first_session = _ParameterizedCloudflareSession() + first_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=first_session, + poll_interval=0, + ) + + first = first_crawler.fetch_raw("https://example.com/docs") + + second_session = _ParameterizedCloudflareSession() + second_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=second_session, + poll_interval=0, + ) + + second = second_crawler.fetch_raw("https://example.com/docs") + + assert first.body_path.read_text(encoding="utf-8") == "# Docs\n" + assert second.body_path.read_text(encoding="utf-8") == "# Docs\n" + assert second.status_code == 200 + assert len(first_session.post_calls) == 1 + assert second_session.post_calls == [] + + +def test_cloudflare_crawler_cache_dir_uses_hashed_file_pair( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + cache_dir = tmp_path / "cloudflare-cache" + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache_dir, + session=session, + poll_interval=0, + ) + + source = crawler.fetch_raw("https://example.com/docs") + + base = _expected_cache_base(source.origin) + metadata_path = cache_dir / f"{base}.metadata.json" + 
content_path = cache_dir / f"{base}.md" + assert source.body_path.read_text(encoding="utf-8") == "# Docs\n" + assert sorted(path.name for path in cache_dir.iterdir()) == [ + content_path.name, + metadata_path.name, + ] + record = json.loads(metadata_path.read_text(encoding="utf-8")) + assert record["key"] == source.origin + assert record["content_path"] == content_path.name + assert record["metadata"]["record"]["url"] == source.origin + + +def test_cloudflare_crawler_cache_dir_true_uses_default_backend_directory( + tmp_path: Path, + monkeypatch, +) -> None: + monkeypatch.chdir(tmp_path) + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=True, + session=session, + poll_interval=0, + ) + + source = crawler.fetch_raw("https://example.com/docs") + + cache_dir = tmp_path / ".raghilda" / "cache" / "cloudflare" + base = _expected_cache_base(source.origin) + assert sorted(path.name for path in cache_dir.iterdir()) == [ + f"{base}.md", + f"{base}.metadata.json", + ] + + +def test_cloudflare_fetch_raw_scopes_cache_to_account_id( + tmp_path: Path, +) -> None: + cache = tmp_path / "cloudflare-cache" + first_session = _ParameterizedCloudflareSession() + first_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=first_session, + poll_interval=0, + ) + first_crawler.fetch_raw("https://example.com/docs") + + second_session = _ParameterizedCloudflareSession() + second_crawler = CloudflareCrawler( + account_id="account-456", + api_token="token-123", + cache_dir=cache, + session=second_session, + poll_interval=0, + ) + + second_crawler.fetch_raw("https://example.com/docs") + + assert len(first_session.post_calls) == 1 + assert len(second_session.post_calls) == 1 + assert "/accounts/account-456/" in second_session.post_calls[0][0] + + +def test_cloudflare_fetch_raw_scopes_cache_to_api_base( + tmp_path: Path, +) -> None: + cache = tmp_path 
/ "cloudflare-cache" + first_session = _ParameterizedCloudflareSession() + first_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=first_session, + poll_interval=0, + base_url="https://prod.example/api", + ) + first_crawler.fetch_raw("https://example.com/docs") + + second_session = _ParameterizedCloudflareSession() + second_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=second_session, + poll_interval=0, + base_url="https://staging.example/api", + ) + + second_crawler.fetch_raw("https://example.com/docs") + + assert len(first_session.post_calls) == 1 + assert len(second_session.post_calls) == 1 + assert second_session.post_calls[0][0].startswith( + "https://staging.example/api/accounts/account-123/" + ) + + +def test_cloudflare_crawler_applies_limit_across_all_roots( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/docs-a", "https://example.com/docs-b"], + limit=1, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == ["https://example.com/docs-a"] + assert len(session.post_calls) == 1 + + +def test_directory_crawler_counts_file_roots_toward_limit(tmp_path: Path) -> None: + first = _write(tmp_path, "a.md", "# First") + second = _write(tmp_path, "b.md", "# Second") + crawler = DirectoryCrawler() + scope = CrawlScope(roots=[first, second], limit=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [first.resolve().as_uri()] + + +def test_directory_crawler_accepts_crawl_scope_for_roots_and_patterns( + tmp_path: Path, +) -> None: + docs = _write(tmp_path, "docs/readme.md", "# Hello") + _write(tmp_path, "notes/todo.md", "# Skip") + crawler = 
DirectoryCrawler() + scope = CrawlScope( + roots=[tmp_path], + depth=1, + include_patterns=[r".*/docs/.*"], + ) + + origins = list(crawler.origins(scope, progress=False)) + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert origins == [docs.resolve().as_uri()] + assert documents == [ + MarkdownDocument(origin=docs.resolve().as_uri(), content="# Hello") + ] + + +def test_directory_crawler_returns_no_origins_when_limit_is_zero( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "a.md", "# First") + crawler = DirectoryCrawler() + scope = CrawlScope(roots=[markdown], limit=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [] + + +def test_directory_crawler_fetch_markdown_force_refresh_rebuilds_cached_markdown( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello") + cache = tmp_path / "cache" + crawler = DirectoryCrawler(cache_dir=cache) + + origin = markdown.resolve().as_uri() + first = crawler.fetch_markdown(origin) + cached_markdown = next( + path for path in cache.iterdir() if not path.name.endswith(".metadata.json") + ) + cached_markdown.write_text("# Stale\n", encoding="utf-8") + + refreshed = crawler.fetch_markdown(origin, cache_force_refresh=True) + + assert first.content == "# Hello" + assert refreshed.content == "# Hello" + + +def test_directory_crawler_markdown_documents_force_refresh_rebuilds_cache( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello") + cache = tmp_path / "cache" + crawler = DirectoryCrawler(cache_dir=cache) + root = tmp_path / "docs" + scope = CrawlScope(roots=[root]) + + documents = list(crawler.markdown_documents(scope, progress=False)) + cached_markdown = next( + path for path in cache.iterdir() if not path.name.endswith(".metadata.json") + ) + cached_markdown.write_text("# Stale\n", encoding="utf-8") + + refreshed = list( + crawler.markdown_documents( + scope, + progress=False, + 
cache_force_refresh=True, + ) + ) + + assert documents == [ + MarkdownDocument(origin=markdown.resolve().as_uri(), content="# Hello") + ] + assert refreshed == [ + MarkdownDocument(origin=markdown.resolve().as_uri(), content="# Hello") + ] + + +def test_directory_crawler_markdown_documents_converts_in_parallel( + tmp_path: Path, +) -> None: + first = _write(tmp_path, "docs/a.md", "# First") + second = _write(tmp_path, "docs/b.md", "# Second") + crawler = DirectoryCrawler(max_workers=2) + scope = CrawlScope(roots=[tmp_path / "docs"]) + barrier = threading.Barrier(2) + lock = threading.Lock() + in_flight = 0 + max_in_flight = 0 + + def convert(source: FetchedSource) -> MarkdownDocument: + nonlocal in_flight, max_in_flight + with lock: + in_flight += 1 + max_in_flight = max(max_in_flight, in_flight) + try: + barrier.wait(timeout=1.0) + return MarkdownDocument( + origin=source.origin, + content=source.body_path.read_text(encoding="utf-8"), + ) + finally: + with lock: + in_flight -= 1 + + documents = list(crawler.markdown_documents(scope, progress=False, convert=convert)) + + assert documents == [ + MarkdownDocument(origin=first.resolve().as_uri(), content="# First"), + MarkdownDocument(origin=second.resolve().as_uri(), content="# Second"), + ] + assert max_in_flight == 2 + + +def test_directory_crawler_reopens_origins_with_uri_escaped_characters( + tmp_path: Path, +) -> None: + root = tmp_path / "My Docs" + markdown = _write(root, "read me.md", "# Hello") + crawler = DirectoryCrawler() + + origin = next(crawler.origins(CrawlScope(roots=[root]), progress=False)) + document = crawler.fetch_markdown(origin) + + assert "%20" in origin + assert document == MarkdownDocument( + origin=markdown.resolve().as_uri(), + content="# Hello", + ) + + +def test_directory_crawler_accepts_percent_escaped_file_uri_roots( + tmp_path: Path, +) -> None: + root = tmp_path / "My Docs" + markdown = _write(root, "read me.md", "# Hello") + crawler = DirectoryCrawler() + + origins = list( + 
crawler.origins(CrawlScope(roots=[root.resolve().as_uri()]), progress=False) + ) + + assert origins == [markdown.resolve().as_uri()] + + +def test_web_crawler_returns_no_origins_or_requests_when_limit_is_zero( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler( + cache_dir=tmp_path / "zero-limit-cache", + ) + scope = CrawlScope(roots=[root_url], depth=0, limit=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [] + assert getattr(server, "requests") == [] diff --git a/tests/test_store_ingest.py b/tests/test_store_ingest.py new file mode 100644 index 0000000..8fe334d --- /dev/null +++ b/tests/test_store_ingest.py @@ -0,0 +1,444 @@ +from __future__ import annotations + +from concurrent.futures import CancelledError +from dataclasses import replace +from pathlib import Path +import threading +import time +from types import SimpleNamespace +from typing import Any + +import pytest + +import raghilda._store as store_module +from raghilda.chunker import MarkdownChunker +from raghilda.document import Document, MarkdownDocument +from raghilda.store import ( + BaseStore, + ChromaDBStore, + DuckDBStore, + IngestSummary, + OpenAIStore, + WriteResult, +) + + +class _RecordingStore(BaseStore): + def __init__(self) -> None: + self.lock = threading.Lock() + self.prepare_thread_ids: dict[str, int] = {} + self.upsert_thread_ids: dict[str, int] = {} + self.started_origins: list[str] = [] + self.max_in_flight = 0 + self.in_flight = 0 + + @staticmethod + def connect(*args, **kwargs) -> "_RecordingStore": + return _RecordingStore() + + @staticmethod + def create(*args, **kwargs) -> "_RecordingStore": + return _RecordingStore() + + def upsert( + self, + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + origin = document.origin + assert isinstance(origin, str) + with self.lock: + self.started_origins.append(origin) + self.in_flight += 1 + self.max_in_flight = max(self.max_in_flight, self.in_flight) + try: + time.sleep(0.02 if origin == "doc-1" else 0) + if origin == "doc-2": + raise RuntimeError("boom") + 
self.upsert_thread_ids[origin] = threading.get_ident() + action = ( + document.attributes["action"] if document.attributes else "inserted" + ) + return WriteResult(action=action, document=document) + finally: + with self.lock: + self.in_flight -= 1 + + def retrieve(self, text: str, top_k: int, *args, **kwargs): + return [] + + def size(self) -> int: + return len(self.started_origins) + + +class _BlockingFailureStore(BaseStore): + def __init__(self) -> None: + self.blocking_started = threading.Event() + self.release_blocked = threading.Event() + self.blocking_finished = threading.Event() + + @staticmethod + def connect(*args, **kwargs) -> "_BlockingFailureStore": + return _BlockingFailureStore() + + @staticmethod + def create(*args, **kwargs) -> "_BlockingFailureStore": + return _BlockingFailureStore() + + def upsert( + self, + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + del skip_if_unchanged + origin = document.origin + assert isinstance(origin, str) + if origin == "doc-1": + self.blocking_started.set() + self.release_blocked.wait(timeout=1.0) + self.blocking_finished.set() + return WriteResult(action="inserted", document=document) + if origin == "doc-2": + assert self.blocking_started.wait(timeout=1.0) + raise RuntimeError("boom") + return WriteResult(action="inserted", document=document) + + def retrieve(self, text: str, top_k: int, *args, **kwargs): + return [] + + def size(self) -> int: + return 0 + + +class _CancelledSiblingStore(BaseStore): + def __init__(self) -> None: + self.allow_failure = threading.Event() + self.release_cancelled = threading.Event() + + @staticmethod + def connect(*args, **kwargs) -> "_CancelledSiblingStore": + return _CancelledSiblingStore() + + @staticmethod + def create(*args, **kwargs) -> "_CancelledSiblingStore": + return _CancelledSiblingStore() + + def upsert( + self, + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + del skip_if_unchanged + 
assert isinstance(document.origin, str) + if document.origin == "doc-1": + self.allow_failure.set() + raise RuntimeError("boom") + return WriteResult(action="inserted", document=document) + + def retrieve(self, text: str, top_k: int, *args, **kwargs): + return [] + + def size(self) -> int: + return 0 + + +def test_base_store_ingest_returns_summary_and_applies_prepare_before_upsert() -> None: + store = _RecordingStore() + main_thread_id = threading.get_ident() + documents = [ + MarkdownDocument( + origin="doc-1", content="# One", attributes={"action": "inserted"} + ), + MarkdownDocument( + origin="doc-3", content="# Three", attributes={"action": "skipped"} + ), + ] + + def prepare(document: MarkdownDocument) -> MarkdownDocument: + assert document.origin is not None + store.prepare_thread_ids[document.origin] = threading.get_ident() + return replace(document, content=document.content + "\nprepared") + + summary = store.ingest(documents, prepare=prepare, max_workers=2) + + assert summary == IngestSummary(inserted=1, replaced=0, skipped=1) + assert set(store.prepare_thread_ids) == {"doc-1", "doc-3"} + assert set(store.upsert_thread_ids) == {"doc-1", "doc-3"} + assert set(store.prepare_thread_ids.values()).isdisjoint({main_thread_id}) + + +def test_base_store_ingest_runs_prepare_in_worker_pool_concurrently() -> None: + store = _RecordingStore() + barrier = threading.Barrier(2) + lock = threading.Lock() + in_prepare = 0 + max_in_prepare = 0 + + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-3", content="# Three"), + ] + + def prepare(document: MarkdownDocument) -> MarkdownDocument: + nonlocal in_prepare, max_in_prepare + with lock: + in_prepare += 1 + max_in_prepare = max(max_in_prepare, in_prepare) + try: + barrier.wait(timeout=1.0) + return document + finally: + with lock: + in_prepare -= 1 + + summary = store.ingest(documents, prepare=prepare, max_workers=2) + + assert summary == IngestSummary(inserted=2, replaced=0, 
skipped=0) + assert max_in_prepare == 2 + + +def test_base_store_ingest_starts_writes_before_input_is_exhausted() -> None: + class _StreamingStore(BaseStore): + def __init__(self) -> None: + self.started = threading.Event() + self.started_origins: list[str] = [] + + @staticmethod + def connect(*args, **kwargs) -> "_StreamingStore": + return _StreamingStore() + + @staticmethod + def create(*args, **kwargs) -> "_StreamingStore": + return _StreamingStore() + + def upsert( + self, + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + del skip_if_unchanged + assert isinstance(document.origin, str) + self.started_origins.append(document.origin) + self.started.set() + return WriteResult(action="inserted", document=document) + + def retrieve(self, text: str, top_k: int, *args, **kwargs): + return [] + + def size(self) -> int: + return len(self.started_origins) + + store = _StreamingStore() + + def documents(): + yield MarkdownDocument(origin="doc-1", content="# One") + assert store.started.wait(timeout=1.0) + yield MarkdownDocument(origin="doc-2", content="# Two") + + summary = store.ingest(documents(), max_workers=1) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert store.started_origins == ["doc-1", "doc-2"] + + +def test_base_store_ingest_raises_on_duplicate_after_streaming_started() -> None: + store = _RecordingStore() + documents = [ + MarkdownDocument(origin="dup", content="# One"), + MarkdownDocument(origin="dup", content="# Two"), + MarkdownDocument(origin="doc-3", content="# Three"), + ] + + with pytest.raises(ValueError, match="Duplicate origin during ingest: dup"): + store.ingest(documents, max_workers=1) + + assert store.started_origins == ["dup"] + + +def test_base_store_ingest_fails_fast_and_bounds_worker_count() -> None: + store = _RecordingStore() + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-2", content="# Two"), + 
MarkdownDocument(origin="doc-3", content="# Three"), + ] + + with pytest.raises(RuntimeError, match="boom"): + store.ingest(documents, max_workers=2) + + assert "doc-3" not in store.started_origins + assert store.max_in_flight <= 2 + + +def test_base_store_ingest_waits_for_running_workers_before_raising() -> None: + store = _BlockingFailureStore() + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-2", content="# Two"), + ] + + def release_blocked() -> None: + assert store.blocking_started.wait(timeout=1.0) + time.sleep(0.2) + store.release_blocked.set() + + releaser = threading.Thread(target=release_blocked) + releaser.start() + + try: + with pytest.raises(RuntimeError, match="boom"): + store.ingest(documents, max_workers=2) + + assert store.release_blocked.is_set() + assert store.blocking_finished.is_set() + finally: + releaser.join() + + +def test_base_store_ingest_ignores_cancelled_sibling_when_worker_failed( + monkeypatch, +) -> None: + class _FakeFuture: + def __init__( + self, + *, + result: WriteResult[Document] | None = None, + error: BaseException | None = None, + ) -> None: + self._result = result + self._error = error + + def result(self) -> WriteResult[Document]: + if self._error is not None: + raise self._error + assert self._result is not None + return self._result + + def cancel(self) -> None: + return None + + class _FakeExecutor: + def __init__(self, *, max_workers: int) -> None: + del max_workers + self._submissions = [ + _FakeFuture(error=CancelledError()), + _FakeFuture(error=RuntimeError("boom")), + ] + + def submit(self, fn, arg): + del fn, arg + return self._submissions.pop(0) + + def shutdown(self, *, wait: bool, cancel_futures: bool) -> None: + del wait, cancel_futures + return None + + def fake_wait(pending, return_when): + del pending, return_when + return ( + [ + _FakeFuture(error=CancelledError()), + _FakeFuture(error=RuntimeError("boom")), + ], + set(), + ) + + 
monkeypatch.setattr(store_module, "ThreadPoolExecutor", _FakeExecutor) + monkeypatch.setattr(store_module, "wait", fake_wait) + + store = _RecordingStore() + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-2", content="# Two"), + ] + + with pytest.raises(RuntimeError, match="boom"): + store.ingest(documents, max_workers=2) + + +def test_duckdb_store_ingest_prepares_chunked_documents() -> None: + store = DuckDBStore.create( + location=":memory:", + embed=None, + overwrite=True, + name="duckdb_ingest", + ) + documents = [ + MarkdownDocument(origin="doc-1", content="# One\n\nHello"), + MarkdownDocument(origin="doc-2", content="# Two\n\nWorld"), + ] + + summary = store.ingest( + documents, + prepare=MarkdownChunker(chunk_size=32, target_overlap=0).chunk, + max_workers=2, + ) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert store.size() == 2 + + +def test_chromadb_store_ingest_prepares_chunked_documents(tmp_path: Path) -> None: + store = ChromaDBStore.create( + location=tmp_path / "chroma", + overwrite=True, + name="chroma_ingest", + embed=None, + ) + documents = [ + MarkdownDocument(origin="doc-1", content="# One\n\nHello"), + MarkdownDocument(origin="doc-2", content="# Two\n\nWorld"), + ] + + summary = store.ingest( + documents, + prepare=MarkdownChunker(chunk_size=32, target_overlap=0).chunk, + max_workers=2, + ) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert store.size() == 2 + + +class _SinglePage: + def __init__(self, data: list[Any]): + self.data = data + + def has_next_page(self) -> bool: + return False + + +class _FakeVectorStoreFiles: + def __init__(self) -> None: + self.uploads: list[dict[str, Any]] = [] + + def list(self, **kwargs): + return _SinglePage([]) + + def upload_and_poll(self, **kwargs): + self.uploads.append(kwargs) + return SimpleNamespace(id=f"file-{len(self.uploads)}") + + def delete(self, **kwargs): + raise AssertionError("delete 
should not be called") + + +def test_openai_store_ingest_accepts_markdown_documents_without_prepare() -> None: + vector_store_files = _FakeVectorStoreFiles() + fake_client = SimpleNamespace( + vector_stores=SimpleNamespace(files=vector_store_files), + ) + store = OpenAIStore(client=fake_client, store_id="vs_test") + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-2", content="# Two"), + ] + + summary = store.ingest(documents, max_workers=2) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert len(vector_store_files.uploads) == 2 From 6432af714f8659f09f63e261f20f061eb27263d7 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Wed, 15 Apr 2026 16:20:27 -0400 Subject: [PATCH 02/17] ci: enable verbose pytest output in checks --- .github/workflows/check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 0bb8fbe..1c29f91 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -42,4 +42,4 @@ jobs: run: uv sync --all-extras - name: Run checks - run: uv run task check \ No newline at end of file + run: PYTEST_ADDOPTS=-vv uv run task check From 0874da2c9bda3c84f125090594ab190128f23d85 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Wed, 15 Apr 2026 16:42:19 -0400 Subject: [PATCH 03/17] Fix Windows file URI decoding in directory crawler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review finding: [P1] Preserve Windows drive letters when parsing file:// origins — /Users/tomasz/github/posit-dev/raghilda/.worktrees/feature-crawl-api/src/raghilda/crawl.py:1567-1570 On Windows, urlparse('file:///C:/docs/readme.md').path is '/C:/docs/readme.md', so Path(unquote(parsed.path)) resolves to \C:\docs\readme.md instead of C:\docs\readme.md. 
That means DirectoryCrawler.origins() can emit file:// URLs that fetch_raw() / fetch_markdown() cannot reopen, and CrawlScope(roots=[file_uri]) fails for the same reason because the same parsing is duplicated in _to_directory_path() above. Using a file-URI decoder that preserves the drive/netloc avoids breaking the public file-URI workflow on Windows. Response: Addressed. Replace the duplicated urlparse(...).path decoding with a shared stdlib-backed file-URI decoder based on urllib.request.url2pathname so Windows drive letters and UNC netlocs round-trip correctly without custom path-munging logic. Add a Windows-only public API regression test that exercises CrawlScope(roots=[path.as_uri()]), DirectoryCrawler.origins(), and fetch_raw() on the same file URI. --- src/raghilda/crawl.py | 16 +++++++++++++--- tests/test_crawl.py | 21 +++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py index fe82f99..a6c2b2b 100644 --- a/src/raghilda/crawl.py +++ b/src/raghilda/crawl.py @@ -17,7 +17,8 @@ from typing import Any, Callable, Iterable, Iterator, Mapping, Sequence, TypeVar import threading import unicodedata -from urllib.parse import unquote, urlparse +from urllib.parse import urlparse +from urllib.request import url2pathname import requests @@ -1559,15 +1560,24 @@ def _to_directory_path(root: str | Path) -> Path: return root parsed = urlparse(str(root)) if parsed.scheme == "file": - return Path(unquote(parsed.path)) + return _path_from_file_uri(str(root)) assert parsed.scheme in {"", "file"} return Path(str(root)) +def _path_from_file_uri(origin: str) -> Path: + parsed = urlparse(origin) + assert parsed.scheme == "file" + raw_path = parsed.path + if parsed.netloc and parsed.netloc != "localhost": + raw_path = f"//{parsed.netloc}{parsed.path}" + return Path(url2pathname(raw_path)) + + def _path_from_file_origin(origin: str) -> Path: parsed = urlparse(origin) if parsed.scheme == "file": - return 
Path(unquote(parsed.path)) + return _path_from_file_uri(origin) return Path(origin) diff --git a/tests/test_crawl.py b/tests/test_crawl.py index f7cc6a9..95528bb 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -6,6 +6,7 @@ import hashlib import http.server import json +import os from pathlib import Path import re import socketserver @@ -13,6 +14,7 @@ from typing import Any import unicodedata +import pytest import raghilda.crawl as crawl_module from raghilda.crawl import ( CrawlScope, @@ -1566,6 +1568,25 @@ def test_directory_crawler_accepts_percent_escaped_file_uri_roots( assert origins == [markdown.resolve().as_uri()] +@pytest.mark.skipif(os.name != "nt", reason="Windows-specific file URI handling") +def test_directory_crawler_round_trips_windows_file_uris( + tmp_path: Path, +) -> None: + root = tmp_path / "My Docs" + markdown = _write(root, "read me.md", "# Hello") + crawler = DirectoryCrawler() + + root_uri = root.resolve().as_uri() + origin = markdown.resolve().as_uri() + + origins = list(crawler.origins(CrawlScope(roots=[root_uri]), progress=False)) + source = crawler.fetch_raw(origin) + + assert origins == [origin] + assert source.origin == origin + assert source.body_path == markdown.resolve() + + def test_web_crawler_returns_no_origins_or_requests_when_limit_is_zero( tmp_path: Path, ) -> None: From fb946bd3652df966baecd81799ef06d3a2d5976b Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Wed, 15 Apr 2026 16:42:49 -0400 Subject: [PATCH 04/17] Document markdown refresh reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review finding: [P2] Pass cache_force_refresh through markdown_documents — /Users/tomasz/github/posit-dev/raghilda/.worktrees/feature-crawl-api/src/raghilda/crawl.py:661-664 markdown_documents(..., cache_force_refresh=True) refreshes pages while origins() is discovering them, but this second pass hard-codes cache_force_refresh=False. 
With WebCrawler or CloudflareCrawler and any finite cache_stale_after (for example timedelta(0)), every origin is immediately revalidated/fetched again, so the caller does not actually get a single refreshed crawl and can observe content from a second request/crawl instead of the snapshot used for discovery. Response: Not addressed as a behavior bug. The second pass intentionally keeps cache_force_refresh=False so markdown_documents(..., cache_force_refresh=True) reuses the snapshot refreshed during origins() instead of fetching the same origin twice in one user call. Add an inline comment at the point of concern to make that contract explicit; the existing test_web_markdown_documents_reuses_refreshed_sources regression test continues to encode the single-fetch behavior. --- src/raghilda/crawl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py index a6c2b2b..14ffd66 100644 --- a/src/raghilda/crawl.py +++ b/src/raghilda/crawl.py @@ -662,6 +662,8 @@ def markdown_documents( fn=lambda origin: self.fetch_markdown( origin, convert=convert, + # origins(..., cache_force_refresh=True) already refreshed the source + # for this crawl, so reuse that cached snapshot here. cache_force_refresh=False, ), ) From ded282b10ec4bf0588fb1baa411a27fbf14c1c59 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Wed, 15 Apr 2026 16:44:04 -0400 Subject: [PATCH 05/17] Limit concurrent frontier fetches by remaining crawl budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review finding: [P2] Respect scope.limit before fetching a full frontier batch — /Users/tomasz/github/posit-dev/raghilda/.worktrees/feature-crawl-api/src/raghilda/crawl.py:920-923 Here the entire current-depth batch is submitted to _map_ordered() before yielded >= limit is checked below, so WebCrawler.origins() still requests every root/frontier page even when the first result already satisfies scope.limit. 
For example, with two roots and limit=1, the method yields only the first URL but still fetches the second one, which creates unexpected network traffic and rate-limit pressure despite the public limit being reached. Response: Addressed. Fetch each current-depth frontier in windows bounded by the remaining crawl limit, and cap worker usage to each window, so concurrent WebCrawler.origins() calls stop issuing same-depth requests once the public limit has been satisfied. Add a regression test covering two roots with max_workers=2 and limit=1, which now yields only the first origin and makes only one HTTP request. --- src/raghilda/crawl.py | 98 ++++++++++++++++++++++++------------------- tests/test_crawl.py | 35 ++++++++++++++++ 2 files changed, 91 insertions(+), 42 deletions(-) diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py index 14ffd66..a687b24 100644 --- a/src/raghilda/crawl.py +++ b/src/raghilda/crawl.py @@ -920,51 +920,65 @@ def origins( batch.append((origin, root_host)) next_frontier: list[tuple[str, str]] = [] - fetched_sources = _map_ordered( - batch, - max_workers=self.max_workers, - fn=lambda item: ( - item, - self.fetch_raw( - item[0], - cache_force_refresh=cache_force_refresh, - ), - ), - ) - for (origin, root_host), source in fetched_sources: - type_label = (source.metadata or {}).get("type_label") - matches_patterns = _matches_patterns( - origin, - include_patterns=resolved_scope.include_patterns, - exclude_patterns=resolved_scope.exclude_patterns, + offset = 0 + while offset < len(batch): + remaining = ( + None + if resolved_scope.limit is None + else resolved_scope.limit - yielded ) - matches_types = _matches_types( - type_label, - include_types=resolved_scope.include_types, - exclude_types=resolved_scope.exclude_types, + if remaining == 0: + return + chunk_size = len(batch) - offset + if remaining is not None: + chunk_size = min(chunk_size, remaining) + window = batch[offset : offset + chunk_size] + fetched_sources = _map_ordered( + 
window, + max_workers=min(self.max_workers, len(window)), + fn=lambda item: ( + item, + self.fetch_raw( + item[0], + cache_force_refresh=cache_force_refresh, + ), + ), ) - if matches_patterns and matches_types: - yield origin - yielded += 1 - if ( - resolved_scope.limit is not None - and yielded >= resolved_scope.limit - ): - return - if current_depth >= resolved_scope.depth: - continue - - text = _read_text(source.body_path) - resolved_origin = source.resolved_origin or origin - resolved_host = urlparse(resolved_origin).hostname or root_host - for link in sorted(_extract_links(text)): - canonical = _canonicalize(link, base=resolved_origin) - if canonical is None: - continue - parsed = urlparse(canonical) - if parsed.scheme not in {"http", "https"}: + for (origin, root_host), source in fetched_sources: + type_label = (source.metadata or {}).get("type_label") + matches_patterns = _matches_patterns( + origin, + include_patterns=resolved_scope.include_patterns, + exclude_patterns=resolved_scope.exclude_patterns, + ) + matches_types = _matches_types( + type_label, + include_types=resolved_scope.include_types, + exclude_types=resolved_scope.exclude_types, + ) + if matches_patterns and matches_types: + yield origin + yielded += 1 + if ( + resolved_scope.limit is not None + and yielded >= resolved_scope.limit + ): + return + if current_depth >= resolved_scope.depth: continue - next_frontier.append((canonical, resolved_host)) + + text = _read_text(source.body_path) + resolved_origin = source.resolved_origin or origin + resolved_host = urlparse(resolved_origin).hostname or root_host + for link in sorted(_extract_links(text)): + canonical = _canonicalize(link, base=resolved_origin) + if canonical is None: + continue + parsed = urlparse(canonical) + if parsed.scheme not in {"http", "https"}: + continue + next_frontier.append((canonical, resolved_host)) + offset += chunk_size frontier = next_frontier current_depth += 1 diff --git a/tests/test_crawl.py b/tests/test_crawl.py 
index 95528bb..6a71b9c 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -1609,3 +1609,38 @@ def test_web_crawler_returns_no_origins_or_requests_when_limit_is_zero( assert origins == [] assert getattr(server, "requests") == [] + + +def test_web_crawler_does_not_fetch_extra_root_once_limit_is_reached( + tmp_path: Path, +) -> None: + with _serve( + { + "/first": { + "body": "
First
", + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + "/second": { + "body": "
Second
", + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}" + crawler = WebCrawler( + cache_dir=tmp_path / "limit-cache", + max_workers=2, + ) + scope = CrawlScope( + roots=[f"{root_url}/first", f"{root_url}/second"], + depth=0, + limit=1, + ) + + origins = list(crawler.origins(scope, progress=False)) + requests = [request["path"] for request in getattr(server, "requests")] + + assert origins == [f"{root_url}/first"] + assert requests == ["/first"] From fcb36deef154c409763e452a515b110961ecae14 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Wed, 15 Apr 2026 18:29:07 -0400 Subject: [PATCH 06/17] ci: add live openai store diagnostics --- .github/workflows/check.yml | 2 +- tests/test_store.py | 143 ++++++++++++++++++++++++++++++++---- 2 files changed, 130 insertions(+), 15 deletions(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 1c29f91..6483cb9 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -42,4 +42,4 @@ jobs: run: uv sync --all-extras - name: Run checks - run: PYTEST_ADDOPTS=-vv uv run task check + run: PYTEST_ADDOPTS='-vv -o log_cli=true --log-cli-level=INFO' uv run task check diff --git a/tests/test_store.py b/tests/test_store.py index 77301b2..5642da2 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1,6 +1,7 @@ import os import hashlib import json +import logging import subprocess import sys import textwrap @@ -23,6 +24,29 @@ from raghilda.embedding import EmbeddingOpenAI from raghilda._embedding import EmbeddingProvider, EmbedInputType +logger = logging.getLogger(__name__) + + +def _log_openai_ci_event(label: str, event: str, **fields: object) -> None: + suffix = " ".join(f"{key}={value}" for key, value in fields.items()) + if suffix: + logger.info("OPENAI-CI %s %s %s", label, event, suffix) + else: + logger.info("OPENAI-CI %s %s", label, event) + + +def _upsert_openai_fixture_document( + store: 
OpenAIStore, + document: MarkdownDocument, + *, + label: str, + name: str, +) -> None: + assert isinstance(document.origin, str) + _log_openai_ci_event(label, "upsert_start", name=name, origin=document.origin) + store.upsert(document) + _log_openai_ci_event(label, "upsert_done", name=name, origin=document.origin) + class CountingEmbedding(EmbeddingProvider): def __init__(self): @@ -1558,87 +1582,178 @@ def setup(self): test_helpers.skip_if_no_openai() @pytest.fixture - def store(self): + def store(self, request): + label = request.node.nodeid + _log_openai_ci_event(label, "create_start") store = OpenAIStore.create() + _log_openai_ci_event(label, "create_done", store_id=store.store_id) try: yield store finally: + _log_openai_ci_event(label, "delete_start", store_id=store.store_id) try: store.client.vector_stores.delete(vector_store_id=store.store_id) except openai.AuthenticationError: - pass + _log_openai_ci_event( + label, "delete_auth_error", store_id=store.store_id + ) + else: + _log_openai_ci_event(label, "delete_done", store_id=store.store_id) @pytest.fixture(scope="class") - def store_with_attributes(self): + def store_with_attributes(self, request): + label = request.node.nodeid + _log_openai_ci_event(label, "create_start", fixture="store_with_attributes") store = OpenAIStore.create(attributes={"tenant": str, "priority": int}) - store.upsert( + _log_openai_ci_event( + label, + "create_done", + fixture="store_with_attributes", + store_id=store.store_id, + ) + _upsert_openai_fixture_document( + store, MarkdownDocument( origin="doc-attrs", content="alpha bronze owl", attributes={"tenant": "docs", "priority": 2}, ), + label=label, + name="doc_attrs", ) - store.upsert( + _upsert_openai_fixture_document( + store, MarkdownDocument( origin="docs-priority-1", content="alpha beta", attributes={"tenant": "docs", "priority": 1}, ), + label=label, + name="docs_priority_1", ) - store.upsert( + _upsert_openai_fixture_document( + store, MarkdownDocument( 
origin="ops-priority-5", content="alpha gamma", attributes={"tenant": "ops", "priority": 5}, ), + label=label, + name="ops_priority_5", ) - store.upsert( + _upsert_openai_fixture_document( + store, MarkdownDocument( origin="docs-priority-3", content="alpha alpha delta", attributes={"tenant": "docs", "priority": 3}, ), + label=label, + name="docs_priority_3", ) try: yield store finally: + _log_openai_ci_event( + label, + "delete_start", + fixture="store_with_attributes", + store_id=store.store_id, + ) try: store.client.vector_stores.delete(vector_store_id=store.store_id) except openai.AuthenticationError: - pass + _log_openai_ci_event( + label, + "delete_auth_error", + fixture="store_with_attributes", + store_id=store.store_id, + ) + else: + _log_openai_ci_event( + label, + "delete_done", + fixture="store_with_attributes", + store_id=store.store_id, + ) @pytest.fixture - def store_with_class_attributes(self): + def store_with_class_attributes(self, request): class AttributesSpec: tenant: str priority: int + label = request.node.nodeid + _log_openai_ci_event( + label, "create_start", fixture="store_with_class_attributes" + ) store = OpenAIStore.create(attributes=AttributesSpec) + _log_openai_ci_event( + label, + "create_done", + fixture="store_with_class_attributes", + store_id=store.store_id, + ) try: yield store finally: + _log_openai_ci_event( + label, + "delete_start", + fixture="store_with_class_attributes", + store_id=store.store_id, + ) try: store.client.vector_stores.delete(vector_store_id=store.store_id) except openai.AuthenticationError: - pass + _log_openai_ci_event( + label, + "delete_auth_error", + fixture="store_with_class_attributes", + store_id=store.store_id, + ) + else: + _log_openai_ci_event( + label, + "delete_done", + fixture="store_with_class_attributes", + store_id=store.store_id, + ) @pytest.fixture - def store_with_docs(self, store): + def store_with_docs(self, store, request): + label = request.node.nodeid doc = MarkdownDocument( 
origin="test", content="hello world this is a document world world world" ) - store.upsert(doc) + _upsert_openai_fixture_document( + store, + doc, + label=label, + name="store_with_docs", + ) return store def test_create_store(self, store): assert isinstance(store, OpenAIStore) assert isinstance(store.store_id, str) - def test_insert(self, store_with_docs): + def test_insert(self, store_with_docs, request): + label = request.node.nodeid + _log_openai_ci_event(label, "size_start", store_id=store_with_docs.store_id) assert store_with_docs.size() == 1 + _log_openai_ci_event(label, "size_done", store_id=store_with_docs.store_id) - def test_retrieve(self, store_with_docs): + def test_retrieve(self, store_with_docs, request): + label = request.node.nodeid + _log_openai_ci_event(label, "retrieve_start", store_id=store_with_docs.store_id) results = store_with_docs.retrieve("world", top_k=3) + _log_openai_ci_event( + label, + "retrieve_done", + store_id=store_with_docs.store_id, + result_count=len(results), + ) assert len(results) > 0 for chunk in results: assert isinstance(chunk, RetrievedChunk) From ba807cee301145007187a43106182164a0130e46 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Mon, 1 Jun 2026 19:30:45 -0400 Subject: [PATCH 07/17] ci: rerun crawl api checks From 246dc1cf5fc3a9c1827b7a35bdd8a6b819e8bebc Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Mon, 1 Jun 2026 20:11:31 -0400 Subject: [PATCH 08/17] Remove OpenAI CI diagnostics --- .github/workflows/check.yml | 2 +- tests/test_store.py | 143 ++++-------------------------------- 2 files changed, 15 insertions(+), 130 deletions(-) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 6483cb9..0bb8fbe 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -42,4 +42,4 @@ jobs: run: uv sync --all-extras - name: Run checks - run: PYTEST_ADDOPTS='-vv -o log_cli=true --log-cli-level=INFO' uv run task check + run: uv run task check \ No newline at end of file 
diff --git a/tests/test_store.py b/tests/test_store.py index 5642da2..77301b2 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1,7 +1,6 @@ import os import hashlib import json -import logging import subprocess import sys import textwrap @@ -24,29 +23,6 @@ from raghilda.embedding import EmbeddingOpenAI from raghilda._embedding import EmbeddingProvider, EmbedInputType -logger = logging.getLogger(__name__) - - -def _log_openai_ci_event(label: str, event: str, **fields: object) -> None: - suffix = " ".join(f"{key}={value}" for key, value in fields.items()) - if suffix: - logger.info("OPENAI-CI %s %s %s", label, event, suffix) - else: - logger.info("OPENAI-CI %s %s", label, event) - - -def _upsert_openai_fixture_document( - store: OpenAIStore, - document: MarkdownDocument, - *, - label: str, - name: str, -) -> None: - assert isinstance(document.origin, str) - _log_openai_ci_event(label, "upsert_start", name=name, origin=document.origin) - store.upsert(document) - _log_openai_ci_event(label, "upsert_done", name=name, origin=document.origin) - class CountingEmbedding(EmbeddingProvider): def __init__(self): @@ -1582,178 +1558,87 @@ def setup(self): test_helpers.skip_if_no_openai() @pytest.fixture - def store(self, request): - label = request.node.nodeid - _log_openai_ci_event(label, "create_start") + def store(self): store = OpenAIStore.create() - _log_openai_ci_event(label, "create_done", store_id=store.store_id) try: yield store finally: - _log_openai_ci_event(label, "delete_start", store_id=store.store_id) try: store.client.vector_stores.delete(vector_store_id=store.store_id) except openai.AuthenticationError: - _log_openai_ci_event( - label, "delete_auth_error", store_id=store.store_id - ) - else: - _log_openai_ci_event(label, "delete_done", store_id=store.store_id) + pass @pytest.fixture(scope="class") - def store_with_attributes(self, request): - label = request.node.nodeid - _log_openai_ci_event(label, "create_start", fixture="store_with_attributes") + 
def store_with_attributes(self): store = OpenAIStore.create(attributes={"tenant": str, "priority": int}) - _log_openai_ci_event( - label, - "create_done", - fixture="store_with_attributes", - store_id=store.store_id, - ) - _upsert_openai_fixture_document( - store, + store.upsert( MarkdownDocument( origin="doc-attrs", content="alpha bronze owl", attributes={"tenant": "docs", "priority": 2}, ), - label=label, - name="doc_attrs", ) - _upsert_openai_fixture_document( - store, + store.upsert( MarkdownDocument( origin="docs-priority-1", content="alpha beta", attributes={"tenant": "docs", "priority": 1}, ), - label=label, - name="docs_priority_1", ) - _upsert_openai_fixture_document( - store, + store.upsert( MarkdownDocument( origin="ops-priority-5", content="alpha gamma", attributes={"tenant": "ops", "priority": 5}, ), - label=label, - name="ops_priority_5", ) - _upsert_openai_fixture_document( - store, + store.upsert( MarkdownDocument( origin="docs-priority-3", content="alpha alpha delta", attributes={"tenant": "docs", "priority": 3}, ), - label=label, - name="docs_priority_3", ) try: yield store finally: - _log_openai_ci_event( - label, - "delete_start", - fixture="store_with_attributes", - store_id=store.store_id, - ) try: store.client.vector_stores.delete(vector_store_id=store.store_id) except openai.AuthenticationError: - _log_openai_ci_event( - label, - "delete_auth_error", - fixture="store_with_attributes", - store_id=store.store_id, - ) - else: - _log_openai_ci_event( - label, - "delete_done", - fixture="store_with_attributes", - store_id=store.store_id, - ) + pass @pytest.fixture - def store_with_class_attributes(self, request): + def store_with_class_attributes(self): class AttributesSpec: tenant: str priority: int - label = request.node.nodeid - _log_openai_ci_event( - label, "create_start", fixture="store_with_class_attributes" - ) store = OpenAIStore.create(attributes=AttributesSpec) - _log_openai_ci_event( - label, - "create_done", - 
fixture="store_with_class_attributes", - store_id=store.store_id, - ) try: yield store finally: - _log_openai_ci_event( - label, - "delete_start", - fixture="store_with_class_attributes", - store_id=store.store_id, - ) try: store.client.vector_stores.delete(vector_store_id=store.store_id) except openai.AuthenticationError: - _log_openai_ci_event( - label, - "delete_auth_error", - fixture="store_with_class_attributes", - store_id=store.store_id, - ) - else: - _log_openai_ci_event( - label, - "delete_done", - fixture="store_with_class_attributes", - store_id=store.store_id, - ) + pass @pytest.fixture - def store_with_docs(self, store, request): - label = request.node.nodeid + def store_with_docs(self, store): doc = MarkdownDocument( origin="test", content="hello world this is a document world world world" ) - _upsert_openai_fixture_document( - store, - doc, - label=label, - name="store_with_docs", - ) + store.upsert(doc) return store def test_create_store(self, store): assert isinstance(store, OpenAIStore) assert isinstance(store.store_id, str) - def test_insert(self, store_with_docs, request): - label = request.node.nodeid - _log_openai_ci_event(label, "size_start", store_id=store_with_docs.store_id) + def test_insert(self, store_with_docs): assert store_with_docs.size() == 1 - _log_openai_ci_event(label, "size_done", store_id=store_with_docs.store_id) - def test_retrieve(self, store_with_docs, request): - label = request.node.nodeid - _log_openai_ci_event(label, "retrieve_start", store_id=store_with_docs.store_id) + def test_retrieve(self, store_with_docs): results = store_with_docs.retrieve("world", top_k=3) - _log_openai_ci_event( - label, - "retrieve_done", - store_id=store_with_docs.store_id, - result_count=len(results), - ) assert len(results) > 0 for chunk in results: assert isinstance(chunk, RetrievedChunk) From 9ded2bcff5eb8d60ade63a763d502046f3bd38d9 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Mon, 1 Jun 2026 20:44:57 -0400 Subject: [PATCH 
09/17] Clarify crawl cache and ingest summary contracts --- src/raghilda/_store.py | 7 +++++++ src/raghilda/crawl.py | 7 +++++++ tests/test_crawl.py | 17 +++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/src/raghilda/_store.py b/src/raghilda/_store.py index f7b7290..97fd943 100644 --- a/src/raghilda/_store.py +++ b/src/raghilda/_store.py @@ -101,6 +101,13 @@ def ingest( duplicates as the stream is consumed. Duplicate detection is best effort: a duplicate raises ``ValueError`` when encountered, after any writes already in flight complete. No rollback is attempted. + + Returns + ------- + IngestSummary + Aggregate counts for inserted, replaced, and skipped documents. + Call ``upsert()`` directly when per-document ``WriteResult`` values + are needed. """ assert max_workers >= 1 stop_event = threading.Event() diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py index a687b24..ce18a46 100644 --- a/src/raghilda/crawl.py +++ b/src/raghilda/crawl.py @@ -673,6 +673,13 @@ def _default_convert(self, source: FetchedSource) -> MarkdownDocument: class DirectoryCrawler(BaseCrawler): + """Crawl local files and optionally cache converted markdown. + + Directory traversal always reads the current filesystem state. The cache + stores converted markdown per file origin and is reused only when the + current file hash and modification time still match the cached metadata. 
+ """ + def __init__( self, *, diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 6a71b9c..792d8ef 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -1450,6 +1450,23 @@ def test_directory_crawler_returns_no_origins_when_limit_is_zero( assert origins == [] +def test_directory_crawler_fetch_markdown_refreshes_when_file_changes( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello") + cache = tmp_path / "cache" + crawler = DirectoryCrawler(cache_dir=cache) + + origin = markdown.resolve().as_uri() + first = crawler.fetch_markdown(origin) + markdown.write_text("# Updated\n", encoding="utf-8") + + refreshed = crawler.fetch_markdown(origin) + + assert first == MarkdownDocument(origin=origin, content="# Hello") + assert refreshed == MarkdownDocument(origin=origin, content="# Updated\n") + + def test_directory_crawler_fetch_markdown_force_refresh_rebuilds_cached_markdown( tmp_path: Path, ) -> None: From e7222392929cbdaec860f47908a61f55e6ffbe07 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Mon, 1 Jun 2026 20:57:13 -0400 Subject: [PATCH 10/17] Fix crawler scope and cache traversal regressions Review findings: - [P2] Skip the directory cache while walking roots - [P2] Preserve the root host for subdomain traversal - [P2] Do not mark out-of-scope URLs visited globally Response: - Exclude DirectoryCrawler's own cache directory from directory-origin walks. - Keep the original scope host while traversing included subdomains so sibling subdomains under the requested host remain in scope. - Mark web origins visited only after they pass the current scope check, so an out-of-scope occurrence for one root does not suppress a later in-scope occurrence for another root. - Add public API regression tests covering all three cases. 
--- src/raghilda/crawl.py | 22 +++++++++--- tests/test_crawl.py | 82 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 4 deletions(-) diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py index ce18a46..de192ff 100644 --- a/src/raghilda/crawl.py +++ b/src/raghilda/crawl.py @@ -706,12 +706,16 @@ def origins( resolved_scope = _resolve_crawl_scope(scope) if resolved_scope.limit == 0: return + cache_root = self.cache_dir.resolve() if self.cache_dir is not None else None count = 0 for root in resolved_scope.roots: path = _to_directory_path(root) assert path.exists(), f"Root does not exist: {path}" if path.is_file(): - origin = path.resolve().as_uri() + resolved_path = path.resolve() + if cache_root is not None and resolved_path.is_relative_to(cache_root): + continue + origin = resolved_path.as_uri() if self._include_path( path, origin, @@ -731,10 +735,15 @@ def origins( for file_path in sorted(path.rglob("*")): if not file_path.is_file(): continue + resolved_file_path = file_path.resolve() + if cache_root is not None and resolved_file_path.is_relative_to( + cache_root + ): + continue relative_depth = len(file_path.relative_to(path).parts) - 1 if relative_depth > resolved_scope.depth: continue - origin = file_path.resolve().as_uri() + origin = resolved_file_path.as_uri() if not self._include_path( file_path, origin, @@ -916,7 +925,6 @@ def origins( for origin, root_host in frontier: if origin in visited: continue - visited.add(origin) if not self._allow_origin( origin, root_host, @@ -924,6 +932,7 @@ def origins( include_subdomains=resolved_scope.include_subdomains, ): continue + visited.add(origin) batch.append((origin, root_host)) next_frontier: list[tuple[str, str]] = [] @@ -977,6 +986,11 @@ def origins( text = _read_text(source.body_path) resolved_origin = source.resolved_origin or origin resolved_host = urlparse(resolved_origin).hostname or root_host + child_root_host = ( + root_host + if resolved_scope.include_subdomains + else 
resolved_host + ) for link in sorted(_extract_links(text)): canonical = _canonicalize(link, base=resolved_origin) if canonical is None: @@ -984,7 +998,7 @@ def origins( parsed = urlparse(canonical) if parsed.scheme not in {"http", "https"}: continue - next_frontier.append((canonical, resolved_host)) + next_frontier.append((canonical, child_root_host)) offset += chunk_size frontier = next_frontier current_depth += 1 diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 792d8ef..41256c9 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -457,6 +457,70 @@ def test_web_crawler_include_subdomains_stays_within_requested_host_tree( assert disallowed_sibling not in origins +def test_web_crawler_include_subdomains_keeps_original_scope_host( + tmp_path: Path, +) -> None: + root = "https://docs.example.com/start" + api = "https://api.docs.example.com/page" + cdn = "https://cdn.docs.example.com/asset" + session: Any = _FakeWebSession( + { + root: { + "body": f'API', + }, + api: { + "body": f'CDN', + }, + cdn: {"body": "
CDN
"}, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "subdomain-root-host-cache", + session=session, + ) + scope = CrawlScope( + roots=[root], + depth=2, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root, api, cdn] + + +def test_web_crawler_allows_later_in_scope_occurrence_of_same_url( + tmp_path: Path, +) -> None: + first_root = "https://alpha.example.com/start" + second_root = "https://docs.example.com/start" + shared = "https://api.docs.example.com/page" + session: Any = _FakeWebSession( + { + first_root: { + "body": f'Shared', + }, + second_root: { + "body": f'Shared', + }, + shared: {"body": "
Shared
"}, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "multi-root-visited-cache", + session=session, + ) + scope = CrawlScope( + roots=[first_root, second_root], + depth=1, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [first_root, second_root, shared] + + def test_web_crawler_discovers_matching_descendants_from_filtered_seed( tmp_path: Path, ) -> None: @@ -1467,6 +1531,24 @@ def test_directory_crawler_fetch_markdown_refreshes_when_file_changes( assert refreshed == MarkdownDocument(origin=origin, content="# Updated\n") +def test_directory_crawler_excludes_own_cache_files_from_directory_walk( + tmp_path: Path, + monkeypatch, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello") + monkeypatch.chdir(tmp_path) + crawler = DirectoryCrawler(cache_dir=True) + scope = CrawlScope(roots=[tmp_path]) + + documents = list(crawler.markdown_documents(scope, progress=False)) + origins = list(crawler.origins(scope, progress=False)) + + assert documents == [ + MarkdownDocument(origin=markdown.resolve().as_uri(), content="# Hello") + ] + assert origins == [markdown.resolve().as_uri()] + + def test_directory_crawler_fetch_markdown_force_refresh_rebuilds_cached_markdown( tmp_path: Path, ) -> None: From 57d39c0b6456676077fb5821bf06640dc0814c9e Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Mon, 1 Jun 2026 21:20:57 -0400 Subject: [PATCH 11/17] Fix web crawler scope revisits and content type handling Review findings: - [P2] Track visited URLs per root host - [P2] Use HTTP Content-Type before URL suffix Response: - Track web crawler visited state by the effective root host while still yielding each origin at most once, so a shared page can be traversed again under a broader multi-root subdomain scope. - Prefer recognized HTTP Content-Type values when choosing the cached body suffix, so rendered HTML served from markdown-like URLs is converted as HTML. 
- Add public API regression tests for both cases. --- src/raghilda/crawl.py | 23 +++++++++++------ tests/test_crawl.py | 59 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 8 deletions(-) diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py index de192ff..7482388 100644 --- a/src/raghilda/crawl.py +++ b/src/raghilda/crawl.py @@ -907,7 +907,8 @@ def origins( resolved_scope = _resolve_crawl_scope(scope) if resolved_scope.limit == 0: return - visited: set[str] = set() + visited: set[tuple[str, str]] = set() + yielded_origins: set[str] = set() yielded = 0 frontier: list[tuple[str, str]] = [] @@ -923,7 +924,8 @@ def origins( while frontier: batch: list[tuple[str, str]] = [] for origin, root_host in frontier: - if origin in visited: + visit_key = (origin, root_host) + if visit_key in visited: continue if not self._allow_origin( origin, @@ -932,7 +934,7 @@ def origins( include_subdomains=resolved_scope.include_subdomains, ): continue - visited.add(origin) + visited.add(visit_key) batch.append((origin, root_host)) next_frontier: list[tuple[str, str]] = [] @@ -972,8 +974,13 @@ def origins( include_types=resolved_scope.include_types, exclude_types=resolved_scope.exclude_types, ) - if matches_patterns and matches_types: + if ( + matches_patterns + and matches_types + and origin not in yielded_origins + ): yield origin + yielded_origins.add(origin) yielded += 1 if ( resolved_scope.limit is not None @@ -1714,10 +1721,6 @@ def _read_text(path: Path) -> str: def _known_body_suffix(origin: str, *, content_type: str | None) -> str | None: - parsed = urlparse(origin) - suffix = Path(parsed.path).suffix - if suffix: - return suffix normalized = _normalize_content_type(content_type) if normalized == "text/html": return ".html" @@ -1727,6 +1730,10 @@ def _known_body_suffix(origin: str, *, content_type: str | None) -> str | None: return ".json" if normalized == "application/pdf": return ".pdf" + parsed = urlparse(origin) + suffix = 
Path(parsed.path).suffix + if suffix: + return suffix return None diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 41256c9..033d02f 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -521,6 +521,42 @@ def test_web_crawler_allows_later_in_scope_occurrence_of_same_url( assert origins == [first_root, second_root, shared] +def test_web_crawler_revisits_shared_page_for_broader_subdomain_scope( + tmp_path: Path, +) -> None: + narrow_root = "https://api.docs.example.com/start" + broad_root = "https://docs.example.com/start" + shared = "https://api.docs.example.com/shared" + sibling = "https://cdn.docs.example.com/asset" + session: Any = _FakeWebSession( + { + narrow_root: { + "body": f'Shared', + }, + broad_root: { + "body": f'Shared', + }, + shared: { + "body": f'Sibling', + }, + sibling: {"body": "
Sibling
"}, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "multi-root-subdomain-cache", + session=session, + ) + scope = CrawlScope( + roots=[narrow_root, broad_root], + depth=2, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [narrow_root, broad_root, shared, sibling] + + def test_web_crawler_discovers_matching_descendants_from_filtered_seed( tmp_path: Path, ) -> None: @@ -902,6 +938,29 @@ def test_web_crawler_uses_magika_when_no_explicit_ext_is_available( assert source.body_path == cache_dir / f"{base}.html" +def test_web_crawler_prefers_content_type_over_misleading_url_suffix( + tmp_path: Path, +) -> None: + origin = "https://example.com/README.md" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Rendered Readme
", + "content_type": "text/html; charset=utf-8", + } + } + ) + cache_dir = tmp_path / "content-type-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + document = crawler.fetch_markdown(origin) + + base = _expected_cache_base(origin) + assert source.body_path == cache_dir / f"{base}.html" + assert document == MarkdownDocument(origin=origin, content="Rendered Readme") + + def test_web_crawler_falls_back_to_raw_when_magika_is_unavailable( tmp_path: Path, monkeypatch, From 4350a9bf65678ad64d2776e8aff6c95bbc17b420 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Mon, 1 Jun 2026 21:43:34 -0400 Subject: [PATCH 12/17] Preserve escaped web URLs and reuse crawl snapshots Review findings: - [P2] Preserve escaped URL path segments - [P2] Avoid refetching freshly crawled sources Response: - Add a web URL normalizer that preserves reserved percent escapes such as %2F, and use it for WebCrawler and CloudflareCrawler origins. - Reuse sources materialized during origins() when markdown_documents() converts discovered origins, so immediately stale cache policies do not trigger a second network request or crawl. - Add public API regression tests for escaped web origins and immediately stale WebCrawler and CloudflareCrawler markdown conversion. 
--- src/raghilda/crawl.py | 79 +++++++++++++++++++++++++++++++++++-------- tests/test_crawl.py | 71 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 15 deletions(-) diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py index 7482388..1df4492 100644 --- a/src/raghilda/crawl.py +++ b/src/raghilda/crawl.py @@ -17,14 +17,14 @@ from typing import Any, Callable, Iterable, Iterator, Mapping, Sequence, TypeVar import threading import unicodedata -from urllib.parse import urlparse +from urllib.parse import urldefrag, urljoin, urlparse from urllib.request import url2pathname import requests from .document import MarkdownDocument from .read import _convert_to_markdown -from .scrape import _canonicalize, _extract_links +from .scrape import _extract_links try: from magika import Magika @@ -643,6 +643,19 @@ def fetch_markdown( converter = convert or self._default_convert return converter(source) + def _fetch_markdown_after_origin_discovery( + self, + origin: str, + *, + convert: Callable[[FetchedSource], MarkdownDocument] | None = None, + ) -> MarkdownDocument: + source = self._fetch_raw_after_origin_discovery(origin) + converter = convert or self._default_convert + return converter(source) + + def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource: + return self.fetch_raw(origin, cache_force_refresh=False) + def markdown_documents( self, scope: CrawlScope, @@ -659,12 +672,9 @@ def markdown_documents( yield from _map_ordered( origins, max_workers=self.max_workers, - fn=lambda origin: self.fetch_markdown( + fn=lambda origin: self._fetch_markdown_after_origin_discovery( origin, convert=convert, - # origins(..., cache_force_refresh=True) already refreshed the source - # for this crawl, so reuse that cached snapshot here. 
- cache_force_refresh=False, ), ) @@ -913,7 +923,7 @@ def origins( frontier: list[tuple[str, str]] = [] for root in resolved_scope.roots: - canonical_root = _canonicalize(str(root)) + canonical_root = _canonicalize_web_url(str(root)) assert canonical_root is not None parsed = urlparse(canonical_root) assert parsed.scheme in {"http", "https"} @@ -999,7 +1009,7 @@ def origins( else resolved_host ) for link in sorted(_extract_links(text)): - canonical = _canonicalize(link, base=resolved_origin) + canonical = _canonicalize_web_url(link, base=resolved_origin) if canonical is None: continue parsed = urlparse(canonical) @@ -1016,7 +1026,7 @@ def fetch_raw( *, cache_force_refresh: bool = False, ) -> FetchedSource: - canonical_origin = _canonicalize(origin) + canonical_origin = _canonicalize_web_url(origin) assert canonical_origin is not None parsed = urlparse(canonical_origin) assert parsed.scheme in {"http", "https"} @@ -1064,7 +1074,7 @@ def fetch_raw( response.raise_for_status() content_type = response.headers.get("Content-Type") - resolved_origin = _canonicalize(response.url) or response.url + resolved_origin = _canonicalize_web_url(response.url) or response.url type_label = _detect_type_label( path=_type_hint_path(canonical_origin, content_type=content_type), content_type=content_type, @@ -1095,6 +1105,16 @@ def fetch_raw( assert meta is not None return self._source_from_meta(meta, body_path=body_path) + def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource: + canonical_origin = _canonicalize_web_url(origin) + assert canonical_origin is not None + cached_entry = self._cache.fetch(canonical_origin) + assert cached_entry is not None + body_path, cached_meta = cached_entry + assert body_path is not None + assert cached_meta is not None + return self._source_from_meta(cached_meta, body_path=body_path) + def _default_convert(self, source: FetchedSource) -> MarkdownDocument: type_label = (source.metadata or {}).get("type_label") if type_label == 
"markdown": @@ -1231,7 +1251,7 @@ def origins( for root in resolved_scope.roots: if resolved_scope.limit is not None and yielded >= resolved_scope.limit: return - canonical_root = _canonicalize(str(root)) + canonical_root = _canonicalize_web_url(str(root)) assert canonical_root is not None remaining = ( None if resolved_scope.limit is None else resolved_scope.limit - yielded @@ -1269,7 +1289,7 @@ def fetch_raw( *, cache_force_refresh: bool = False, ) -> FetchedSource: - canonical_origin = _canonicalize(origin) + canonical_origin = _canonicalize_web_url(origin) assert canonical_origin is not None record_entry = ( None if cache_force_refresh else self._records.get(canonical_origin) @@ -1299,16 +1319,32 @@ def fetch_raw( if record is None: raise ValueError(f"Cloudflare crawl did not return record for {origin}") record_entry = self._records[canonical_origin] - else: - record = record_entry.record assert record_entry is not None + return self._source_from_record_entry(canonical_origin, record_entry) + + def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource: + canonical_origin = _canonicalize_web_url(origin) + assert canonical_origin is not None + record_entry = self._records.get(canonical_origin) + if record_entry is None: + record_entry = self._load_record_cache_entry(canonical_origin) + assert record_entry is not None + self._records[canonical_origin] = record_entry + return self._source_from_record_entry(canonical_origin, record_entry) + + def _source_from_record_entry( + self, + canonical_origin: str, + record_entry: _CloudflareRecordCacheEntry, + ) -> FetchedSource: content_path, _ = self._store_record_cache_entry( canonical_origin, - record=record, + record=record_entry.record, fetched_at=record_entry.fetched_at, ) assert content_path is not None + record = record_entry.record return FetchedSource( origin=canonical_origin, resolved_origin=record.get("metadata", {}).get("url", canonical_origin), @@ -1584,6 +1620,19 @@ def 
_resolve_crawl_scope(scope: CrawlScope) -> _ResolvedCrawlScope: ) +def _canonicalize_web_url(target: str, *, base: str | None = None) -> str | None: + url = urljoin(base, target) if base else target + if not url: + return None + url, _ = urldefrag(url) + parsed = urlparse(url) + if parsed.scheme not in {"http", "https"}: + return None + if not parsed.netloc: + return None + return url + + def _resolve_cache_dir( cache_dir: bool | str | Path | None, *, diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 033d02f..7d1f007 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -651,6 +651,34 @@ def test_web_markdown_documents_reuses_refreshed_sources( assert len(root_requests) == 1 +def test_web_markdown_documents_reuses_immediately_stale_discovery_cache( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler( + cache_dir=tmp_path / "stale-markdown-docs-cache", + cache_stale_after=timedelta(seconds=0), + ) + scope = CrawlScope(roots=[root_url], depth=0) + + documents = list(crawler.markdown_documents(scope, progress=False)) + root_requests = [ + request for request in getattr(server, "requests") if request["path"] == "/" + ] + + assert documents == [MarkdownDocument(origin=root_url, content="Root")] + assert len(root_requests) == 1 + + def test_web_crawler_fetches_same_depth_frontier_concurrently(tmp_path: Path) -> None: root = "https://example.com/docs" first = "https://example.com/docs/one" @@ -961,6 +989,27 @@ def test_web_crawler_prefers_content_type_over_misleading_url_suffix( assert document == MarkdownDocument(origin=origin, content="Rendered Readme") +def test_web_crawler_preserves_reserved_escapes_in_requested_origin( + tmp_path: Path, +) -> None: + origin = "https://example.com/a%2Fb" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Escaped
", + } + } + ) + cache_dir = tmp_path / "escaped-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + assert session.requests == [(origin, {})] + assert source.origin == origin + assert source.body_path == cache_dir / f"{_expected_cache_base(origin)}.html" + + def test_web_crawler_falls_back_to_raw_when_magika_is_unavailable( tmp_path: Path, monkeypatch, @@ -1239,6 +1288,28 @@ def test_cloudflare_crawler_polls_job_and_uses_markdown_records( assert len(session.post_calls) == 1 +def test_cloudflare_markdown_documents_reuses_immediately_stale_discovery_cache( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-stale-cache", + session=session, + cache_stale_after=timedelta(seconds=0), + poll_interval=0, + ) + scope = CrawlScope(roots=["https://example.com/docs"], depth=0) + + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert documents == [ + MarkdownDocument(origin="https://example.com/docs", content="# Docs\n") + ] + assert len(session.post_calls) == 1 + + def test_cloudflare_crawler_accepts_crawl_scope_for_roots_and_patterns( tmp_path: Path, ) -> None: From 05751a39f6b7ac46f6a7f3df6bc2668263de2bee Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Tue, 2 Jun 2026 03:56:05 -0400 Subject: [PATCH 13/17] Fix crawl ingest cache and scope correctness Address crawl review findings around cache safety, URL canonicalization, Cloudflare scope enforcement, and duplicate origin handling. Cover the regressions through public crawler and ingest tests. 
--- src/raghilda/_postgres_store.py | 9 + src/raghilda/_store.py | 17 +- src/raghilda/crawl.py | 460 ++++++++++-- src/raghilda/scrape.py | 2 +- tests/test_crawl.py | 1239 +++++++++++++++++++++++++++++-- tests/test_store_ingest.py | 51 ++ 6 files changed, 1663 insertions(+), 115 deletions(-) diff --git a/src/raghilda/_postgres_store.py b/src/raghilda/_postgres_store.py index 6314122..6aadd29 100644 --- a/src/raghilda/_postgres_store.py +++ b/src/raghilda/_postgres_store.py @@ -1,5 +1,6 @@ from ._store import BaseStore, WriteResult import json +import threading from .embedding import EmbeddingProvider, EmbedInputType, embedding_from_config from .document import Document, ChunkedMarkdownDocument from .chunk import Chunk, MarkdownChunk, RetrievedChunk, Metric @@ -137,6 +138,7 @@ def __init__( self.con = con self._metadata = metadata self._schema = psycopg2.extensions.quote_ident(schema, con) + self._ingest_upsert_lock = threading.Lock() def close(self) -> None: """Close the store's database connection.""" @@ -539,6 +541,13 @@ def upsert( replaced_document=replaced_document, ) + def _ingest_upsert( + self, + document: Document, + ) -> WriteResult[ChunkedMarkdownDocument]: + with self._ingest_upsert_lock: + return self.upsert(document) + def _load_document_snapshot( self, *, origin: str, text: str ) -> ChunkedMarkdownDocument: diff --git a/src/raghilda/_store.py b/src/raghilda/_store.py index 97fd943..c4440a1 100644 --- a/src/raghilda/_store.py +++ b/src/raghilda/_store.py @@ -87,6 +87,9 @@ def upsert( """ pass + def _ingest_upsert(self, document: Document) -> WriteResult[Document]: + return self.upsert(document) + def ingest( self, documents: Iterable[Any], @@ -134,7 +137,7 @@ def process_document(item: Any) -> WriteResult[Document]: remember_origin(document.origin) if stop_event.is_set(): raise CancelledError() - return self.upsert(document) + return self._ingest_upsert(document) iterator = iter(documents) pending = set() @@ -155,11 +158,19 @@ def 
process_document(item: Any) -> WriteResult[Document]: while pending: done, pending = wait(pending, return_when=FIRST_COMPLETED) results = [] + cancelled_errors = [] + errors = [] for future in done: try: results.append(future.result()) - except CancelledError: - continue + except CancelledError as exc: + cancelled_errors.append(exc) + except Exception as exc: + errors.append(exc) + if errors: + raise errors[0] + if cancelled_errors and not stop_event.is_set(): + raise cancelled_errors[0] for result in results: if result.action == "inserted": inserted += 1 diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py index 1df4492..6e46179 100644 --- a/src/raghilda/crawl.py +++ b/src/raghilda/crawl.py @@ -9,6 +9,7 @@ import hashlib import json import mimetypes +import os from pathlib import Path import re import shutil @@ -17,7 +18,7 @@ from typing import Any, Callable, Iterable, Iterator, Mapping, Sequence, TypeVar import threading import unicodedata -from urllib.parse import urldefrag, urljoin, urlparse +from urllib.parse import urldefrag, urljoin, urlparse, urlunparse from urllib.request import url2pathname import requests @@ -84,6 +85,7 @@ RootsInput = RootInput | Sequence[RootInput] CacheValue = tuple[Path | None, dict[str, Any] | None] CacheEntry = tuple[str, Path | None, dict[str, Any] | None] +WebOriginKey = tuple[str, str, int | None] TInput = TypeVar("TInput") TOutput = TypeVar("TOutput") @@ -223,6 +225,9 @@ def fetch(self, key: str) -> CacheValue | None: if record is None: self._cleanup_broken_metadata_path(metadata_path) return None + if record["key"] != key: + self._cleanup_mismatched_metadata_path(metadata_path, key) + return None content_path: Path | None = None content_name = record["content_path"] @@ -458,6 +463,11 @@ def _read_record(self, path: Path) -> dict[str, Any] | None: return None if content_path is not None and not isinstance(content_path, str): return None + if content_path is not None: + if content_path in {"", ".", ".."}: + return None + 
if Path(content_path).name != content_path or "\\" in content_path: + return None if metadata is not None and not isinstance(metadata, dict): return None @@ -483,6 +493,25 @@ def _cleanup_broken_metadata_path(self, metadata_path: Path) -> None: self._delete_base_files_locked(base) + def _cleanup_mismatched_metadata_path( + self, + metadata_path: Path, + key: str, + ) -> None: + """Best-effort cleanup for a metadata file stored under the wrong key.""" + if self.root is None: + return + + base = self._base_for_key(key) + with self._locked_base(base): + if not metadata_path.exists(): + return + record = self._read_record(metadata_path) + if record is not None and record["key"] == key: + return + + self._delete_base_files_locked(base) + @contextmanager def _locked_base(self, base: str) -> Iterator[None]: state = self._acquire_entry_lock_state(base) @@ -515,7 +544,9 @@ def _delete_base_files_locked(self, base: str) -> int: assert self.root is not None deleted = 0 - for path in self.root.glob(f"{base}.*"): + for path in self.root.iterdir(): + if not self._belongs_to_base(path.name, base): + continue if not path.is_file(): continue try: @@ -529,7 +560,9 @@ def _delete_extra_base_files_locked(self, base: str, *, keep: set[str]) -> None: """Delete stale files for one base, keeping the current pair.""" assert self.root is not None - for path in self.root.glob(f"{base}.*"): + for path in self.root.iterdir(): + if not self._belongs_to_base(path.name, base): + continue if not path.is_file(): continue if path.name in keep: @@ -539,25 +572,41 @@ def _delete_extra_base_files_locked(self, base: str, *, keep: set[str]) -> None: except FileNotFoundError: pass - def _write_content(self, content_path: Path, content: bytes | str | Path) -> None: - """Write content directly to its destination path.""" - if isinstance(content, bytes): - with content_path.open("wb") as handle: - handle.write(content) - return - - if isinstance(content, str): - with content_path.open("w", encoding="utf-8") 
as handle: - handle.write(content) - return + def _belongs_to_base(self, name: str, base: str) -> bool: + if name == f"{base}{self._METADATA_SUFFIX}": + return True + prefix = f"{base}." + if not name.startswith(prefix): + return False + return "--" not in name[len(prefix) :] + def _write_content(self, content_path: Path, content: bytes | str | Path) -> None: if isinstance(content, Path): if content == content_path: return - shutil.copyfile(content, content_path) - return - raise TypeError(f"Unsupported content type: {type(content)!r}") + temporary_path: Path | None = None + try: + with tempfile.NamedTemporaryFile( + "wb", + dir=content_path.parent, + prefix=f".{content_path.name}.", + delete=False, + ) as handle: + temporary_path = Path(handle.name) + if isinstance(content, bytes): + handle.write(content) + elif isinstance(content, str): + handle.write(content.encode("utf-8")) + elif isinstance(content, Path): + with content.open("rb") as source: + shutil.copyfileobj(source, handle) + else: + raise TypeError(f"Unsupported content type: {type(content)!r}") + os.replace(temporary_path, content_path) + finally: + if temporary_path is not None: + temporary_path.unlink(missing_ok=True) def _write_json(self, path: Path, obj: Mapping[str, Any]) -> None: """Write metadata JSON directly to its destination path.""" @@ -718,6 +767,7 @@ def origins( return cache_root = self.cache_dir.resolve() if self.cache_dir is not None else None count = 0 + yielded_origins: set[str] = set() for root in resolved_scope.roots: path = _to_directory_path(root) assert path.exists(), f"Root does not exist: {path}" @@ -726,6 +776,8 @@ def origins( if cache_root is not None and resolved_path.is_relative_to(cache_root): continue origin = resolved_path.as_uri() + if origin in yielded_origins: + continue if self._include_path( path, origin, @@ -734,6 +786,7 @@ def origins( include_types=resolved_scope.include_types, exclude_types=resolved_scope.exclude_types, ): + yielded_origins.add(origin) yield 
origin count += 1 if ( @@ -742,18 +795,18 @@ def origins( ): return continue - for file_path in sorted(path.rglob("*")): - if not file_path.is_file(): - continue + for file_path in _iter_directory_files( + path, + max_depth=resolved_scope.depth, + ): resolved_file_path = file_path.resolve() if cache_root is not None and resolved_file_path.is_relative_to( cache_root ): continue - relative_depth = len(file_path.relative_to(path).parts) - 1 - if relative_depth > resolved_scope.depth: - continue origin = resolved_file_path.as_uri() + if origin in yielded_origins: + continue if not self._include_path( file_path, origin, @@ -763,6 +816,7 @@ def origins( exclude_types=resolved_scope.exclude_types, ): continue + yielded_origins.add(origin) yield origin count += 1 if resolved_scope.limit is not None and count >= resolved_scope.limit: @@ -874,6 +928,8 @@ def _include_path( exclude_patterns=exclude_patterns, ): return False + if not include_types and not exclude_types: + return True label = _detect_type_label( path=path, content_type=mimetypes.guess_type(path.name)[0] ) @@ -894,7 +950,8 @@ def __init__( max_workers: int = 1, ) -> None: assert max_workers >= 1 - self.session = session or requests.Session() + self.session = requests.Session() if session is None else session + self._cache_context = None if session is None else f"session:{id(self.session)}" self.cache_dir = _resolve_cache_dir( cache_dir, backend_name="web", @@ -917,10 +974,10 @@ def origins( resolved_scope = _resolve_crawl_scope(scope) if resolved_scope.limit == 0: return - visited: set[tuple[str, str]] = set() + visited: set[tuple[str, WebOriginKey, str]] = set() yielded_origins: set[str] = set() yielded = 0 - frontier: list[tuple[str, str]] = [] + frontier: list[tuple[str, WebOriginKey, str]] = [] for root in resolved_scope.roots: canonical_root = _canonicalize_web_url(str(root)) @@ -928,26 +985,34 @@ def origins( parsed = urlparse(canonical_root) assert parsed.scheme in {"http", "https"} root_host = 
parsed.hostname or "" - frontier.append((canonical_root, root_host)) + frontier.append( + (canonical_root, _web_origin_key(canonical_root), root_host) + ) current_depth = 0 while frontier: - batch: list[tuple[str, str]] = [] - for origin, root_host in frontier: - visit_key = (origin, root_host) + batch: list[tuple[str, WebOriginKey, str]] = [] + for origin, scope_origin, root_host in frontier: + visit_key = (origin, scope_origin, root_host) if visit_key in visited: continue if not self._allow_origin( origin, + scope_origin, root_host, include_external_links=resolved_scope.include_external_links, include_subdomains=resolved_scope.include_subdomains, ): continue + if _matches_exclude_patterns( + origin, + exclude_patterns=resolved_scope.exclude_patterns, + ): + continue visited.add(visit_key) - batch.append((origin, root_host)) + batch.append((origin, scope_origin, root_host)) - next_frontier: list[tuple[str, str]] = [] + next_frontier: list[tuple[str, WebOriginKey, str]] = [] offset = 0 while offset < len(batch): remaining = ( @@ -972,7 +1037,7 @@ def origins( ), ), ) - for (origin, root_host), source in fetched_sources: + for (origin, scope_origin, root_host), source in fetched_sources: type_label = (source.metadata or {}).get("type_label") matches_patterns = _matches_patterns( origin, @@ -1002,12 +1067,19 @@ def origins( text = _read_text(source.body_path) resolved_origin = source.resolved_origin or origin - resolved_host = urlparse(resolved_origin).hostname or root_host - child_root_host = ( - root_host - if resolved_scope.include_subdomains - else resolved_host - ) + resolved_origin_key = _web_origin_key(resolved_origin) + origin_key = _web_origin_key(origin) + child_root_host = root_host + if ( + resolved_scope.include_subdomains + and resolved_origin_key == origin_key + ): + child_scope_origin = scope_origin + else: + child_scope_origin = resolved_origin_key + child_root_host = ( + urlparse(resolved_origin).hostname or root_host + ) for link in 
sorted(_extract_links(text)): canonical = _canonicalize_web_url(link, base=resolved_origin) if canonical is None: @@ -1015,7 +1087,9 @@ def origins( parsed = urlparse(canonical) if parsed.scheme not in {"http", "https"}: continue - next_frontier.append((canonical, child_root_host)) + next_frontier.append( + (canonical, child_scope_origin, child_root_host) + ) offset += chunk_size frontier = next_frontier current_depth += 1 @@ -1036,7 +1110,11 @@ def fetch_raw( cached_meta: dict[str, Any] | None = None if cached_entry is not None: body_path, cached_meta = cached_entry - has_cache = body_path is not None and cached_meta is not None + has_cache = ( + body_path is not None + and cached_meta is not None + and self._cache_context_matches(cached_meta) + ) now = _utcnow() if has_cache and not cache_force_refresh: @@ -1074,7 +1152,9 @@ def fetch_raw( response.raise_for_status() content_type = response.headers.get("Content-Type") - resolved_origin = _canonicalize_web_url(response.url) or response.url + resolved_origin = ( + _canonicalize_web_url(response.url, base=canonical_origin) or response.url + ) type_label = _detect_type_label( path=_type_hint_path(canonical_origin, content_type=content_type), content_type=content_type, @@ -1089,6 +1169,7 @@ def fetch_raw( "type_label": type_label, "fetched_at": now.isoformat(), "revalidated_at": None, + "cache_context": self._cache_context, } cached_entry = self._cache.upsert( canonical_origin, @@ -1103,6 +1184,22 @@ def fetch_raw( body_path, meta = cached_entry assert body_path is not None assert meta is not None + actual_type_label = _detect_type_label( + path=body_path, + content_type=content_type, + ) + if actual_type_label != meta.get("type_label"): + meta["type_label"] = actual_type_label + cached_entry = self._cache.upsert( + canonical_origin, + content=body_path, + metadata=meta, + content_ext=None, + ) + assert cached_entry is not None + body_path, meta = cached_entry + assert body_path is not None + assert meta is not None 
return self._source_from_meta(meta, body_path=body_path) def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource: @@ -1178,24 +1275,25 @@ def _is_fresh(self, cached_meta: dict[str, Any], now: datetime) -> bool: return False return now - freshest_cache_time <= self.cache_stale_after + def _cache_context_matches(self, cached_meta: dict[str, Any]) -> bool: + return cached_meta.get("cache_context") == self._cache_context + def _allow_origin( self, origin: str, + scope_origin: WebOriginKey, root_host: str, *, include_external_links: bool, include_subdomains: bool, ) -> bool: - host = urlparse(origin).hostname or "" - if not host: - return False - if host == root_host: - return True - if include_external_links: - return True - if not include_subdomains: - return False - return host.endswith(f".{root_host}") + return _allow_web_origin( + origin, + scope_origin, + root_host, + include_external_links=include_external_links, + include_subdomains=include_subdomains, + ) class CloudflareCrawler(BaseCrawler): @@ -1248,14 +1346,20 @@ def origins( del progress resolved_scope = _resolve_crawl_scope(scope) yielded = 0 + yielded_origins: set[str] = set() + crawled_roots: set[str] = set() for root in resolved_scope.roots: if resolved_scope.limit is not None and yielded >= resolved_scope.limit: return canonical_root = _canonicalize_web_url(str(root)) assert canonical_root is not None + if canonical_root in crawled_roots: + continue + crawled_roots.add(canonical_root) remaining = ( None if resolved_scope.limit is None else resolved_scope.limit - yielded ) + root_limit = remaining if not yielded_origins else None records = self._crawl_root( canonical_root, cache_force_refresh=cache_force_refresh, @@ -1264,10 +1368,12 @@ def origins( exclude_patterns=resolved_scope.exclude_patterns, include_external_links=resolved_scope.include_external_links, include_subdomains=resolved_scope.include_subdomains, - limit=remaining, + limit=root_limit, ) for record in records: origin = 
record["url"] + if origin in yielded_origins: + continue label = _detect_type_label( path=None, content_type="text/markdown", @@ -1278,6 +1384,7 @@ def origins( exclude_types=resolved_scope.exclude_types, ): continue + yielded_origins.add(origin) yield origin yielded += 1 if resolved_scope.limit is not None and yielded >= resolved_scope.limit: @@ -1316,9 +1423,13 @@ def fetch_raw( (item for item in records if item["url"] == canonical_origin), None, ) + if record is None and len(records) == 1: + record = records[0] if record is None: raise ValueError(f"Cloudflare crawl did not return record for {origin}") - record_entry = self._records[canonical_origin] + record_entry = self._records.get(record["url"]) + assert record_entry is not None + self._records[canonical_origin] = record_entry assert record_entry is not None return self._source_from_record_entry(canonical_origin, record_entry) @@ -1398,6 +1509,11 @@ def _crawl_root( and self._cloudflare_cache_is_fresh(cached_entry.fetched_at) ): return cached_entry.records + if not cache_force_refresh and apply_patterns: + cached_entry = self._load_root_cache_entry(cache_key) + if cached_entry is not None: + self._roots[cache_key] = cached_entry + return cached_entry.records endpoint = f"{self.base_url}/accounts/{self.account_id}/browser-rendering/crawl" payload = self._crawl_payload( @@ -1474,9 +1590,27 @@ def _crawl_root( records.extend(page_result.get("records") or []) cursor = page_result.get("cursor") - completed_records = [ - record for record in records if record.get("status") == "completed" - ] + scope_origin = _web_origin_key(root) + root_host = urlparse(root).hostname or "" + completed_records = [] + for record in records: + if record.get("status") != "completed": + continue + canonical_url = _canonicalize_web_url(record["url"]) + if canonical_url is None: + continue + if apply_patterns and not _allow_web_origin( + canonical_url, + scope_origin, + root_host, + include_external_links=include_external_links, + 
include_subdomains=include_subdomains, + ): + continue + if canonical_url != record["url"]: + record = dict(record) + record["url"] = canonical_url + completed_records.append(record) if apply_patterns: completed_records = [ record @@ -1492,6 +1626,12 @@ def _crawl_root( fetched_at=fetched_at, records=completed_records, ) + if apply_patterns: + self._store_root_cache_entry( + cache_key, + records=completed_records, + fetched_at=fetched_at, + ) for record in completed_records: self._records[record["url"]] = _CloudflareRecordCacheEntry( fetched_at=fetched_at, @@ -1551,6 +1691,58 @@ def _record_cache_signature(self) -> dict[str, Any]: "modified_since": self.modified_since, } + def _root_cache_key(self, cache_key: tuple[Any, ...]) -> str: + payload = { + "cache_key": cache_key, + "signature": self._record_cache_signature(), + } + encoded = json.dumps(payload, sort_keys=True, separators=(",", ":")) + return f"cloudflare-root:{encoded}" + + def _load_root_cache_entry( + self, + cache_key: tuple[Any, ...], + ) -> _CloudflareRootCacheEntry | None: + cached_entry = self._cache.fetch(self._root_cache_key(cache_key)) + if cached_entry is None: + return None + _, cached_meta = cached_entry + if cached_meta is None: + return None + if cached_meta.get("signature") != self._record_cache_signature(): + return None + fetched_at = _parse_datetime(cached_meta.get("fetched_at")) + if fetched_at is None or not self._cloudflare_cache_is_fresh(fetched_at): + return None + records = cached_meta["records"] + for record in records: + self._records[record["url"]] = _CloudflareRecordCacheEntry( + fetched_at=fetched_at, + record=record, + ) + return _CloudflareRootCacheEntry( + fetched_at=fetched_at, + records=records, + ) + + def _store_root_cache_entry( + self, + cache_key: tuple[Any, ...], + *, + records: list[dict[str, Any]], + fetched_at: datetime, + ) -> None: + self._cache.upsert( + self._root_cache_key(cache_key), + content=None, + metadata={ + "fetched_at": fetched_at.isoformat(), + 
"records": records, + "signature": self._record_cache_signature(), + }, + content_ext=None, + ) + def _load_record_cache_entry( self, origin: str, @@ -1567,7 +1759,6 @@ def _load_record_cache_entry( if fetched_at is None or not self._cloudflare_cache_is_fresh(fetched_at): return None record = cached_meta["record"] - assert record["url"] == origin return _CloudflareRecordCacheEntry( fetched_at=fetched_at, record=record, @@ -1609,8 +1800,8 @@ def _coerce_roots(roots: RootsInput) -> list[RootInput]: def _resolve_crawl_scope(scope: CrawlScope) -> _ResolvedCrawlScope: return _ResolvedCrawlScope( roots=_coerce_roots(scope.roots), - include_patterns=list(scope.include_patterns or []), - exclude_patterns=list(scope.exclude_patterns or []), + include_patterns=_coerce_string_sequence(scope.include_patterns), + exclude_patterns=_coerce_string_sequence(scope.exclude_patterns), depth=_DEFAULT_CRAWL_DEPTH if scope.depth is None else scope.depth, limit=scope.limit, include_types=_normalize_types(scope.include_types), @@ -1620,17 +1811,92 @@ def _resolve_crawl_scope(scope: CrawlScope) -> _ResolvedCrawlScope: ) +def _coerce_string_sequence(values: Sequence[str] | str | None) -> list[str]: + if values is None: + return [] + if isinstance(values, str): + return [values] + return list(values) + + def _canonicalize_web_url(target: str, *, base: str | None = None) -> str | None: url = urljoin(base, target) if base else target if not url: return None url, _ = urldefrag(url) parsed = urlparse(url) + scheme = parsed.scheme.lower() + if scheme != parsed.scheme: + parsed = parsed._replace(scheme=scheme) + url = urlunparse(parsed) if parsed.scheme not in {"http", "https"}: return None if not parsed.netloc: return None - return url + try: + parsed.port + except ValueError: + return None + netloc = _canonical_netloc(parsed) + if netloc != parsed.netloc: + parsed = parsed._replace(netloc=netloc) + if parsed.path == "/" and not parsed.params: + parsed = parsed._replace(path="") + return 
urlunparse(parsed) + + +def _canonical_netloc(parsed: Any) -> str: + userinfo = "" + if "@" in parsed.netloc: + userinfo = f"{parsed.netloc.rsplit('@', 1)[0]}@" + host = parsed.hostname or "" + if ":" in host and not host.startswith("["): + host = f"[{host}]" + port = parsed.port + if port is None: + return f"{userinfo}{host}" + if parsed.scheme == "http" and port == 80: + return f"{userinfo}{host}" + if parsed.scheme == "https" and port == 443: + return f"{userinfo}{host}" + return f"{userinfo}{host}:{port}" + + +def _web_origin_key(origin: str) -> WebOriginKey: + parsed = urlparse(origin) + scheme = parsed.scheme.lower() + port = parsed.port + if port is None and scheme == "http": + port = 80 + elif port is None and scheme == "https": + port = 443 + return scheme, parsed.hostname or "", port + + +def _allow_web_origin( + origin: str, + scope_origin: WebOriginKey, + root_host: str, + *, + include_external_links: bool, + include_subdomains: bool, +) -> bool: + parsed = urlparse(origin) + host = parsed.hostname or "" + if not host: + return False + origin_key = _web_origin_key(origin) + if origin_key == scope_origin: + return True + if include_external_links: + return True + if not include_subdomains: + return False + return ( + origin_key[0] == scope_origin[0] + and origin_key[2] == scope_origin[2] + and host.endswith(f".{root_host}") + ) def _resolve_cache_dir( @@ -1645,17 +1911,56 @@ def _resolve_cache_dir( if cache_dir is True: return Path.cwd() / ".raghilda" / "cache" / backend_name raise TypeError("cache_dir must be None, True, or a filesystem path") - return Path(cache_dir) + return Path(cache_dir).resolve() def _to_directory_path(root: str | Path) -> Path: if isinstance(root, Path): return root - parsed = urlparse(str(root)) + value = str(root) + if re.match(r"^[A-Za-z]:(?:[\\/]|$)", value): + return Path(value) + parsed = urlparse(value) if parsed.scheme == "file": - return _path_from_file_uri(str(root)) + return _path_from_file_uri(value) assert 
parsed.scheme in {"", "file"} - return Path(str(root)) + return Path(value) + + +def _iter_directory_files(root: Path, *, max_depth: int) -> Iterator[Path]: + yield from _iter_directory_files_from( + root, + root=root, + resolved_root=root.resolve(), + max_depth=max_depth, + ) + + +def _iter_directory_files_from( + directory: Path, + *, + root: Path, + resolved_root: Path, + max_depth: int, +) -> Iterator[Path]: + for child in sorted(directory.iterdir()): + if not child.resolve().is_relative_to(resolved_root): + continue + if child.is_file(): + yield child + continue + if child.is_symlink(): + continue + if not child.is_dir(): + continue + child_depth = len(child.relative_to(root).parts) - 1 + if child_depth < max_depth: + yield from _iter_directory_files_from( + child, + root=root, + resolved_root=resolved_root, + max_depth=max_depth, + ) def _path_from_file_uri(origin: str) -> Path: @@ -1675,7 +1980,11 @@ def _path_from_file_origin(origin: str) -> Path: def _normalize_types(types: Sequence[str] | None) -> set[str]: - return {item.strip().lower() for item in types or []} + if types is None: + return set() + if isinstance(types, str): + types = [types] + return {item.strip().lower() for item in types} def _matches_patterns( @@ -1684,14 +1993,21 @@ def _matches_patterns( include_patterns: Sequence[str], exclude_patterns: Sequence[str], ) -> bool: - for pattern in exclude_patterns: - if re.search(pattern, origin): - return False + if _matches_exclude_patterns(origin, exclude_patterns=exclude_patterns): + return False if not include_patterns: return True return any(re.search(pattern, origin) for pattern in include_patterns) +def _matches_exclude_patterns( + origin: str, + *, + exclude_patterns: Sequence[str], +) -> bool: + return any(re.search(pattern, origin) for pattern in exclude_patterns) + + def _matches_cloudflare_patterns( origin: str, *, @@ -1775,6 +2091,12 @@ def _known_body_suffix(origin: str, *, content_type: str | None) -> str | None: return ".html" if 
normalized == "text/markdown": return ".md" + if normalized == "text/plain": + return ".txt" + if normalized in {"application/xml", "text/xml"}: + return ".xml" + if normalized == "text/x-python": + return ".py" if normalized == "application/json": return ".json" if normalized == "application/pdf": diff --git a/src/raghilda/scrape.py b/src/raghilda/scrape.py index 15a81cc..bf07c37 100644 --- a/src/raghilda/scrape.py +++ b/src/raghilda/scrape.py @@ -38,7 +38,7 @@ def _extract_links(txt: str) -> set[str]: root = ET.fromstring(txt) for loc in root.findall(".//{*}url/{*}loc"): if loc is not None and loc.text: - links.update(loc.text.strip()) + links.add(loc.text.strip()) except Exception: pass diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 7d1f007..2cb149d 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -325,6 +325,7 @@ def test_web_crawler_discovers_origins_and_revalidates_cache(tmp_path: Path) -> } ) as server: root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") crawler = WebCrawler( cache_dir=tmp_path / "cache", cache_stale_after=timedelta(seconds=0), @@ -332,13 +333,13 @@ def test_web_crawler_discovers_origins_and_revalidates_cache(tmp_path: Path) -> scope = CrawlScope( roots=[root_url], depth=1, - include_patterns=[rf"^{re.escape(root_url)}.*"], + include_patterns=[rf"^{re.escape(root_origin)}(?:/.*)?$"], exclude_patterns=[r".*/skip$"], ) origins = list(crawler.origins(scope, progress=False)) - assert root_url in origins + assert root_origin in origins assert f"{root_url}guide" in origins assert all(not origin.endswith("/skip") for origin in origins) assert all("external.test" not in origin for origin in origins) @@ -414,6 +415,33 @@ def test_web_crawler_follows_links_after_redirect_to_different_host( assert "https://www.example.com/about" in origins +def test_web_crawler_include_subdomains_uses_redirect_scope( + tmp_path: Path, +) -> None: + root = "http://example.com" + page = 
"https://example.com/page" + session: Any = _FakeWebSession( + { + root: { + "body": 'Page', + "resolved_url": "https://example.com/landing", + }, + page: { + "body": "
Page
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "redirect-subdomain-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1, include_subdomains=True) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root, page] + + def test_web_crawler_include_subdomains_stays_within_requested_host_tree( tmp_path: Path, ) -> None: @@ -489,6 +517,183 @@ def test_web_crawler_include_subdomains_keeps_original_scope_host( assert origins == [root, api, cdn] +def test_web_crawler_excludes_same_host_different_port_by_default( + tmp_path: Path, +) -> None: + root = "http://127.0.0.1:8000" + other_port = "http://127.0.0.1:9000/page" + session: Any = _FakeWebSession( + { + root: { + "body": f'Other', + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "same-host-port-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_include_subdomains_excludes_same_host_different_port( + tmp_path: Path, +) -> None: + root = "http://127.0.0.1:8000" + other_port = "http://127.0.0.1:9000/page" + session: Any = _FakeWebSession( + { + root: { + "body": f'Other', + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "subdomain-same-host-port-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1, include_subdomains=True) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_treats_explicit_default_port_as_same_origin( + tmp_path: Path, +) -> None: + root = "http://example.com" + explicit_root = "http://example.com:80" + child = "http://example.com/about" + session: Any = _FakeWebSession( + { + root: { + "body": f'About', + }, + child: { + "body": "
About
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "default-port-cache", + session=session, + ) + scope = CrawlScope(roots=[explicit_root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root, child] + + +def test_web_crawler_deduplicates_explicit_default_port_variants( + tmp_path: Path, +) -> None: + root = "https://example.com" + session: Any = _FakeWebSession( + { + root: { + "body": "
Root
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "default-port-variant-cache", + session=session, + ) + scope = CrawlScope(roots=[root, "https://example.com:443"], depth=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_normalizes_uppercase_url_schemes(tmp_path: Path) -> None: + origin = "http://example.com" + page = "https://example.com/page" + session: Any = _FakeWebSession( + { + origin: { + "body": ( + '' + "Page" + ), + }, + page: { + "body": "
Page
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "uppercase-scheme-cache", + session=session, + ) + scope = CrawlScope( + roots=["HTTP://example.com"], depth=1, include_external_links=True + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [origin, page] + + +def test_web_crawler_preserves_url_credentials(tmp_path: Path) -> None: + origin = "https://user:pass@example.com/private" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Private
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "credential-url-cache", + session=session, + ) + + source = crawler.fetch_raw(origin) + + assert source.origin == origin + assert session.requests == [(origin, {})] + + +def test_web_crawler_discovers_urls_from_xml_sitemap(tmp_path: Path) -> None: + sitemap = "https://example.com/sitemap.xml" + page = "https://example.com/docs/page" + session: Any = _FakeWebSession( + { + sitemap: { + "body": ( + '' + '' + f"{page}" + "" + ), + "content_type": "application/xml", + }, + page: { + "body": "
Page
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "sitemap-cache", + session=session, + ) + scope = CrawlScope(roots=[sitemap], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [sitemap, page] + + def test_web_crawler_allows_later_in_scope_occurrence_of_same_url( tmp_path: Path, ) -> None: @@ -590,6 +795,150 @@ def test_web_crawler_discovers_matching_descendants_from_filtered_seed( assert f"{root_url}docs/guide" in origins +def test_web_crawler_does_not_fetch_excluded_origins(tmp_path: Path) -> None: + root = "https://example.com" + admin = "https://example.com/admin" + session: Any = _FakeWebSession( + { + root: { + "body": f'Admin', + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "excluded-origin-cache", + session=session, + ) + scope = CrawlScope( + roots=[root], + depth=1, + exclude_patterns=[r"/admin$"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_deduplicates_root_url_with_and_without_slash( + tmp_path: Path, +) -> None: + root = "https://example.com" + session: Any = _FakeWebSession( + { + root: { + "body": 'Root', + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "root-slash-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_deduplicates_root_scope_variants(tmp_path: Path) -> None: + root = "https://example.com" + session: Any = _FakeWebSession( + { + root: { + "body": "
Root
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "root-variant-cache", + session=session, + ) + scope = CrawlScope(roots=[root, f"{root}/"], depth=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_deduplicates_queried_root_scope_variants( + tmp_path: Path, +) -> None: + root = "https://example.com?x=1" + session: Any = _FakeWebSession( + { + root: { + "body": "
Root
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "queried-root-variant-cache", + session=session, + ) + scope = CrawlScope(roots=[root, "https://example.com/?x=1"], depth=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_normalizes_root_links_from_non_root_pages( + tmp_path: Path, +) -> None: + root = "https://example.com" + page = "https://example.com/docs" + session: Any = _FakeWebSession( + { + page: { + "body": 'Root', + }, + root: { + "body": "
Root
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "non-root-root-link-cache", + session=session, + ) + scope = CrawlScope(roots=[page], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [page, root] + assert session.requests == [(page, {}), (root, {})] + + +def test_web_crawler_skips_links_with_malformed_ports(tmp_path: Path) -> None: + root = "https://example.com" + session: Any = _FakeWebSession( + { + root: { + "body": ( + '' + "Bad" + ), + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "bad-port-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + def test_web_crawler_accepts_crawl_scope_for_roots_and_patterns( tmp_path: Path, ) -> None: @@ -637,6 +986,7 @@ def test_web_markdown_documents_reuses_refreshed_sources( } ) as server: root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") crawler = WebCrawler( cache_dir=tmp_path / "markdown-docs-cache", ) @@ -647,7 +997,7 @@ def test_web_markdown_documents_reuses_refreshed_sources( request for request in getattr(server, "requests") if request["path"] == "/" ] - assert documents == [MarkdownDocument(origin=root_url, content="Root")] + assert documents == [MarkdownDocument(origin=root_origin, content="Root")] assert len(root_requests) == 1 @@ -664,6 +1014,7 @@ def test_web_markdown_documents_reuses_immediately_stale_discovery_cache( } ) as server: root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") crawler = WebCrawler( cache_dir=tmp_path / "stale-markdown-docs-cache", cache_stale_after=timedelta(seconds=0), @@ -675,7 +1026,7 @@ def test_web_markdown_documents_reuses_immediately_stale_discovery_cache( request for request in getattr(server, "requests") if request["path"] == "/" ] - assert documents == 
[MarkdownDocument(origin=root_url, content="Root")] + assert documents == [MarkdownDocument(origin=root_origin, content="Root")] assert len(root_requests) == 1 @@ -787,32 +1138,128 @@ def test_web_crawler_cache_dir_uses_hashed_file_pair( } ) as server: root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") cache_dir = tmp_path / "cache" crawler = WebCrawler(cache_dir=cache_dir) document = crawler.fetch_markdown(root_url) - base = _expected_cache_base(root_url) + base = _expected_cache_base(root_origin) metadata_path = cache_dir / f"{base}.metadata.json" content_path = cache_dir / f"{base}.html" - assert document == MarkdownDocument(origin=root_url, content="Root") + assert document == MarkdownDocument(origin=root_origin, content="Root") assert sorted(path.name for path in cache_dir.iterdir()) == [ content_path.name, metadata_path.name, ] record = json.loads(metadata_path.read_text(encoding="utf-8")) - assert record["key"] == root_url + assert record["key"] == root_origin assert record["content_path"] == content_path.name assert record["metadata"]["content_type"] == "text/html; charset=utf-8" - assert record["metadata"]["origin"] == root_url + assert record["metadata"]["origin"] == root_origin -def test_web_crawler_cache_dir_true_uses_default_backend_directory( +def test_web_crawler_rejects_cache_metadata_content_path_outside_cache( tmp_path: Path, - monkeypatch, ) -> None: - monkeypatch.chdir(tmp_path) - with _serve( + origin = "https://example.com/poison" + cache_dir = tmp_path / "cache" + cache_dir.mkdir() + outside = tmp_path / "outside.html" + outside.write_text("Poison", encoding="utf-8") + base = _expected_cache_base(origin) + metadata_path = cache_dir / f"{base}.metadata.json" + metadata_path.write_text( + json.dumps( + { + "key": origin, + "content_path": "../outside.html", + "metadata": { + "origin": origin, + "resolved_origin": origin, + "content_type": "text/html", + "status_code": 200, + "etag": None, + 
"last_modified": None, + "type_label": "html", + "fetched_at": "2026-01-01T00:00:00+00:00", + "revalidated_at": None, + }, + } + ), + encoding="utf-8", + ) + session: Any = _FakeWebSession( + { + origin: { + "body": "
Fresh
", + } + } + ) + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + assert source.body_path.parent == cache_dir + assert source.body_path != outside + assert session.requests == [(origin, {})] + + +def test_web_crawler_rejects_cache_metadata_with_mismatched_key( + tmp_path: Path, +) -> None: + origin = "https://example.com/requested" + stale_origin = "https://example.com/stale" + cache_dir = tmp_path / "cache" + cache_dir.mkdir() + base = _expected_cache_base(origin) + content_path = cache_dir / f"{base}.html" + content_path.write_text("Stale", encoding="utf-8") + metadata_path = cache_dir / f"{base}.metadata.json" + metadata_path.write_text( + json.dumps( + { + "key": stale_origin, + "content_path": content_path.name, + "metadata": { + "origin": stale_origin, + "resolved_origin": stale_origin, + "content_type": "text/html", + "status_code": 200, + "etag": None, + "last_modified": None, + "type_label": "html", + "fetched_at": "2026-01-01T00:00:00+00:00", + "revalidated_at": None, + }, + } + ), + encoding="utf-8", + ) + session: Any = _FakeWebSession( + { + origin: { + "body": "
Fresh
", + } + } + ) + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + assert source.origin == origin + assert source.body_path.read_text(encoding="utf-8") != ( + "Stale" + ) + assert session.requests == [(origin, {})] + + +def test_web_crawler_cache_dir_true_uses_default_backend_directory( + tmp_path: Path, + monkeypatch, +) -> None: + monkeypatch.chdir(tmp_path) + with _serve( { "/": { "body": "
Root
", @@ -822,18 +1269,73 @@ def test_web_crawler_cache_dir_true_uses_default_backend_directory( } ) as server: root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") crawler = WebCrawler(cache_dir=True) crawler.fetch_markdown(root_url) cache_dir = tmp_path / ".raghilda" / "cache" / "web" - base = _expected_cache_base(root_url) + base = _expected_cache_base(root_origin) assert sorted(path.name for path in cache_dir.iterdir()) == [ f"{base}.html", f"{base}.metadata.json", ] +def test_web_crawler_relative_cache_dir_is_anchored_at_construction( + tmp_path: Path, + monkeypatch, +) -> None: + monkeypatch.chdir(tmp_path) + origin = "https://example.com/page" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Page
", + } + } + ) + crawler = WebCrawler(cache_dir="cache", session=session) + other_cwd = tmp_path / "other" + other_cwd.mkdir() + monkeypatch.chdir(other_cwd) + + source = crawler.fetch_raw(origin) + + assert source.body_path.parent == tmp_path / "cache" + + +def test_web_crawler_scopes_fresh_cache_hits_to_custom_session( + tmp_path: Path, +) -> None: + origin = "https://example.com/private" + cache_dir = tmp_path / "session-cache" + first_session: Any = _FakeWebSession( + { + origin: { + "body": "
First
", + }, + } + ) + second_session: Any = _FakeWebSession( + { + origin: { + "body": "
Second
", + }, + } + ) + first_crawler = WebCrawler(cache_dir=cache_dir, session=first_session) + second_crawler = WebCrawler(cache_dir=cache_dir, session=second_session) + + first = first_crawler.fetch_raw(origin) + first_body = first.body_path.read_text(encoding="utf-8") + second = second_crawler.fetch_raw(origin) + + assert "First" in first_body + assert "Second" in second.body_path.read_text(encoding="utf-8") + assert second_session.requests == [(origin, {})] + + def test_web_crawler_disambiguates_colliding_sanitized_cache_prefixes( tmp_path: Path, ) -> None: @@ -880,7 +1382,65 @@ def test_web_crawler_disambiguates_colliding_sanitized_cache_prefixes( assert second_crawler.fetch_raw(first_origin).body_path.exists() assert second_crawler.fetch_raw(second_origin).body_path.exists() assert second_crawler.fetch_raw(third_origin).body_path.exists() - assert second_session.requests == [] + assert second_session.requests == [ + (first_origin, {}), + (second_origin, {}), + (third_origin, {}), + ] + + +def test_web_crawler_refresh_deletes_only_exact_cache_base(tmp_path: Path) -> None: + first_origin = "https://example.com" + first_base = _expected_cache_base(first_origin) + second_origin = f"https://example.com--{first_base.rsplit('--', 1)[1]}.child" + session: Any = _FakeWebSession( + { + first_origin: {"body": "
One
"}, + second_origin: {"body": "
Two
"}, + } + ) + crawler = WebCrawler(cache_dir=tmp_path / "exact-delete-cache", session=session) + + crawler.fetch_raw(first_origin) + crawler.fetch_raw(second_origin) + crawler.fetch_raw(first_origin, cache_force_refresh=True) + session.requests.clear() + crawler.fetch_raw(second_origin) + + assert session.requests == [] + + +def test_web_crawler_refresh_replaces_cached_body_atomically( + tmp_path: Path, + monkeypatch, +) -> None: + origin = "https://example.com" + session: Any = _FakeWebSession( + { + origin: { + "body": "
First
", + }, + } + ) + crawler = WebCrawler(cache_dir=tmp_path / "atomic-cache", session=session) + first = crawler.fetch_raw(origin) + session.routes[origin]["body"] = "
Second
" + replacements: list[tuple[Path, Path]] = [] + replace = crawl_module.os.replace + + def track_replace(src: str | Path, dst: str | Path) -> None: + replacements.append((Path(src), Path(dst))) + replace(src, dst) + + monkeypatch.setattr(crawl_module.os, "replace", track_replace) + + second = crawler.fetch_raw(origin, cache_force_refresh=True) + + assert first.body_path == second.body_path + assert second.body_path.read_text(encoding="utf-8") == ( + "
Second
" + ) + assert replacements[-1][1] == second.body_path def test_web_crawler_cache_writes_for_different_keys_do_not_contend( @@ -966,6 +1526,50 @@ def test_web_crawler_uses_magika_when_no_explicit_ext_is_available( assert source.body_path == cache_dir / f"{base}.html" +def test_web_crawler_type_filters_use_sniffed_cache_extension( + tmp_path: Path, + monkeypatch, +) -> None: + class _FakeMagikaOutput: + label = "html" + extensions = ["html"] + + class _FakeMagikaResult: + output = _FakeMagikaOutput() + + class _FakeMagika: + def identify_bytes(self, content: bytes) -> _FakeMagikaResult: + assert content.startswith(b"") + return _FakeMagikaResult() + + def identify_path(self, path: Path) -> _FakeMagikaResult: + assert path.suffix == ".html" + return _FakeMagikaResult() + + origin = "https://example.com/download" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Download
", + "content_type": "application/octet-stream", + } + } + ) + monkeypatch.setattr(crawl_module, "_MAGIKA", _FakeMagika()) + crawler = WebCrawler(cache_dir=tmp_path / "sniffed-type-cache", session=session) + scope = CrawlScope(roots=[origin], depth=0, include_types=["html"]) + + origins = list(crawler.origins(scope, progress=False)) + source = crawler.fetch_raw(origin) + + assert origins == [origin] + assert source.metadata == { + "etag": None, + "last_modified": None, + "type_label": "html", + } + + def test_web_crawler_prefers_content_type_over_misleading_url_suffix( tmp_path: Path, ) -> None: @@ -989,6 +1593,30 @@ def test_web_crawler_prefers_content_type_over_misleading_url_suffix( assert document == MarkdownDocument(origin=origin, content="Rendered Readme") +def test_web_crawler_prefers_text_content_type_over_url_suffix( + tmp_path: Path, +) -> None: + origin = "https://example.com/plain.html" + session: Any = _FakeWebSession( + { + origin: { + "body": "plain text", + "content_type": "text/plain; charset=utf-8", + } + } + ) + cache_dir = tmp_path / "text-content-type-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + scope = CrawlScope(roots=[origin], depth=0, include_types=["text"]) + + origins = list(crawler.origins(scope, progress=False)) + source = crawler.fetch_raw(origin) + + assert origins == [origin] + assert source.body_path == cache_dir / f"{_expected_cache_base(origin)}.txt" + assert (source.metadata or {})["type_label"] == "text" + + def test_web_crawler_preserves_reserved_escapes_in_requested_origin( tmp_path: Path, ) -> None: @@ -1180,13 +1808,241 @@ def get( "result": { "id": job_id, "status": "completed", - "records": records, + "records": records, + }, + } + ) + + +class _DiscoveryFilteringCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, 
headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + + payload = self._jobs[url.rsplit("/", 1)[-1]] + include_patterns = payload["options"].get("includePatterns", []) + exclude_patterns = payload["options"].get("excludePatterns", []) + records = response.json()["result"]["records"] + filtered_records = [ + record + for record in records + if ( + ( + not include_patterns + or any( + fnmatch.fnmatchcase(record["url"], pattern) + for pattern in include_patterns + ) + ) + and not any( + fnmatch.fnmatchcase(record["url"], pattern) + for pattern in exclude_patterns + ) + ) + ] + return _CloudflareResponse( + { + "success": True, + "result": { + "id": url.rsplit("/", 1)[-1], + "status": "completed", + "records": filtered_records, + }, + } + ) + + +class _OverlappingLimitedCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + del headers, timeout + self.get_calls.append((url, params)) + job_id = url.rsplit("/", 1)[-1] + payload = self._jobs[job_id] + if params == {"limit": 1}: + return _CloudflareResponse( + {"success": True, "result": {"id": job_id, "status": "completed"}} + ) + records = [ + { + "url": "https://example.com/shared", + "status": "completed", + "markdown": "# Shared\n", + "metadata": { + "status": 200, + "title": "Shared", + "url": "https://example.com/shared", + }, + } + ] + if payload["url"] == "https://example.com/root-b": + records.append( + { + "url": "https://example.com/root-b/unique", + "status": "completed", + "markdown": "# Unique\n", + "metadata": { + "status": 200, + "title": "Unique", + "url": "https://example.com/root-b/unique", + }, + } + ) + if "limit" in payload: + records = records[: payload["limit"]] + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": records, + }, + } + ) + + 
+class _TrailingSlashCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + payload = self._jobs[url.rsplit("/", 1)[-1]] + records = response.json()["result"]["records"] + records[0]["url"] = f"{payload['url'].rstrip('/')}/" + records[0]["metadata"]["url"] = records[0]["url"] + return response + + +class _OutOfScopeCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + job_id = url.rsplit("/", 1)[-1] + payload = self._jobs[job_id] + root = payload["url"] + records = [ + { + "url": root, + "status": "completed", + "markdown": "# Root\n", + "metadata": { + "status": 200, + "title": "Root", + "url": root, + }, + }, + { + "url": "https://example.com/page", + "status": "completed", + "markdown": "# Page\n", + "metadata": { + "status": 200, + "title": "Page", + "url": "https://example.com/page", + }, + }, + { + "url": "https://docs.example.com/page", + "status": "completed", + "markdown": "# Subdomain\n", + "metadata": { + "status": 200, + "title": "Subdomain", + "url": "https://docs.example.com/page", + }, + }, + { + "url": "https://external.test/page", + "status": "completed", + "markdown": "# External\n", + "metadata": { + "status": 200, + "title": "External", + "url": "https://external.test/page", + }, + }, + ] + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": records, + }, + } + ) + + +class _RedirectCloudflareSession(_ParameterizedCloudflareSession): + def 
get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + job_id = url.rsplit("/", 1)[-1] + root = self._jobs[job_id]["url"] + final_url = f"{root.rstrip('/')}/landing" + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": [ + { + "url": final_url, + "status": "completed", + "markdown": "# Landing\n", + "metadata": { + "status": 200, + "title": "Landing", + "url": final_url, + }, + } + ], }, } ) -class _DiscoveryFilteringCloudflareSession(_ParameterizedCloudflareSession): +class _CrossOriginRedirectCloudflareSession(_ParameterizedCloudflareSession): def get( self, url: str, @@ -1198,35 +2054,26 @@ def get( response = super().get(url, headers=headers, params=params, timeout=timeout) if params == {"limit": 1}: return response - - payload = self._jobs[url.rsplit("/", 1)[-1]] - include_patterns = payload["options"].get("includePatterns", []) - exclude_patterns = payload["options"].get("excludePatterns", []) - records = response.json()["result"]["records"] - filtered_records = [ - record - for record in records - if ( - ( - not include_patterns - or any( - fnmatch.fnmatchcase(record["url"], pattern) - for pattern in include_patterns - ) - ) - and not any( - fnmatch.fnmatchcase(record["url"], pattern) - for pattern in exclude_patterns - ) - ) - ] + job_id = url.rsplit("/", 1)[-1] + final_url = "https://example.com/landing" return _CloudflareResponse( { "success": True, "result": { - "id": url.rsplit("/", 1)[-1], + "id": job_id, "status": "completed", - "records": filtered_records, + "records": [ + { + "url": final_url, + "status": "completed", + "markdown": "# Landing\n", + "metadata": { + "status": 200, + "title": "Landing", + "url": final_url, + }, + } + ], }, } ) @@ -1300,12 +2147,12 @@ def 
test_cloudflare_markdown_documents_reuses_immediately_stale_discovery_cache( cache_stale_after=timedelta(seconds=0), poll_interval=0, ) - scope = CrawlScope(roots=["https://example.com/docs"], depth=0) + scope = CrawlScope(roots=["https://example.com"], depth=0) documents = list(crawler.markdown_documents(scope, progress=False)) assert documents == [ - MarkdownDocument(origin="https://example.com/docs", content="# Docs\n") + MarkdownDocument(origin="https://example.com", content="# Docs\n") ] assert len(session.post_calls) == 1 @@ -1339,6 +2186,32 @@ def test_cloudflare_crawler_accepts_crawl_scope_for_roots_and_patterns( ] +def test_cloudflare_crawler_filters_returned_records_to_web_scope( + tmp_path: Path, +) -> None: + session = _OutOfScopeCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-returned-scope-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/root"], + depth=1, + include_external_links=False, + include_subdomains=False, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/root", + "https://example.com/page", + ] + + def test_cloudflare_crawler_cache_key_includes_crawl_parameters( tmp_path: Path, ) -> None: @@ -1428,6 +2301,59 @@ def test_cloudflare_fetch_raw_ignores_discovery_patterns_for_explicit_origin( assert "includePatterns" not in session.post_calls[0][1]["options"] +def test_cloudflare_fetch_raw_accepts_redirected_record( + tmp_path: Path, +) -> None: + cache = tmp_path / "cloudflare-redirect-cache" + session = _RedirectCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=session, + poll_interval=0, + ) + + source = crawler.fetch_raw("https://example.com") + + assert source.origin == "https://example.com" + assert source.resolved_origin == "https://example.com/landing" + 
assert source.body_path.read_text(encoding="utf-8") == "# Landing\n" + assert len(session.post_calls) == 1 + + cached_session = _RedirectCloudflareSession() + cached_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=cached_session, + poll_interval=0, + ) + cached_source = cached_crawler.fetch_raw("https://example.com") + + assert cached_source.resolved_origin == "https://example.com/landing" + assert cached_session.post_calls == [] + + +def test_cloudflare_fetch_raw_accepts_cross_origin_redirected_record( + tmp_path: Path, +) -> None: + session = _CrossOriginRedirectCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cross-origin-redirect-cache", + session=session, + poll_interval=0, + ) + + source = crawler.fetch_raw("http://example.com") + + assert source.origin == "http://example.com" + assert source.resolved_origin == "https://example.com/landing" + assert source.body_path.read_text(encoding="utf-8") == "# Landing\n" + + def test_cloudflare_fetch_raw_reuses_cache_directory_across_instances( tmp_path: Path, ) -> None: @@ -1461,6 +2387,64 @@ def test_cloudflare_fetch_raw_reuses_cache_directory_across_instances( assert second_session.post_calls == [] +def test_cloudflare_origins_reuses_root_cache_directory_across_instances( + tmp_path: Path, +) -> None: + cache = tmp_path / "cloudflare-cache" + scope = CrawlScope(roots=["https://example.com/docs"], depth=1) + first_session = _ParameterizedCloudflareSession() + first_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=first_session, + poll_interval=0, + ) + + first_origins = list(first_crawler.origins(scope, progress=False)) + + second_session = _ParameterizedCloudflareSession() + second_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=second_session, 
+ poll_interval=0, + ) + + second_origins = list(second_crawler.origins(scope, progress=False)) + page_source = second_crawler.fetch_raw("https://example.com/docs/page") + + assert first_origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert second_origins == first_origins + assert page_source.origin == "https://example.com/docs/page" + assert len(first_session.post_calls) == 1 + assert second_session.post_calls == [] + + +def test_cloudflare_markdown_documents_canonicalizes_record_urls( + tmp_path: Path, +) -> None: + session = _TrailingSlashCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-record-url-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope(roots=["https://example.com"], depth=0) + + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert documents == [ + MarkdownDocument(origin="https://example.com", content="# Docs\n") + ] + + def test_cloudflare_crawler_cache_dir_uses_hashed_file_pair( tmp_path: Path, ) -> None: @@ -1600,6 +2584,67 @@ def test_cloudflare_crawler_applies_limit_across_all_roots( assert len(session.post_calls) == 1 +def test_cloudflare_crawler_deduplicates_roots_before_counting_limit( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-dedupe-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=[ + "https://example.com/docs-a", + "https://example.com/docs-a", + "https://example.com/docs-b", + ], + depth=0, + limit=2, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/docs-a", + "https://example.com/docs-b", + ] + assert [call[1]["url"] for call in session.post_calls] == [ + "https://example.com/docs-a", + "https://example.com/docs-b", + ] + + +def 
test_cloudflare_crawler_applies_limit_after_deduplication( + tmp_path: Path, +) -> None: + session = _OverlappingLimitedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-overlap-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=[ + "https://example.com/root-a", + "https://example.com/root-b", + ], + limit=2, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/shared", + "https://example.com/root-b/unique", + ] + assert "limit" not in session.post_calls[1][1] + + def test_directory_crawler_counts_file_roots_toward_limit(tmp_path: Path) -> None: first = _write(tmp_path, "a.md", "# First") second = _write(tmp_path, "b.md", "# Second") @@ -1611,6 +2656,102 @@ def test_directory_crawler_counts_file_roots_toward_limit(tmp_path: Path) -> Non assert origins == [first.resolve().as_uri()] +def test_directory_crawler_deduplicates_roots_before_counting_limit( + tmp_path: Path, +) -> None: + docs = tmp_path / "docs" + first = _write(docs, "a.md", "# First") + second = _write(docs, "b.md", "# Second") + crawler = DirectoryCrawler() + scope = CrawlScope(roots=[first, docs], limit=2) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + first.resolve().as_uri(), + second.resolve().as_uri(), + ] + + +def test_directory_crawler_applies_limit_without_prewalking_tree( + tmp_path: Path, + monkeypatch, +) -> None: + first = _write(tmp_path, "a.md", "# First") + _write(tmp_path, "z/b.md", "# Second") + crawler = DirectoryCrawler() + + def fail_rglob(self: Path, pattern: str): + del self, pattern + raise AssertionError("DirectoryCrawler should not prewalk with rglob") + + monkeypatch.setattr(Path, "rglob", fail_rglob) + + origins = list( + crawler.origins(CrawlScope(roots=[tmp_path], depth=0, limit=1), progress=False) + ) + + assert origins == [first.resolve().as_uri()] + + +def 
test_directory_crawler_does_not_follow_symlinked_directories_outside_root( + tmp_path: Path, +) -> None: + root = tmp_path / "root" + inside = _write(root, "inside.md", "# Inside") + external_dir = tmp_path / "external" + outside = _write(external_dir, "outside.md", "# Outside") + link = root / "linked" + try: + link.symlink_to(external_dir, target_is_directory=True) + except OSError as exc: + pytest.skip(f"Symlink creation failed: {exc}") + crawler = DirectoryCrawler() + + origins = list(crawler.origins(CrawlScope(roots=[root], depth=2), progress=False)) + + assert origins == [inside.resolve().as_uri()] + assert outside.resolve().as_uri() not in origins + + +def test_directory_crawler_skips_type_sniffing_without_type_filters( + tmp_path: Path, + monkeypatch, +) -> None: + document = _write(tmp_path, "extensionless", "# Document") + + class _FailingMagika: + def identify_path(self, path: Path): + raise AssertionError(f"Unexpected type sniff for {path}") + + monkeypatch.setattr(crawl_module, "_MAGIKA", _FailingMagika()) + crawler = DirectoryCrawler() + + origins = list(crawler.origins(CrawlScope(roots=[tmp_path]), progress=False)) + + assert origins == [document.resolve().as_uri()] + + +def test_directory_crawler_coerces_scalar_patterns_and_types( + tmp_path: Path, +) -> None: + docs = tmp_path / "docs" + readme = _write(docs, "readme.md", "# Readme") + _write(docs, "skip.py", "print('skip')") + _write(tmp_path, "notes.md", "# Notes") + crawler = DirectoryCrawler() + scope = CrawlScope( + roots=[tmp_path], + include_patterns=r".*/docs/.*", + include_types="markdown", + exclude_types="python", + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [readme.resolve().as_uri()] + + def test_directory_crawler_accepts_crawl_scope_for_roots_and_patterns( tmp_path: Path, ) -> None: @@ -1797,6 +2938,20 @@ def test_directory_crawler_accepts_percent_escaped_file_uri_roots( assert origins == [markdown.resolve().as_uri()] +def 
test_directory_crawler_accepts_windows_drive_letter_string_roots( + tmp_path: Path, + monkeypatch, +) -> None: + root = tmp_path / "C:\\docs" + markdown = _write(root, "readme.md", "# Hello") + monkeypatch.chdir(tmp_path) + crawler = DirectoryCrawler() + + origins = list(crawler.origins(CrawlScope(roots=["C:\\docs"]), progress=False)) + + assert origins == [markdown.resolve().as_uri()] + + @pytest.mark.skipif(os.name != "nt", reason="Windows-specific file URI handling") def test_directory_crawler_round_trips_windows_file_uris( tmp_path: Path, diff --git a/tests/test_store_ingest.py b/tests/test_store_ingest.py index 8fe334d..1b2ad61 100644 --- a/tests/test_store_ingest.py +++ b/tests/test_store_ingest.py @@ -360,6 +360,57 @@ def fake_wait(pending, return_when): store.ingest(documents, max_workers=2) +def test_base_store_ingest_propagates_worker_cancelled_error() -> None: + store = _RecordingStore() + documents = [MarkdownDocument(origin="doc-1", content="# One")] + + def prepare(document: MarkdownDocument) -> MarkdownDocument: + del document + raise CancelledError("prepare cancelled") + + with pytest.raises(CancelledError, match="prepare cancelled"): + store.ingest(documents, prepare=prepare, max_workers=1) + + +def test_postgresql_store_ingest_serializes_upsert_calls() -> None: + pytest.importorskip("psycopg2") + from raghilda._postgres_store import PostgreSQLStore + + store = PostgreSQLStore.__new__(PostgreSQLStore) + store._ingest_upsert_lock = threading.Lock() + lock = threading.Lock() + in_flight = 0 + max_in_flight = 0 + + def upsert( + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + del skip_if_unchanged + nonlocal in_flight, max_in_flight + with lock: + in_flight += 1 + max_in_flight = max(max_in_flight, in_flight) + try: + time.sleep(0.02) + return WriteResult(action="inserted", document=document) + finally: + with lock: + in_flight -= 1 + + store.upsert = upsert # type: ignore[method-assign] + documents = [ + 
MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-2", content="# Two"), + ] + + summary = store.ingest(documents, max_workers=2) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert max_in_flight == 1 + + def test_duckdb_store_ingest_prepares_chunked_documents() -> None: store = DuckDBStore.create( location=":memory:", From 7e1bd74b69714fe07a1671fbe94357da6449629f Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Tue, 2 Jun 2026 05:09:44 -0400 Subject: [PATCH 14/17] Keep Cloudflare redirected seed records in scope Preserve Cloudflare discovery results for root redirects while continuing to filter non-seed out-of-scope records. Add regressions for cross-origin redirected seeds and external-first API results. --- src/raghilda/crawl.py | 38 ++++++++++++++--- tests/test_crawl.py | 99 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 6 deletions(-) diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py index 6e46179..feb361c 100644 --- a/src/raghilda/crawl.py +++ b/src/raghilda/crawl.py @@ -1599,12 +1599,16 @@ def _crawl_root( canonical_url = _canonicalize_web_url(record["url"]) if canonical_url is None: continue - if apply_patterns and not _allow_web_origin( - canonical_url, - scope_origin, - root_host, - include_external_links=include_external_links, - include_subdomains=include_subdomains, + if ( + apply_patterns + and not _allow_web_origin( + canonical_url, + scope_origin, + root_host, + include_external_links=include_external_links, + include_subdomains=include_subdomains, + ) + and not _is_cloudflare_seed_redirect_target(root, canonical_url) ): continue if canonical_url != record["url"]: @@ -1899,6 +1903,28 @@ def _allow_web_origin( ) +def _is_cloudflare_seed_redirect_target(root: str, target: str) -> bool: + root_parsed = urlparse(root) + target_parsed = urlparse(target) + if root_parsed.scheme not in {"http", "https"}: + return False + if target_parsed.scheme not in 
{"http", "https"}: + return False + if root_parsed.port is not None or target_parsed.port is not None: + return False + + root_host = _redirect_host_key(root_parsed.hostname or "") + target_host = _redirect_host_key(target_parsed.hostname or "") + return root_host != "" and root_host == target_host + + +def _redirect_host_key(host: str) -> str: + host = host.lower() + if host.startswith("www."): + return host[4:] + return host + + def _resolve_cache_dir( cache_dir: bool | str | Path | None, *, diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 2cb149d..15317a8 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -2004,6 +2004,53 @@ def get( ) +class _ExternalFirstCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + job_id = url.rsplit("/", 1)[-1] + root = self._jobs[job_id]["url"] + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": [ + { + "url": "https://external.test/page", + "status": "completed", + "markdown": "# External\n", + "metadata": { + "status": 200, + "title": "External", + "url": "https://external.test/page", + }, + }, + { + "url": root, + "status": "completed", + "markdown": "# Root\n", + "metadata": { + "status": 200, + "title": "Root", + "url": root, + }, + }, + ], + }, + } + ) + + class _RedirectCloudflareSession(_ParameterizedCloudflareSession): def get( self, @@ -2212,6 +2259,58 @@ def test_cloudflare_crawler_filters_returned_records_to_web_scope( ] +def test_cloudflare_crawler_does_not_treat_external_first_record_as_seed( + tmp_path: Path, +) -> None: + session = _ExternalFirstCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + 
cache_dir=tmp_path / "cloudflare-external-first-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/root"], + depth=1, + limit=1, + include_external_links=False, + include_subdomains=False, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == ["https://example.com/root"] + + +def test_cloudflare_markdown_documents_keeps_cross_origin_redirected_seed( + tmp_path: Path, +) -> None: + session = _CrossOriginRedirectCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cross-origin-seed-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["http://example.com"], + depth=0, + include_external_links=False, + include_subdomains=False, + ) + + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert documents == [ + MarkdownDocument( + origin="https://example.com/landing", + content="# Landing\n", + ) + ] + + def test_cloudflare_crawler_cache_key_includes_crawl_parameters( tmp_path: Path, ) -> None: From ea16f580eea015940c874a1d711ca84f925b1251 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Tue, 2 Jun 2026 22:22:24 -0400 Subject: [PATCH 15/17] Add changelog --- CHANGELOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..0c4474f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,17 @@ +# Changelog + +## Unreleased + +### Added + +- Added `raghilda.crawl`, including `CrawlScope`, `FetchedSource`, + `DirectoryCrawler`, `WebCrawler`, and `CloudflareCrawler`, for discovering + directory, web, and Cloudflare sources and converting them to markdown + documents. +- Added `BaseStore.ingest()` and `IngestSummary` for bulk document ingestion + with optional document preparation, parallel writes, and inserted, replaced, + and skipped counts. 
+ +### Fixed + +- Fixed sitemap URL extraction so each `<loc>` entry is collected as one URL. From 8378f5050475aac2cd34a87cfa8c106bd36771c1 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Mon, 8 Jun 2026 09:14:44 -0400 Subject: [PATCH 16/17] Add crawling and ingestion guide --- great-docs.yml | 10 + user_guide/04-crawling-and-ingestion.qmd | 260 +++++++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 user_guide/04-crawling-and-ingestion.qmd diff --git a/great-docs.yml b/great-docs.yml index 05245cf..ee074e1 100644 --- a/great-docs.yml +++ b/great-docs.yml @@ -75,6 +75,16 @@ reference: - name: store.OpenAIStore - name: store.PostgreSQLStore + - title: Crawl + desc: Crawlers for discovering and converting source documents + contents: + - crawl.CrawlScope + - crawl.FetchedSource + - crawl.BaseCrawler + - crawl.DirectoryCrawler + - crawl.WebCrawler + - crawl.CloudflareCrawler + - title: Embedding desc: Embedding providers for generating vector representations contents: diff --git a/user_guide/04-crawling-and-ingestion.qmd b/user_guide/04-crawling-and-ingestion.qmd new file mode 100644 index 0000000..8429a6f --- /dev/null +++ b/user_guide/04-crawling-and-ingestion.qmd @@ -0,0 +1,260 @@ +--- +title: "Crawling and Ingestion" +guide-section: "Getting Started" +--- + +raghilda can crawl files or websites, convert each source to Markdown, chunk the +documents, and write them to a store as a streaming ingestion job. This is the +main path for building or refreshing a knowledge base from source material that +changes over time. + +The API has three parts: + +- `CrawlScope` describes what to crawl. +- A crawler discovers sources and returns `MarkdownDocument` objects. +- `store.ingest()` prepares and upserts the stream. + +## Crawl a website + +Use `WebCrawler` when you want raghilda to fetch pages directly with +`requests`. The crawler starts from one or more roots, follows links up to +`depth`, and yields matching pages as Markdown documents. 
+ +```{python} +#| eval: false +from datetime import timedelta + +from raghilda.chunker import MarkdownChunker +from raghilda.crawl import CrawlScope, WebCrawler +from raghilda.embedding import EmbeddingOpenAI +from raghilda.store import DuckDBStore + +store = DuckDBStore.create( + location="docs.db", + embed=EmbeddingOpenAI(), + name="docs", + overwrite=True, +) + +crawler = WebCrawler( + cache_dir=True, + cache_stale_after=timedelta(days=1), + max_workers=4, +) +scope = CrawlScope( + roots=["https://quarto.org/docs/guide/"], + depth=2, + include_patterns=[r"^https://quarto\.org/docs/guide/"], + exclude_patterns=[r"/reference/"], + include_types=["html"], +) + +chunker = MarkdownChunker(chunk_size=1600, target_overlap=0.5) + +summary = store.ingest( + crawler.markdown_documents(scope), + prepare=chunker.chunk, + max_workers=4, +) +store.build_index() + +print(summary) +``` + +`CrawlScope` owns traversal policy: + +| Field | Description | +|-------|-------------| +| `roots` | Starting files, directories, or URLs. | +| `depth` | Number of link or directory levels to follow. `0` means only the roots. | +| `limit` | Maximum number of origins to yield. | +| `include_patterns` | Regular expressions that origins must match. | +| `exclude_patterns` | Regular expressions that remove origins from the crawl. | +| `include_types` | Type labels to include, such as `html`, `markdown`, `pdf`, `python`, or `text`. | +| `exclude_types` | Type labels to skip. | +| `include_external_links` | Allow links outside the root origin. Defaults to `False`. | +| `include_subdomains` | Allow subdomains under the root host. Defaults to `False`. | + +`WebCrawler(cache_dir=True)` stores fetched response bodies under +`.raghilda/cache/web`. With `cache_stale_after`, fresh cached responses are +reused, and stale responses are revalidated with `ETag` or `Last-Modified` +headers when the server provides them. 
Pass `cache_force_refresh=True` to +`origins()`, `fetch_raw()`, `fetch_markdown()`, or `markdown_documents()` when a +run must bypass the cache. + +## Crawl local files + +Use `DirectoryCrawler` for local Markdown, notebooks, PDFs, text files, and +other files supported by `read_as_markdown()`. + +```{python} +#| eval: false +from raghilda.chunker import MarkdownChunker +from raghilda.crawl import CrawlScope, DirectoryCrawler +from raghilda.store import DuckDBStore + +store = DuckDBStore.create( + location="local-docs.db", + embed=None, + name="local_docs", + overwrite=True, +) + +crawler = DirectoryCrawler(cache_dir=True, max_workers=4) +scope = CrawlScope( + roots=["docs"], + depth=3, + include_patterns=[r".*\.(md|qmd|ipynb|pdf)$"], + exclude_patterns=[r".*/_site/.*", r".*/\.venv/.*"], +) + +chunker = MarkdownChunker() +summary = store.ingest( + crawler.markdown_documents(scope), + prepare=chunker.chunk, + max_workers=4, +) + +print(summary) +``` + +Directory crawling always reads the current filesystem tree. If you enable +`cache_dir`, converted Markdown is reused only when the source file hash and +modification time still match the cached metadata. The crawler also skips its +own cache directory when the cache is inside a crawled root. + +## Inspect before ingesting + +The crawler interface is useful even when you are not ready to write to a +store. Use `origins()` to inspect what the scope discovers, or use +`fetch_markdown()` to convert one source. 
+ +```{python} +#| eval: false +from raghilda.crawl import CrawlScope, WebCrawler + +crawler = WebCrawler(cache_dir=True) +scope = CrawlScope( + roots=["https://example.com/docs/"], + depth=1, + limit=10, +) + +for origin in crawler.origins(scope): + print(origin) + +doc = crawler.fetch_markdown("https://example.com/docs/") +print(doc.origin) +print(doc.content[:500]) +``` + +All crawler classes implement the same public methods: + +| Method | Returns | +|--------|---------| +| `origins(scope)` | A lazy iterator of source origins. | +| `fetch_raw(origin)` | A `FetchedSource` with the cached body path and metadata. | +| `fetch_markdown(origin)` | One `MarkdownDocument`. | +| `markdown_documents(scope)` | A lazy iterator of `MarkdownDocument` objects. | + +## Customize conversion + +By default, crawlers convert fetched sources with raghilda's Markdown reader. +Pass a `convert` function when a site or file collection needs custom cleanup. +The function receives a `FetchedSource` and returns a `MarkdownDocument`. + +```{python} +#| eval: false +from raghilda.crawl import CrawlScope, FetchedSource, WebCrawler +from raghilda.document import MarkdownDocument +from raghilda.read import read_as_markdown + + +def convert_reference_page(source: FetchedSource) -> MarkdownDocument: + doc = read_as_markdown(str(source.body_path)) + markdown = doc.content + markdown = markdown.replace("Edit this page", "") + return MarkdownDocument(origin=source.origin, content=markdown) + + +crawler = WebCrawler(cache_dir=True) +scope = CrawlScope(roots=["https://example.com/reference/"], depth=1) +documents = crawler.markdown_documents(scope, convert=convert_reference_page) +``` + +Keep chunking in `store.ingest(prepare=...)`, not in the converter. The +converter should return one unchunked Markdown document per origin; `prepare` +can then apply the same chunking policy to every document. 
+ +## Use Cloudflare crawling + +Use `CloudflareCrawler` when you want Cloudflare to perform the browser-rendered +crawl and return Markdown records. This is useful for sites that need rendering +or where you want Cloudflare's crawl service to manage discovery. + +```{python} +#| eval: false +import os +from datetime import timedelta + +from raghilda.chunker import MarkdownChunker +from raghilda.crawl import CloudflareCrawler, CrawlScope +from raghilda.store import DuckDBStore + +store = DuckDBStore.create( + location="rendered-docs.db", + embed=None, + name="rendered_docs", + overwrite=True, +) + +crawler = CloudflareCrawler( + account_id=os.environ["CLOUDFLARE_ACCOUNT_ID"], + api_token=os.environ["CLOUDFLARE_API_TOKEN"], + cache_dir=True, + cache_stale_after=timedelta(days=1), + render=True, +) +scope = CrawlScope( + roots=["https://example.com/docs/"], + depth=2, + include_patterns=["https://example.com/docs/**"], + exclude_patterns=["https://example.com/docs/archive/**"], + limit=250, +) + +summary = store.ingest( + crawler.markdown_documents(scope), + prepare=MarkdownChunker().chunk, + max_workers=4, +) + +print(summary) +``` + +For Cloudflare crawls, `include_patterns` and `exclude_patterns` use +Cloudflare-style wildcard patterns, such as `https://example.com/docs/**`. +`include_external_links` and `include_subdomains` are passed through to the +Cloudflare crawl request. + +## Refresh a store + +`store.ingest()` upserts each prepared document and returns an `IngestSummary` +with counts for inserted, replaced, and skipped documents. The input stream is +consumed lazily, and `prepare` runs in the worker pool. 
+ +```{python} +#| eval: false +summary = store.ingest( + crawler.markdown_documents(scope, cache_force_refresh=True), + prepare=chunker.chunk, + max_workers=4, +) + +print(f"Inserted: {summary.inserted}") +print(f"Replaced: {summary.replaced}") +print(f"Skipped: {summary.skipped}") +``` + +Use `upsert()` directly when you need per-document `WriteResult` objects. +Use `ingest()` when you want one aggregate summary for a crawl or refresh job. From bdadc900f165e22734219123facbf53a1624aa2f Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Mon, 8 Jun 2026 09:32:54 -0400 Subject: [PATCH 17/17] Frame crawling guide as advanced path --- user_guide/04-crawling-and-ingestion.qmd | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/user_guide/04-crawling-and-ingestion.qmd b/user_guide/04-crawling-and-ingestion.qmd index 8429a6f..d877d16 100644 --- a/user_guide/04-crawling-and-ingestion.qmd +++ b/user_guide/04-crawling-and-ingestion.qmd @@ -3,12 +3,20 @@ title: "Crawling and Ingestion" guide-section: "Getting Started" --- -raghilda can crawl files or websites, convert each source to Markdown, chunk the -documents, and write them to a store as a streaming ingestion job. This is the -main path for building or refreshing a knowledge base from source material that -changes over time. - -The API has three parts: +raghilda's core workflow is intentionally sequential: find a source, read it, +chunk it, and upsert it. That is the recommended first path for building a store +because every step is visible, easy to inspect, and easy to change. + +As your source collection grows, store creation can become mostly waiting on +network requests, file conversion, chunking, and writes. The crawling and +ingestion API is the next step when you want that work to run concurrently. It +can make store creation substantially faster while still letting you inspect the +origins, fetched sources, converted Markdown documents, and final ingest +summary. 
+ +The tradeoff is a few extra concepts. Use this API when the simple sequential +workflow is too slow, or when you need a repeatable refresh job for a larger +site, document collection, or codebase. The API has three parts: - `CrawlScope` describes what to crawl. - A crawler discovers sources and returns `MarkdownDocument` objects.